/*
    This file is part of BioD.

    Copyright (C) 2018 Pjotr Prins <pjotr.prins@thebird.nl>
*/

module bio.std.range.splitter;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.stdio;

import std.range.primitives;

/// Default delimiter set: space, tab, newline, semi-colon and comma.
immutable ubyte[] SPLIT_ON = [ 0x20, 0x09, 0x0A, ';', ',' ];

/**
   SimpleSplitConv takes a range R (typically a text line) and
   splits/tokenizes it on a list of characters. By default fields/tokens
   are separated by spaces, tabs, newlines, semi-colons or commas. This
   compares to C's strtok(str, ", \t;").

   This routine happens often in bioinformatics and is a replacement
   for the far less safe C strtok. This edition should also handle
   UTF.

   Behaviour:

   - A run of consecutive delimiters counts as a single separator.
   - A delimiter at the very start of the input yields one leading
     empty token (the unittest below relies on this).
   - A token that runs up to the end of the input is emitted even when
     the input does not end with a delimiter.
   - `break` and `return` inside a `foreach` body stop the iteration.

   Note: although the template constraint only asks for an input range,
   the implementation uses an indexed `foreach`, `.length` and slicing,
   so in practice R has to be an array-like type such as `ubyte[]`.
*/

struct SimpleSplitConv(R)
  if (isInputRange!R)
{
  R list, split_on;

  /**
     Params:
       range     = the data to tokenize
       splits_on = the set of delimiter elements (defaults to SPLIT_ON)
  */
  this(R range, R splits_on = cast(R)SPLIT_ON) {
    list = range;
    split_on = splits_on;
  }

  /**
     Calls `dg` once per token, in input order. Each token is a slice of
     the original data, no copy is made.

     Returns: the first non-zero value returned by `dg` (which is how
     the compiler signals `break`/`return`/`goto` out of a `foreach`
     body), or 0 after the whole input has been consumed.
  */
  int opApply(scope int delegate(R) dg) {
    size_t start = 0;
    bool in_whitespace = false;
    foreach (size_t pos, c; list) {
      if (canFind(split_on, c)) { // hit split char
        if (!in_whitespace) { // first delimiter after a token: emit
          // A non-zero result means the loop body wants to leave the
          // foreach; it has to be propagated immediately.
          if (auto result = dg(list[start .. pos]))
            return result;
        }
        start = pos + 1;
        in_whitespace = true;
      } else {
        in_whitespace = false;
      }
    }
    // Emit the trailing token when the input does not end on a delimiter.
    if (!in_whitespace && start < list.length)
      return dg(list[start .. $]);
    return 0;
  }
}

unittest {
  auto s = cast(ubyte[])"hello 1 2 \t3 4 \n";
  assert(array(SimpleSplitConv!(ubyte[])(s)) == ["hello","1","2","3","4"]);
  assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])"  hello, 1 2 \t3  4 \n")) == ["","hello","1","2","3","4"]);
  assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])"hello, 1 2 \n\t3  4 \n")) == ["hello","1","2","3","4"]);

  // the last token is kept when the input has no trailing delimiter
  assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])"hello, 1 2")) == ["hello","1","2"]);
  assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])"hello")) == ["hello"]);
  // empty input yields no tokens
  assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])"")).length == 0);

  // break inside foreach stops the iteration
  size_t seen = 0;
  foreach (token; SimpleSplitConv!(ubyte[])(cast(ubyte[])"a b c d ")) {
    seen++;
    if (seen == 2)
      break;
  }
  assert(seen == 2);
}