bio.std.range.splitter source code

1 /*
2     This file is part of BioD.
3 
4     Copyright (C) 2018 Pjotr Prins <pjotr.prins@thebird.nl>
5 */
6 
7 module bio.std.range.splitter;
8 
9 import std.algorithm;
10 import std.array;
11 import std.conv;
12 import std.exception;
13 import std.stdio;
14 
15 import std.range.primitives;
16 
/// Default delimiter set: space, tab, newline, semi-colon and comma.
immutable ubyte[] SPLIT_ON = [ 0x20, 0x09, 0x0A, ';', ',' ];

/**
   SimpleSplitConv takes a range R (typically a text line) and
   tokenizes it on a set of delimiter elements. By default tokens are
   separated by spaces, tabs, newlines, semi-colons and commas. This
   compares to C's strtok(str, ", \t;").

   This routine is needed often in bioinformatics and is a safer
   replacement for C's strtok: the input is never modified and every
   token is a slice of the original input.

   Behaviour:

   - A run of consecutive delimiters counts as a single separator.
   - If the input starts with a delimiter, one empty token is emitted
     first (existing behaviour, kept for backward compatibility).
   - Text after the last delimiter is emitted as the final token, so
     the input does not need to end with a delimiter.
   - Breaking out of a foreach over the splitter stops the iteration.

   Note: although the template constraint only asks for an input
   range, the implementation relies on indexed foreach, slicing and
   .length, so in practice R has to be an array/slice type such as
   ubyte[].

   Params:
     range     = the input to tokenize
     splits_on = the delimiter elements; defaults to SPLIT_ON
*/
struct SimpleSplitConv(R)
  if (isInputRange!R)
{
  R list, split_on;

  this(R range, R splits_on = cast(R)SPLIT_ON) {
    list = range;
    split_on = splits_on;
  }

  /**
     Calls dg once per token, in input order.

     Returns: 0 when the whole input was consumed, otherwise the
     non-zero value returned by dg (which the compiler uses to
     implement break/return/goto inside a foreach body).
  */
  int opApply(scope int delegate(R) dg) {
    size_t start = 0;            // index where the current token begins
    bool in_whitespace = false;  // true while inside a run of delimiters
    foreach(size_t pos, c; list) {
      if (canFind(split_on,c)) { // hit split char
        if (!in_whitespace) { // first delimiter after a token: emit
          // Propagate a non-zero result so that the caller can leave
          // the foreach loop early.
          if (auto rc = dg(list[start..pos]))
            return rc;
        }
        start = pos+1;
        in_whitespace = true;
      } else {
        in_whitespace = false;
      }
    }
    // Input did not end with a delimiter: emit the pending last token.
    if (start < list.length)
      return dg(list[start..$]);
    return 0;
  }
}
61 
unittest {
  // Plain line: runs of spaces/tabs collapse, trailing newline ends the last token.
  auto plain = cast(ubyte[])"hello 1 2 \t3  4 \n";
  auto plainTokens = SimpleSplitConv!(ubyte[])(plain).array;
  assert(plainTokens == ["hello","1","2","3","4"]);

  // Leading delimiters produce a single empty token in front.
  auto indented = cast(ubyte[])"  hello, 1 2 \t3  4 \n";
  auto indentedTokens = SimpleSplitConv!(ubyte[])(indented).array;
  assert(indentedTokens == ["","hello","1","2","3","4"]);

  // A newline in the middle of the input is just another delimiter.
  auto multiline = cast(ubyte[])"hello, 1 2 \n\t3  4 \n";
  auto multilineTokens = SimpleSplitConv!(ubyte[])(multiline).array;
  assert(multilineTokens == ["hello","1","2","3","4"]);
}