bio.std.range.splitter source code

1 /*
2     This file is part of BioD.
3 
4     Copyright (C) 2018 Pjotr Prins <pjotr.prins@thebird.nl>
5 */
6 
7 module bio.std.range.splitter;
8 
9 import std.algorithm;
10 import std.array;
11 import std.conv;
12 import std.exception;
13 import std.stdio;
14 
15 import std.range.primitives;
16 
// Default separator set shared by the splitters in this module.
immutable ubyte[] SPLIT_ON = [
  0x20, // space
  0x09, // horizontal tab
  0x0A, // newline
  0x3B, // semi-colon
  0x2C, // comma
];
18 
/**
   SimpleSplitConv takes a range R (typically a text line) and
   splits/tokenizes it on a list of characters. Essentially, fields/tokens
   are separated by tabs, semi-colons, commas and spaces. This compares
   to C's strtok(str, ", \t;").

   This operation occurs often in bioinformatics, and this routine is a
   replacement for the much less safe C strtok.  This edition should
   also handle UTF.

   The default is to split on space, newline, tab, semi-colon and
   comma.
*/
32 
struct SimpleSplitConv(R)
  if (isInputRange!R)
{
  // `list` is the input to tokenize; `split_on` holds the separator
  // elements. Note that opApply additionally requires R to support
  // indexed foreach and slicing (e.g. an array type).
  R list, split_on;

  /// Stores the input range and the separators (defaults to SPLIT_ON).
  this(R range, R splits_on = cast(R)SPLIT_ON) {
    list = range;
    split_on = splits_on;
  }

  /**
     foreach support: calls `dg` once per token, in order.

     A run of consecutive separators counts as a single split. Because
     scanning starts outside of a separator run, input that begins with
     a separator yields an empty first token. Trailing separators do not
     produce a trailing empty token.

     Returns: 0 when all tokens were visited, otherwise the first
     non-zero value returned by `dg` (which is how the compiler signals
     `break`/`return`/`goto` out of the foreach body).
  */
  int opApply(scope int delegate(R) dg) {
    size_t start = 0;
    bool in_whitespace = false;
    foreach(size_t pos, c; list) {
      if (canFind(split_on,c)) { // hit split char
        if (!in_whitespace) { // first separator after a token: emit it
          // A non-zero result means the loop body wants to stop; it must
          // be handed back unchanged for break/return to work.
          if (auto result = dg(list[start..pos]))
            return result;
        }
        start = pos+1;
        in_whitespace = true;
      } else {
        in_whitespace = false;
      }
    }
    if (!in_whitespace) { // emit final token (input did not end on a separator)
      return dg(list[start..$]);
    }
    return 0;
  }
}
65 
unittest {
  // Shorthand: split a byte buffer on the default separators and gather
  // the resulting tokens into an array.
  alias Splitter = SimpleSplitConv!(ubyte[]);
  static auto tokenize(ubyte[] bytes) { return array(Splitter(bytes)); }

  // Runs of separators collapse; trailing separators add no token.
  auto s = cast(ubyte[])"hello 1 2 \t3  4 \n";
  assert(tokenize(s) == ["hello","1","2","3","4"]);

  // Leading separators produce an empty first token.
  assert(tokenize(cast(ubyte[])"  hello, 1 2 \t3  4 \n") == ["","hello","1","2","3","4"]);

  // Mixed comma/space/newline/tab separators.
  assert(tokenize(cast(ubyte[])"hello, 1 2 \n\t3  4 \n") == ["hello","1","2","3","4"]);

  // ':' is not a separator, so it stays inside the token.
  assert(tokenize(cast(ubyte[])"chr1:55365,55365,1") == ["chr1:55365","55365","1"]);
}
73 
74 /*
75    Dirty fast_splitter is 3x faster than above elegant version. It does no heap
76    allocations.
77 */
/*
   Splits `range` on any element of `splits_on`, writing the tokens (slices
   of `range`) into the caller-supplied buffer `tokens`.

   A run of consecutive separators counts as a single split. Input that
   starts with a separator yields an empty first token; trailing separators
   add no token. An empty `range` yields one empty token.

   `tokens` must be large enough to hold every token; at most
   range.length/2 + 1 slots are needed. Indexing past the end of `tokens`
   is not checked here beyond the language's own array bounds checks.

   Returns the filled prefix of `tokens`.
*/
R[] fast_splitter(R)(R[] tokens, R range, R splits_on = cast(R)SPLIT_ON) @nogc {
  // R[] tokens = new R[range.length]; // pre-allocate optimistically
  // Use size_t for all indices so they match range.length and cannot
  // overflow on very long inputs.
  size_t token_start = 0;
  size_t token_num = 0;
  bool in_whitespace = false;
  foreach (size_t j; 0 .. range.length) {
    auto check = range[j];
    // Linear scan of the (short) separator list.
    bool found = false;
    foreach (c ; splits_on) {
      if (c==check) {
        found = true;
        break;
      }
    }
    if (found) {
      if (!in_whitespace) { // first separator after a token: emit it
        tokens[token_num] = range[token_start..j];
        token_num++;
      }
      token_start = j+1;
      in_whitespace = true;
    }
    else {
      in_whitespace = false;
    }
  }
  if (!in_whitespace) { // emit final
    tokens[token_num] = range[token_start..$];
    token_num++;
  }
  // tokens.length = token_num;
  return tokens[0..token_num];
}
112 
113 /*
114    Same as above, but with one single heap allocation - it may be slightly
115    slower.
116 */
R[] fast_splitter(R)(R range, R splits_on = cast(R)SPLIT_ON) {
  // Worst case token count is range.length/2 + 1: after the first token,
  // every further token needs at least one non-separator plus one separator
  // element. The "+ 1" also covers empty input, which still produces a
  // single empty token and would otherwise write past a zero-length buffer.
  R[] tokens = new R[range.length / 2 + 1];
  return fast_splitter(tokens,range,splits_on);
}
121 
unittest {
  string[16] buffer; // caller-owned token storage, reused between calls
  auto line = "hello 1 2 \t3  4 \n";

  // Buffer overload: runs of separators collapse, trailing ones add nothing.
  auto fields = fast_splitter(buffer,line);
  assert(fields == ["hello", "1", "2", "3", "4"]);

  // Allocating overload: leading separators give an empty first token.
  assert(fast_splitter("  hello, 1 2 \t3  4 \n") == ["","hello","1","2","3","4"]);
  assert(fast_splitter("hello, 1 2 \n\t3  4 \n") == ["hello","1","2","3","4"]);

  // ':' is not a separator, so it stays inside the token.
  auto coords = fast_splitter(buffer,"chr1:55365,55365,1");
  assert(coords == ["chr1:55365","55365","1"]);
}