/*
    This file is part of BioD.

    Copyright (C) 2018 Pjotr Prins <pjotr.prins@thebird.nl>
*/

module bio.std.range.splitter;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.stdio;

import std.range.primitives;

/// Default separators: space, tab, newline, semi-colon and comma.
immutable ubyte[] SPLIT_ON = [ 0x20, 0x09, 0x0A, ';', ',' ];

/**
   SimpleSplitConv takes a range R (typically a text line) and
   splits/tokenizes it on a list of characters. Essentially
   fields/tokens are split by tabs, semi-colons, commas and spaces.
   This compares to C's strtok(str, ", \t;").

   This routine happens often in bioinformatics and is a replacement
   for the much less safe C strtok. This edition should also handle
   UTF.

   The default is to split on space, newline, tab, semi-colon and
   comma.

   A run of consecutive separators counts as a single separator. A
   separator at the very start of the input yields one leading empty
   token; trailing separators do not yield a trailing empty token.

   Note: although the template constraint only asks for an input
   range, opApply indexes and slices `list`, so in practice R has to
   be an array-like type such as ubyte[] or string.
*/

struct SimpleSplitConv(R)
  if (isInputRange!R)
{
  R list, split_on;

  /**
     Params:
       range     = the data to tokenize
       splits_on = the separator elements (defaults to SPLIT_ON)
  */
  this(R range, R splits_on = cast(R)SPLIT_ON) {
    list = range;
    split_on = splits_on;
  }

  /**
     foreach support: calls dg once for every token, in order.

     Returns: 0 when all tokens were visited, otherwise the non-zero
     value returned by dg. Iteration stops as soon as dg returns
     non-zero, which is what makes `break` and `return` inside a
     foreach body work.
  */
  int opApply(scope int delegate(R) dg) {
    size_t start = 0;
    bool in_whitespace = false;
    foreach (size_t pos, c; list) {
      if (canFind(split_on, c)) { // hit split char
        if (!in_whitespace) { // first separator after a token: emit
          if (auto result = dg(list[start..pos]))
            return result; // caller asked to stop
        }
        start = pos + 1;
        in_whitespace = true;
      } else {
        in_whitespace = false;
      }
    }
    if (!in_whitespace) // emit final token
      return dg(list[start..$]);
    return 0;
  }
}

unittest {
  auto s = cast(ubyte[])"hello 1 2 \t3 4 \n";
  assert(array(SimpleSplitConv!(ubyte[])(s)) == ["hello","1","2","3","4"]);
  assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])" hello, 1 2 \t3 4 \n")) == ["","hello","1","2","3","4"]);
  assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])"hello, 1 2 \n\t3 4 \n")) == ["hello","1","2","3","4"]);
  assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])"chr1:55365,55365,1")) == ["chr1:55365","55365","1"]);

  // break inside foreach must stop the iteration
  int seen = 0;
  foreach (token; SimpleSplitConv!(ubyte[])(cast(ubyte[])"a b c d")) {
    seen++;
    if (seen == 2)
      break;
  }
  assert(seen == 2);
}

/*
   Dirty fast_splitter is 3x faster than the elegant version above.
   It does no heap allocations.

   Tokens are written into the caller-supplied buffer `tokens` and the
   filled part of that buffer is returned; the returned slices point
   into `range`. The buffer must have room for every token found:
   writing past its end is an array bounds violation.

   Tokenization rules are the same as for SimpleSplitConv: runs of
   separators are collapsed, a leading separator yields a leading
   empty token, and an empty range yields a single empty token.
*/
R[] fast_splitter(R)(R[] tokens, R range, R splits_on = cast(R)SPLIT_ON) @nogc {
  // size_t (not int) so that indices match range.length on all inputs
  size_t j = 0, prev_j = 0;
  size_t token_num = 0;
  bool in_whitespace = false;
  for (; j < range.length; j++) {
    bool found = false;
    auto check = range[j];
    foreach (c; splits_on) {
      if (c == check) {
        found = true;
        break;
      }
    }
    if (found) {
      if (!in_whitespace) { // first separator after a token: emit
        tokens[token_num] = range[prev_j..j];
        token_num++;
      }
      prev_j = j + 1;
      in_whitespace = true;
    }
    else {
      in_whitespace = false;
    }
  }
  if (!in_whitespace) { // emit final token
    tokens[token_num] = range[prev_j..$];
    token_num++;
  }
  return tokens[0..token_num];
}

/*
   Same as above, but with one single heap allocation - it may be
   slightly slower.

   The token buffer is sized range.length + 1: an empty range still
   produces one (empty) token, so a buffer of range.length slots would
   be overrun in that case.
*/
R[] fast_splitter(R)(R range, R splits_on = cast(R)SPLIT_ON) {
  R[] tokens = new R[range.length + 1];
  return fast_splitter(tokens, range, splits_on);
}

unittest {
  auto s = "hello 1 2 \t3 4 \n";
  string[16] tokens; // preset buffer
  assert(fast_splitter(tokens,s) == ["hello", "1", "2", "3", "4"]);
  assert(fast_splitter(" hello, 1 2 \t3 4 \n") == ["","hello","1","2","3","4"]);
  assert(fast_splitter("hello, 1 2 \n\t3 4 \n") == ["hello","1","2","3","4"]);
  assert(fast_splitter(tokens,"chr1:55365,55365,1") == ["chr1:55365","55365","1"]);
  // empty input yields one empty token instead of overrunning the buffer
  assert(fast_splitter("") == [""]);
}