/*
    This file is part of BioD.

    Copyright (C) 2018 Pjotr Prins <pjotr.prins@thebird.nl>
*/

module bio.std.decompress;

/**
   Streaming line reader which can be used for gzipped files. Note the
   current edition (still) uses the garbage collector. It may help to
   switch it off or to use the BioD decompressor used by bgzf.

   Conversion can happen between different encodings, provided the
   line terminator is ubyte = '\n'. GzipbyLine logic is modeled on
   ByLineImpl and readln function from std.stdio.
*/

import std.algorithm;
import std.conv;
import std.exception;
import std.file;
import std.stdio;
import std.zlib: UnCompress;

/**
   Iterates over the lines of a compressed file with `foreach`.

   The file is read in chunks of `_bufsize` bytes, every chunk is fed to
   `std.zlib.UnCompress`, and the decompressed data is split on '\n'.
   Lines are handed to the loop body as values of type `R`; each line
   keeps its trailing '\n'. Data after the last '\n' is delivered as a
   final line without terminator.

   Params:
     R = array type the decompressed data is cast to (e.g. `ubyte[]`)
*/
struct GzipbyLine(R) {

  File f;
  UnCompress decompress;
  R line;
  ubyte[] uncompressed_buf;
  uint _bufsize;

  /**
     Opens `gzipfn` for reading.

     Params:
       gzipfn  = path of the compressed file
       bufsize = number of compressed bytes read from the file per chunk

     Throws: an exception when `gzipfn` is not a regular file or cannot
     be opened.
  */
  this(string gzipfn, uint bufsize=0x4000) {
    enforce(gzipfn.isFile, "Not a regular file: " ~ gzipfn);
    // Compressed data is binary: "rb" prevents any text-mode translation
    // of the bytes on platforms that distinguish text and binary files.
    f = File(gzipfn,"rb");
    decompress = new UnCompress();
    _bufsize = bufsize;
  }

  @disable this(this); // disable copy semantics;

  /**
     `foreach` support: calls `dg` once per line.

     A non-zero result of `dg` (the loop body executed `break`, `return`
     or `goto`) stops the iteration immediately and is passed back to the
     caller, as the `opApply` protocol requires.

     Returns: 0 when all lines were visited, otherwise the first non-zero
     value returned by `dg`.
  */
  int opApply(scope int delegate(R) dg) {
    R tail; // unterminated data carried over from the previous chunk

    foreach (ubyte[] buffer; f.byChunk(_bufsize)) {
      auto rest = cast(R)decompress.uncompress(buffer);

      // Hand out every complete line found in this chunk. This is a loop
      // rather than a recursion so that a chunk holding many lines cannot
      // exhaust the stack.
      for (;;) {
        auto split = findSplitAfter(rest, "\n");
        auto left = split[0]; // includes the eol splitter; empty if no match
        if (left.length == 0)
          break;
        // The first line of a chunk is completed by the carried-over tail.
        if (auto result = dg(tail ~ left))
          return result;
        tail = R.init;
        rest = split[1];
      }

      // No further newline: keep the remainder for the next chunk.
      tail = tail ~ rest;
    }

    // Data after the last newline forms the final line.
    return tail.length > 0 ? dg(tail) : 0;
  }
}

unittest {

  import std.algorithm.comparison : equal;

  // writeln("Testing GzipbyLine");
  int[] a = [ 1, 2, 4, 7, 7, 2, 4, 7, 3, 5];
  auto b = findSplitAfter(a, [7]);
  assert(equal(b[0],[1, 2, 4, 7]));
  assert(equal(b[1],[7, 2, 4, 7, 3, 5]));
  auto b1 = findSplitAfter(b[1], [7]);
  assert(equal(b1[0],[7]));
  assert(equal(b1[1],[2, 4, 7, 3, 5]));
  auto b2 = findSplitAfter([2, 4, 3], [7]);
  assert(equal(b2[0],cast(ubyte[])[]));
  assert(equal(b2[1],[2,4,3]));

  uint lines = 0;
  uint chars = 0;
  foreach(ubyte[] s; GzipbyLine!(ubyte[])("../test/data/BXD_geno.txt.gz")) {
    // test file contains 7320 lines 4707218 characters
    // write(cast(string)s);
    chars += s.length;
    lines += 1;
  }
  assert(chars == 4707218,"chars " ~ to!string(chars));
  assert(lines == 7320,"lines " ~ to!string(lines));
}

unittest {
  // Self-contained check on a small generated gzip file: line splitting
  // and early exit from the foreach loop.
  import std.file : removeFile = remove, tempDir, writeFile = write;
  import std.path : buildPath;
  import std.zlib : Compress, HeaderFormat;

  auto path = buildPath(tempDir(), "biod_gzipbyline_unittest.gz");
  auto packer = new Compress(HeaderFormat.gzip);
  auto payload = (cast(const(ubyte)[]) packer.compress("one\ntwo\nthree")).dup;
  payload ~= cast(const(ubyte)[]) packer.flush();
  writeFile(path, payload);
  scope(exit) removeFile(path);

  string[] seen;
  foreach (ubyte[] s; GzipbyLine!(ubyte[])(path)) {
    seen ~= cast(string) s.idup;
  }
  assert(seen == ["one\n", "two\n", "three"]);

  // break must stop the iteration after the first line
  size_t visited = 0;
  foreach (ubyte[] s; GzipbyLine!(ubyte[])(path)) {
    visited++;
    break;
  }
  assert(visited == 1, "visited " ~ to!string(visited));
}