1 /*
2     This file is part of BioD.
3 
4     Copyright (C) 2018 Pjotr Prins <pjotr.prins@thebird.nl>
5 */
6 
7 module bio.std.decompress;
8 
9 /**
10    Streaming line reader which can be used for gzipped files. Note the
11    current edition (still) uses the garbage collector. It may help to
12    switch it off or to use the BioD decompressor used by bgzf.
13 
14    Conversion can happen between different encodings, provided the
15    line terminator is ubyte = '\n'. GzipbyLine logic is modeled on
16    ByLineImpl and readln function from std.stdio.
17 */
18 
19 import std.algorithm;
20 import std.conv;
21 import std.exception;
22 import std.file;
23 import std.stdio;
24 import std.zlib: UnCompress;
25 
/**
   Streaming reader that decompresses a gzip file chunk by chunk and
   hands the decompressed data to a foreach body one line at a time.

   Each yielded line includes its terminating '\n'. Only the final line
   of the stream may lack one, and an empty final line is not yielded.

   Params:
     R = array type of a line, e.g. ubyte[]; the decompressed bytes are
         cast to R without any validation or transcoding.
*/
struct GzipbyLine(R) {

  File f;                  /// handle on the compressed input file
  UnCompress decompress;   /// zlib inflater; keeps its state across chunks
  R line;                  /// not used by the code in this struct
  ubyte[] uncompressed_buf; /// not used by the code in this struct
  uint _bufsize;           /// number of compressed bytes read per chunk

  /**
     Open gzipfn for reading.

     Params:
       gzipfn  = path of the gzip-compressed file
       bufsize = size in bytes of each compressed chunk read from disk

     Throws: Exception when gzipfn is not a regular file or bufsize is 0;
             the usual std.file / std.stdio exceptions when the path does
             not exist or cannot be opened.
  */
  this(string gzipfn, uint bufsize=0x4000) {
    enforce(gzipfn.isFile, "not a regular file: " ~ gzipfn);
    // Fail here instead of later inside byChunk, which rejects a zero size.
    enforce(bufsize > 0, "bufsize must be larger than 0");
    // Binary mode: compressed data must not go through text-mode
    // translation (matters on Windows, harmless elsewhere).
    f = File(gzipfn,"rb");
    decompress = new UnCompress();
    _bufsize = bufsize;
  }

  @disable this(this); // disable copy semantics;

  /**
     foreach support: calls dg once per line of decompressed data.

     Iteration stops as soon as dg returns a non-zero value (that is how
     break/return/goto inside a foreach body are signalled) and that value
     is returned to the caller.

     Returns: 0 when the whole file was consumed, otherwise the non-zero
              value returned by dg.
  */
  int opApply(scope int delegate(R) dg) {

    R pending; // start of a line whose '\n' has not been seen yet

    foreach (ubyte[] chunk; f.byChunk(_bufsize))
    {
      auto rest = cast(R)decompress.uncompress(chunk);

      // Peel complete lines off the front of the decompressed data.
      // A loop is used (not recursion) so that a chunk holding a very
      // large number of lines cannot exhaust the stack.
      while (true)
      {
        auto split = findSplitAfter(rest,"\n");
        auto left = split[0]; // up to and including '\n'; empty when no '\n' found
        if (left.length == 0) break;
        // The concatenation yields a freshly allocated line for the caller.
        if (auto result = dg(pending ~ left)) return result;
        pending = null;
        rest = split[1];
      }

      // Whatever is left has no terminator yet: carry it to the next chunk.
      pending = pending ~ rest;
    }

    // Last line of the stream when it is not newline-terminated.
    if (pending.length > 0) return dg(pending);
    return 0;
  }
}
69 
unittest {

  import std.algorithm.comparison : equal;

  // Part 1: pin down the findSplitAfter behaviour the line splitter
  // depends on: the needle stays attached to the left-hand part, and a
  // miss yields an empty left part with the whole input on the right.
  int[] numbers = [ 1, 2, 4, 7, 7, 2, 4, 7, 3, 5];

  auto firstCut = findSplitAfter(numbers, [7]);
  assert(equal(firstCut[0], [1, 2, 4, 7]));
  assert(equal(firstCut[1], [7, 2, 4, 7, 3, 5]));

  auto secondCut = findSplitAfter(firstCut[1], [7]);
  assert(equal(secondCut[0], [7]));
  assert(equal(secondCut[1], [2, 4, 7, 3, 5]));

  auto noMatch = findSplitAfter([2, 4, 3], [7]);
  assert(equal(noMatch[0], cast(ubyte[])[]));
  assert(equal(noMatch[1], [2,4,3]));

  // Part 2: stream the gzipped test file and total up what comes out.
  // The file is expected to hold 7320 lines / 4707218 characters.
  uint chars = 0;
  uint lines = 0;
  auto reader = GzipbyLine!(ubyte[])("../test/data/BXD_geno.txt.gz");
  foreach (ubyte[] row; reader) {
    lines += 1;
    chars += row.length;
  }
  assert(chars == 4707218, "chars " ~ to!string(chars));
  assert(lines == 7320, "lines " ~ to!string(lines));
}