1 /*
2     This file is part of BioD.
3     Copyright (C) 2012-2014    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11 
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14 
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 module bio.core.bgzf.block;
25 
26 import bio.std.hts.bam.constants;
27 // import bio.core.utils.memoize;
28 import bio.core.utils.zlib;
29 
30 import std.array;
31 import std.conv;
32 import std.algorithm;
33 import std.exception;
34 
/**
  Structure representing BGZF block.
  In general, users shouldn't use it, as it is EXTREMELY low-level.

  Note it is a struct that supports hashing and comparison:
  the hash is the stored crc32 value, while ordering and equality
  are defined by the compressed payload (its size first, then its bytes).
  All three are only meaningful while the block still holds
  compressed data, i.e. while $(D dirty) is false.
 */
struct BgzfBlock {
    // field types are as in the SAM/BAM specification
    // ushort ~ uint16_t, char ~ uint8_t, uint ~ uint32_t

    public ulong start_offset; /// start offset in the file, in bytes

    /// end offset in the file, in bytes
    /// (start offset plus the total block size, which is bsize + 1)
    public ulong end_offset() @property const {
        return start_offset + bsize + 1;
    }

    public ushort bsize; /// total Block SIZE minus one

    public ushort cdata_size; /// compressed data size

    /// A buffer is used to reduce number of allocations.
    ///
    /// Its size is max(cdata_size, input_size)
    /// Initially, it contains compressed data, but is rewritten
    /// during decompressBgzfBlock -- indeed, who cares about
    /// compressed data after it has been uncompressed?
    public ubyte[] _buffer = void;

    /// The first $(D cdata_size) bytes of the buffer.
    /// If block has been already decompressed, result is undefined.
    public inout(ubyte[]) compressed_data() @property inout pure @safe nothrow {
        return _buffer[0 .. cast(size_t)cdata_size];
    }

    public uint crc32;      /// CRC32 value stored for the block; also serves as its hash
    public uint input_size; /// size of uncompressed data

    /// Set once the buffer no longer holds compressed data;
    /// hashing and comparison assert that it is false.
    bool dirty;

    /// Hash of the block: its crc32 field.
    hash_t toHash() const pure @safe nothrow {
        assert(!dirty);
        return crc32;
    }

    /// Equality: same compressed size and identical compressed bytes.
    ///
    /// Declared const (like opCmp and toHash) so that const and
    /// immutable blocks can be compared as well.
    bool opEquals(const ref BgzfBlock other) const pure @safe nothrow {
        assert(!dirty);
        return opCmp(other) == 0;
    }

    /// Ordering: blocks with less compressed data come first; blocks of
    /// equal compressed size are ordered by lexicographic comparison
    /// of their compressed bytes.
    int opCmp(const ref BgzfBlock other) const pure @safe nothrow {
        assert(!dirty);
        if (cdata_size < other.cdata_size)
            return -1;
        if (cdata_size > other.cdata_size)
            return 1;
        return std.algorithm.cmp(compressed_data, other.compressed_data);
    }
}
94 
95 import std.stdio;
96 
97 /**
98   Struct representing decompressed BgzfBlock
99 
100   Start offset is needed to be able to tell current virtual offset,
101   and yet be able to decompress blocks in parallel.
102  */
struct DecompressedBgzfBlock {
  /* For the class version:
  this(ulong start, ulong end, ubyte[] buf) {
    start_offset = start;
    end_offset = end;
    decompressed_data = buf;
  }
  ~this() {
    stderr.writeln("destroy DecompressedBgzfBlock ",start_offset,":",end_offset," ",decompressed_data.sizeof);
  };
  */

  // NOTE: the field order matters -- decompressBgzfBlock constructs this
  // struct positionally as (start_offset, end_offset, decompressed_data).

  /// Start offset of the originating compressed block
  /// (copied from BgzfBlock.start_offset by decompressBgzfBlock).
  ulong start_offset;
  /// End offset of the originating compressed block
  /// (BgzfBlock.end_offset, i.e. start_offset + bsize + 1).
  ulong end_offset;
  /// Uncompressed contents of the block; empty for a block whose
  /// input_size is zero (treated as the EOF marker by decompressBgzfBlock).
  ubyte[] decompressed_data;
}
119 
120 ///
121 // alias Cache!(BgzfBlock, DecompressedBgzfBlock) BgzfBlockCache;
122 
/// Function for BGZF block decompression.
///
/// The compressed payload of $(D block) is inflated as a raw deflate
/// stream into a temporary buffer on the stack and then copied back into
/// $(D block._buffer). I.e. the buffer allocated for storing compressed
/// data is reused: after execution it is overwritten with uncompressed data.
///
/// Params:
///     block = block to decompress. It is passed by value, but its
///             $(D _buffer) slice still refers to the caller's memory,
///             so the caller's buffer contents are replaced. The
///             $(D dirty) flag is set on the local copy only.
///
/// Returns: the block offsets together with a slice of $(D block._buffer)
///          holding $(D block.input_size) uncompressed bytes; an empty
///          slice when $(D block.input_size) is zero (EOF marker).
///
/// Throws:
///     ZlibException if zlib reports an error;
///     Exception (via enforce) if the declared uncompressed size exceeds
///     BGZF_MAX_BLOCK_SIZE, if the stream yields a different number of
///     bytes than declared, or if the block buffer is too small to
///     receive the uncompressed data.
DecompressedBgzfBlock decompressBgzfBlock(BgzfBlock block)
{
    if (block.input_size == 0) {
      // EOF marker
      // TODO: add check for correctness of EOF marker
      return DecompressedBgzfBlock(block.start_offset,
                                   block.end_offset,
                                   cast(ubyte[])[]);
    }

    /*
    if (cache !is null) {
        auto ptr = cache.lookup(block);
        if (ptr !is null)
            return *ptr;
    }
    */

    // allocate buffer on the stack (deliberately left uninitialized)
    ubyte[BGZF_MAX_BLOCK_SIZE] uncompressed_buf = void;

    // check that block follows BAM specification
    enforce(block.input_size <= BGZF_MAX_BLOCK_SIZE,
            "Uncompressed block size must be within " ~
            to!string(BGZF_MAX_BLOCK_SIZE) ~ " bytes");

    // for convenience, provide a slice
    auto uncompressed = uncompressed_buf[0 .. block.input_size];

    // set input data
    bio.core.utils.zlib.z_stream zs;
    zs.next_in = cast(typeof(zs.next_in))block.compressed_data;
    zs.avail_in = to!uint(block.compressed_data.length);

    // negative window bits: raw deflate data without zlib/gzip wrapper
    int err = bio.core.utils.zlib.inflateInit2(&zs, /* winbits = */-15);
    if (err)
    {
        throw new ZlibException(err);
    }

    // uncompress it into a buffer on the stack
    zs.next_out = cast(typeof(zs.next_out))uncompressed_buf.ptr;
    zs.avail_out = block.input_size;

    err = bio.core.utils.zlib.inflate(&zs, Z_FINISH);

    // Remember how much was produced and release the zlib state exactly
    // once, before any exception can be thrown.
    immutable produced = zs.total_out;
    immutable end_err = bio.core.utils.zlib.inflateEnd(&zs);

    if (err != Z_STREAM_END) {
        throw new ZlibException(err);
    }
    if (end_err != Z_OK) {
        throw new ZlibException(end_err);
    }

    // A stream that ends early would leave the tail of the uninitialized
    // stack buffer in the result, so this must hold in release builds too.
    enforce(produced == block.input_size,
            "Uncompressed BGZF block size does not match the size stored in the block");

    // Debug-only integrity check (not performed when asserts are disabled).
    assert(block.crc32 == crc32(0, uncompressed[]));

    /*
    if (cache !is null) {
        BgzfBlock compressed_bgzf_block = block;
        compressed_bgzf_block._buffer = block._buffer.dup;
        DecompressedBgzfBlock decompressed_bgzf_block;
        with (decompressed_bgzf_block) {
            start_offset = block.start_offset;
            end_offset = block.end_offset;
            decompressed_data = uncompressed[].dup;
        }
        cache.put(compressed_bgzf_block, decompressed_bgzf_block);
    }
    */

    // Now copy back to block._buffer, overwriting existing data.
    // It should have enough bytes already allocated; verify it explicitly
    // because array bounds checks may be disabled in release builds.
    enforce(block._buffer.length >= block.input_size,
            "BGZF block buffer is too small to hold uncompressed data");
    version(extraVerbose) {
        import std.stdio;
        stderr.writeln("[uncompressed] [write] range: ", block._buffer.ptr,
                       " - ", block._buffer.ptr + block.input_size);
    }
    block._buffer[0 .. block.input_size] = uncompressed[];
    // NOTE: `block` is a by-value copy, so this marks only the local copy;
    // the caller's struct keeps its own `dirty` value.
    block.dirty = true;

    return DecompressedBgzfBlock(block.start_offset,
                                 block.end_offset,
                                 block._buffer[0 .. block.input_size]);
}