1 /*
2     This file is part of BioD.
3     Copyright (C) 2012-2014    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11     
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14     
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 module bio.core.bgzf.block;
25 
26 import bio.bam.constants;
27 import bio.core.utils.memoize;
28 import bio.core.utils.zlib;
29 
30 import std.array;
31 import std.conv;
32 import std.algorithm;
33 import std.exception;
34 
35 /**
36   Structure representing BGZF block.
37   In general, users shouldn't use it, as it is EXTREMELY low-level.
38  */
39 struct BgzfBlock {
40     // field types are as in the SAM/BAM specification
41     // ushort ~ uint16_t, char ~ uint8_t, uint ~ uint32_t
42 
43     public ulong start_offset; /// start offset in the file, in bytes
44 
45     /// end offset in the file, in bytes
46     public ulong end_offset() @property const {
47         return start_offset + bsize + 1;
48     }
49 
50     public ushort bsize; /// total Block SIZE minus one
51 
52     public ushort cdata_size; /// compressed data size
53 
54     /// A buffer is used to reduce number of allocations.
55     ///
56     /// Its size is max(cdata_size, input_size)
57     /// Initially, it contains compressed data, but is rewritten
58     /// during decompressBgzfBlock -- indeed, who cares about
59     /// compressed data after it has been uncompressed?
60     public ubyte[] _buffer = void;
61 
62     /// If block has been already decompressed, result is undefined.
63     public inout(ubyte[]) compressed_data() @property inout pure @safe nothrow {
64         return _buffer[0 .. cast(size_t)cdata_size];
65     }
66 
67     public uint crc32;
68     public uint input_size; /// size of uncompressed data
69 
70     bool dirty;
71 
72     hash_t toHash() const pure @safe nothrow {
73         assert(!dirty);
74         return crc32;
75     }
76 
77     bool opEquals(const ref BgzfBlock other) pure @safe nothrow {
78         assert(!dirty);
79         return opCmp(other) == 0;
80     }
81 
82     int opCmp(const ref BgzfBlock other) const pure @safe nothrow {
83         assert(!dirty);
84         if (cdata_size < other.cdata_size)
85             return -1;
86         if (cdata_size > other.cdata_size)
87             return 1;
88         return std.algorithm.cmp(compressed_data, other.compressed_data);
89     }
90 }
91 
92 /**
93   Struct representing decompressed BgzfBlock
94 
95   Start offset is needed to be able to tell current virtual offset,
96   and yet be able to decompress blocks in parallel.
97  */
98 struct DecompressedBgzfBlock {
99     ulong start_offset;
100     ulong end_offset;
101     ubyte[] decompressed_data;
102 }
103 
104 ///
105 alias Cache!(BgzfBlock, DecompressedBgzfBlock) BgzfBlockCache;
106 
107 /// Function for BGZF block decompression.
108 /// Reuses buffer allocated for storing compressed data,
109 /// i.e. after execution buffer of the passed $(D block)
110 /// is overwritten with uncompressed data.
111 DecompressedBgzfBlock decompressBgzfBlock(BgzfBlock block,
112                                           BgzfBlockCache cache=null)
113 {
114     if (block.input_size == 0) {
115         return DecompressedBgzfBlock(block.start_offset, 
116                                      block.start_offset + block.bsize + 1,
117                                      cast(ubyte[])[]); // EOF marker
118         // TODO: add check for correctness of EOF marker
119     }
120 
121     if (cache !is null) {
122         auto ptr = cache.lookup(block);
123         if (ptr !is null)
124             return *ptr;
125     }
126 
127     int err = void;
128 
129     // allocate buffer on the stack
130     ubyte[BGZF_MAX_BLOCK_SIZE] uncompressed_buf = void;
131 
132     // check that block follows BAM specification
133     enforce(block.input_size <= BGZF_MAX_BLOCK_SIZE, 
134             "Uncompressed block size must be within " ~ 
135             to!string(BGZF_MAX_BLOCK_SIZE) ~ " bytes");
136 
137     // for convenience, provide a slice
138     auto uncompressed = uncompressed_buf[0 .. block.input_size];
139 
140     // set input data
141     bio.core.utils.zlib.z_stream zs;
142     zs.next_in = cast(typeof(zs.next_in))block.compressed_data;
143     zs.avail_in = to!uint(block.compressed_data.length);
144 
145     err = bio.core.utils.zlib.inflateInit2(&zs, /* winbits = */-15);
146     if (err)
147     {
148         throw new ZlibException(err);
149     }
150 
151     // uncompress it into a buffer on the stack
152     zs.next_out = cast(typeof(zs.next_out))uncompressed_buf.ptr;
153     zs.avail_out = block.input_size;
154 
155     err = bio.core.utils.zlib.inflate(&zs, Z_FINISH);
156     switch (err)
157     {
158         case Z_STREAM_END:
159             assert(zs.total_out == block.input_size);
160             err = bio.core.utils.zlib.inflateEnd(&zs);
161             if (err != Z_OK) {
162                 throw new ZlibException(err);
163             }
164             break;
165         default:
166             bio.core.utils.zlib.inflateEnd(&zs);
167             throw new ZlibException(err);
168     }
169 
170     assert(block.crc32 == crc32(0, uncompressed[]));
171 
172     if (cache !is null) {
173         BgzfBlock compressed_bgzf_block = block;
174         compressed_bgzf_block._buffer = block._buffer.dup;
175         DecompressedBgzfBlock decompressed_bgzf_block;
176         with (decompressed_bgzf_block) {
177             start_offset = block.start_offset;
178             end_offset = block.end_offset;
179             decompressed_data = uncompressed[].dup;
180         }
181         cache.put(compressed_bgzf_block, decompressed_bgzf_block);
182     }
183 
184     // Now copy back to block._buffer, overwriting existing data.
185     // It should have enough bytes already allocated.
186     assert(block._buffer.length >= block.input_size);
187     version(extraVerbose) {
188         import std.stdio;
189         stderr.writeln("[uncompressed] [write] range: ", block._buffer.ptr,
190                        " - ", block._buffer.ptr + block.input_size);
191     }
192     block._buffer[0 .. block.input_size] = uncompressed[];
193     block.dirty = true;
194 
195     return DecompressedBgzfBlock(block.start_offset, block.end_offset,
196                                  block._buffer[0 .. block.input_size]);
197 }