1 /* 2 This file is part of BioD. 3 Copyright (C) 2012-2014 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 module bio.core.bgzf.block; 25 26 import bio.std.hts.bam.constants; 27 // import bio.core.utils.memoize; 28 import bio.core.utils.zlib; 29 30 import std.array; 31 import std.conv; 32 import std.algorithm; 33 import std.exception; 34 35 /** 36 Structure representing BGZF block. 37 In general, users shouldn't use it, as it is EXTREMELY low-level. 38 39 Note it is a struct that has support for comparison based 40 on its crc32 value. 
*/
struct BgzfBlock {
    // field types are as in the SAM/BAM specification
    // ushort ~ uint16_t, char ~ uint8_t, uint ~ uint32_t

    public ulong start_offset; /// start offset in the file, in bytes

    /// end offset in the file, in bytes
    public ulong end_offset() @property const {
        return start_offset + bsize + 1;
    }

    public ushort bsize; /// total Block SIZE minus one

    public ushort cdata_size; /// compressed data size

    /// A buffer is used to reduce number of allocations.
    ///
    /// Its size is max(cdata_size, input_size)
    /// Initially, it contains compressed data, but is rewritten
    /// during decompressBgzfBlock -- indeed, who cares about
    /// compressed data after it has been uncompressed?
    public ubyte[] _buffer = void;

    /// The first cdata_size bytes of the buffer.
    /// If block has been already decompressed, result is undefined.
    public inout(ubyte[]) compressed_data() @property inout pure @safe nothrow {
        return _buffer[0 .. cast(size_t)cdata_size];
    }

    public uint crc32;
    public uint input_size; /// size of uncompressed data

    /// Marks a block whose buffer no longer holds the compressed data.
    /// Hashing and comparison assert that this flag is not set.
    bool dirty;

    /// The stored CRC32 value doubles as the hash.
    hash_t toHash() const pure @safe nothrow {
        assert(!dirty);
        return crc32;
    }

    /// Two blocks are equal when opCmp reports no difference.
    ///
    /// Declared const, like opCmp and toHash, so that const blocks
    /// can be compared with == as well.
    bool opEquals(const ref BgzfBlock other) const pure @safe nothrow {
        assert(!dirty);
        return opCmp(other) == 0;
    }

    /// Orders blocks by compressed size first; blocks of the same
    /// compressed size are ordered by comparing their compressed bytes.
    int opCmp(const ref BgzfBlock other) const pure @safe nothrow {
        assert(!dirty);
        if (cdata_size < other.cdata_size)
            return -1;
        if (cdata_size > other.cdata_size)
            return 1;
        return std.algorithm.cmp(compressed_data, other.compressed_data);
    }
}

import std.stdio;

/**
  Struct representing decompressed BgzfBlock

  Start offset is needed to be able to tell current virtual offset,
  and yet be able to decompress blocks in parallel.
*/
struct DecompressedBgzfBlock {
    /* For the class version:
    this(ulong start, ulong end, ubyte[] buf) {
        start_offset = start;
        end_offset = end;
        decompressed_data = buf;
    }
    ~this() {
        stderr.writeln("destroy DecompressedBgzfBlock ",start_offset,":",end_offset," ",decompressed_data.sizeof);
    };
    */

    ulong start_offset;        /// start offset of the source block in the file
    ulong end_offset;          /// end offset of the source block in the file
    ubyte[] decompressed_data; /// uncompressed contents of the block
}

///
// alias Cache!(BgzfBlock, DecompressedBgzfBlock) BgzfBlockCache;

/// Function for BGZF block decompression.
/// Reuses buffer allocated for storing compressed data,
/// i.e. after execution buffer of the passed $(D block)
/// is overwritten with uncompressed data.
///
/// A block whose input_size is zero is treated as the EOF marker and
/// yields an empty data slice without touching zlib.
///
/// Params:
///     block = the compressed block; its _buffer must be able to hold
///             input_size bytes
///
/// Returns: the offsets of the block together with a slice of
///          $(D block._buffer) holding the uncompressed bytes.
///
/// Throws: ZlibException when zlib reports an error; Exception when
///         input_size exceeds BGZF_MAX_BLOCK_SIZE or when the stream
///         inflates to a size other than input_size.
DecompressedBgzfBlock decompressBgzfBlock(BgzfBlock block)
{
    if (block.input_size == 0) {
        return DecompressedBgzfBlock(block.start_offset,
                                     block.start_offset + block.bsize + 1,
                                     cast(ubyte[])[]); // EOF marker
        // TODO: add check for correctness of EOF marker
    }

    /*
    if (cache !is null) {
        auto ptr = cache.lookup(block);
        if (ptr !is null)
            return *ptr;
    }
    */

    int err = void;

    // allocate buffer on the stack
    ubyte[BGZF_MAX_BLOCK_SIZE] uncompressed_buf = void;

    // check that block follows BAM specification
    enforce(block.input_size <= BGZF_MAX_BLOCK_SIZE,
            "Uncompressed block size must be within " ~
            to!string(BGZF_MAX_BLOCK_SIZE) ~ " bytes");

    // for convenience, provide a slice
    auto uncompressed = uncompressed_buf[0 .. block.input_size];

    // set input data
    bio.core.utils.zlib.z_stream zs;
    zs.next_in = cast(typeof(zs.next_in))block.compressed_data;
    zs.avail_in = to!uint(block.compressed_data.length);

    err = bio.core.utils.zlib.inflateInit2(&zs, /* winbits = */-15);
    if (err)
    {
        throw new ZlibException(err);
    }

    // uncompress it into a buffer on the stack
    zs.next_out = cast(typeof(zs.next_out))uncompressed_buf.ptr;
    zs.avail_out = block.input_size;

    err = bio.core.utils.zlib.inflate(&zs, Z_FINISH);

    // Remember how much was produced, then release the zlib state
    // unconditionally, before any of the checks below can throw.
    const produced = zs.total_out;
    const end_err = bio.core.utils.zlib.inflateEnd(&zs);

    if (err != Z_STREAM_END) {
        throw new ZlibException(err);
    }
    if (end_err != Z_OK) {
        throw new ZlibException(end_err);
    }

    // The stack buffer starts out uninitialized, so a stream that ends
    // early would leave garbage in the tail of the result. This has to
    // be checked in release builds too, hence enforce and not assert.
    enforce(produced == block.input_size,
            "Decompressed size of BGZF block does not match its input_size");

    // checksum verification is performed in non-release builds only
    assert(block.crc32 == crc32(0, uncompressed[]));

    /*
    if (cache !is null) {
        BgzfBlock compressed_bgzf_block = block;
        compressed_bgzf_block._buffer = block._buffer.dup;
        DecompressedBgzfBlock decompressed_bgzf_block;
        with (decompressed_bgzf_block) {
            start_offset = block.start_offset;
            end_offset = block.end_offset;
            decompressed_data = uncompressed[].dup;
        }
        cache.put(compressed_bgzf_block, decompressed_bgzf_block);
    }
    */

    // Now copy back to block._buffer, overwriting existing data.
    // It should have enough bytes already allocated.
    assert(block._buffer.length >= block.input_size);
    version(extraVerbose) {
        import std.stdio;
        stderr.writeln("[uncompressed] [write] range: ", block._buffer.ptr,
                       " - ", block._buffer.ptr + block.input_size);
    }
    block._buffer[0 .. block.input_size] = uncompressed[];

    // NOTE: block is passed by value, so the flag is set on this local
    // copy only; the caller's struct keeps its own dirty value.
    block.dirty = true;

    auto decompressed = DecompressedBgzfBlock(block.start_offset, block.end_offset, block._buffer[0 .. block.input_size]);
    return decompressed;
}