1 /* 2 This file is part of BioD. 3 Copyright (C) 2012-2014 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 module bio.core.bgzf.block; 25 26 import bio.bam.constants; 27 import bio.core.utils.memoize; 28 import bio.core.utils.zlib; 29 30 import std.array; 31 import std.conv; 32 import std.algorithm; 33 import std.exception; 34 35 /** 36 Structure representing BGZF block. 37 In general, users shouldn't use it, as it is EXTREMELY low-level. 38 */ 39 struct BgzfBlock { 40 // field types are as in the SAM/BAM specification 41 // ushort ~ uint16_t, char ~ uint8_t, uint ~ uint32_t 42 43 public ulong start_offset; /// start offset in the file, in bytes 44 45 /// end offset in the file, in bytes 46 public ulong end_offset() @property const { 47 return start_offset + bsize + 1; 48 } 49 50 public ushort bsize; /// total Block SIZE minus one 51 52 public ushort cdata_size; /// compressed data size 53 54 /// A buffer is used to reduce number of allocations. 55 /// 56 /// Its size is max(cdata_size, input_size) 57 /// Initially, it contains compressed data, but is rewritten 58 /// during decompressBgzfBlock -- indeed, who cares about 59 /// compressed data after it has been uncompressed? 60 public ubyte[] _buffer = void; 61 62 /// If block has been already decompressed, result is undefined. 63 public inout(ubyte[]) compressed_data() @property inout pure @safe nothrow { 64 return _buffer[0 .. cast(size_t)cdata_size]; 65 } 66 67 public uint crc32; 68 public uint input_size; /// size of uncompressed data 69 70 bool dirty; 71 72 hash_t toHash() const pure @safe nothrow { 73 assert(!dirty); 74 return crc32; 75 } 76 77 bool opEquals(const ref BgzfBlock other) pure @safe nothrow { 78 assert(!dirty); 79 return opCmp(other) == 0; 80 } 81 82 int opCmp(const ref BgzfBlock other) const pure @safe nothrow { 83 assert(!dirty); 84 if (cdata_size < other.cdata_size) 85 return -1; 86 if (cdata_size > other.cdata_size) 87 return 1; 88 return std.algorithm.cmp(compressed_data, other.compressed_data); 89 } 90 } 91 92 /** 93 Struct representing decompressed BgzfBlock 94 95 Start offset is needed to be able to tell current virtual offset, 96 and yet be able to decompress blocks in parallel. 97 */ 98 struct DecompressedBgzfBlock { 99 ulong start_offset; 100 ulong end_offset; 101 ubyte[] decompressed_data; 102 } 103 104 /// 105 alias Cache!(BgzfBlock, DecompressedBgzfBlock) BgzfBlockCache; 106 107 /// Function for BGZF block decompression. 108 /// Reuses buffer allocated for storing compressed data, 109 /// i.e. after execution buffer of the passed $(D block) 110 /// is overwritten with uncompressed data. 111 DecompressedBgzfBlock decompressBgzfBlock(BgzfBlock block, 112 BgzfBlockCache cache=null) 113 { 114 if (block.input_size == 0) { 115 return DecompressedBgzfBlock(block.start_offset, 116 block.start_offset + block.bsize + 1, 117 cast(ubyte[])[]); // EOF marker 118 // TODO: add check for correctness of EOF marker 119 } 120 121 if (cache !is null) { 122 auto ptr = cache.lookup(block); 123 if (ptr !is null) 124 return *ptr; 125 } 126 127 int err = void; 128 129 // allocate buffer on the stack 130 ubyte[BGZF_MAX_BLOCK_SIZE] uncompressed_buf = void; 131 132 // check that block follows BAM specification 133 enforce(block.input_size <= BGZF_MAX_BLOCK_SIZE, 134 "Uncompressed block size must be within " ~ 135 to!string(BGZF_MAX_BLOCK_SIZE) ~ " bytes"); 136 137 // for convenience, provide a slice 138 auto uncompressed = uncompressed_buf[0 .. block.input_size]; 139 140 // set input data 141 bio.core.utils.zlib.z_stream zs; 142 zs.next_in = cast(typeof(zs.next_in))block.compressed_data; 143 zs.avail_in = to!uint(block.compressed_data.length); 144 145 err = bio.core.utils.zlib.inflateInit2(&zs, /* winbits = */-15); 146 if (err) 147 { 148 throw new ZlibException(err); 149 } 150 151 // uncompress it into a buffer on the stack 152 zs.next_out = cast(typeof(zs.next_out))uncompressed_buf.ptr; 153 zs.avail_out = block.input_size; 154 155 err = bio.core.utils.zlib.inflate(&zs, Z_FINISH); 156 switch (err) 157 { 158 case Z_STREAM_END: 159 assert(zs.total_out == block.input_size); 160 err = bio.core.utils.zlib.inflateEnd(&zs); 161 if (err != Z_OK) { 162 throw new ZlibException(err); 163 } 164 break; 165 default: 166 bio.core.utils.zlib.inflateEnd(&zs); 167 throw new ZlibException(err); 168 } 169 170 assert(block.crc32 == crc32(0, uncompressed[])); 171 172 if (cache !is null) { 173 BgzfBlock compressed_bgzf_block = block; 174 compressed_bgzf_block._buffer = block._buffer.dup; 175 DecompressedBgzfBlock decompressed_bgzf_block; 176 with (decompressed_bgzf_block) { 177 start_offset = block.start_offset; 178 end_offset = block.end_offset; 179 decompressed_data = uncompressed[].dup; 180 } 181 cache.put(compressed_bgzf_block, decompressed_bgzf_block); 182 } 183 184 // Now copy back to block._buffer, overwriting existing data. 185 // It should have enough bytes already allocated. 186 assert(block._buffer.length >= block.input_size); 187 version(extraVerbose) { 188 import std.stdio; 189 stderr.writeln("[uncompressed] [write] range: ", block._buffer.ptr, 190 " - ", block._buffer.ptr + block.input_size); 191 } 192 block._buffer[0 .. block.input_size] = uncompressed[]; 193 block.dirty = true; 194 195 return DecompressedBgzfBlock(block.start_offset, block.end_offset, 196 block._buffer[0 .. block.input_size]); 197 }