1 /* 2 This file is part of BioD. 3 Copyright (C) 2012-2017 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 module bio.std.hts.bam.bai.indexing; 25 26 import bio.std.hts.bam.reader; 27 import bio.std.hts.bam.readrange; 28 import bio.std.hts.bam.constants; 29 30 import bio.std.hts.bam.bai.bin; 31 import bio.core.bgzf.chunk; 32 33 import contrib.undead.stream; 34 import std.array; 35 import std.algorithm; 36 import std.system; 37 import std.exception; 38 import core.stdc..string; 39 40 // Suppose we have an alignment which covers bases on a reference, 41 // starting from one position and ending at another position. 42 // In order to build linear index, we need to find to which windows 43 // the two positions correspond. 44 // 45 // 46 // (K = 16384) 47 // 48 // [0, K)[K, 2K)[2K, 3K)... <- windows 49 // [.......) <- alignment 50 // 51 private size_t toLinearIndexOffset(int position) { 52 return position < 0 ? 0 : position / BAI_LINEAR_INDEX_WINDOW_SIZE; 53 } 54 55 /// 56 struct IndexBuilder { 57 private { 58 // array of linear offsets for the current reference entry 59 VirtualOffset[] _linear_index; 60 // (maximum index in _linear_index where data was written) + 1 61 size_t _linear_index_write_length; 62 63 static struct PreviousRead { 64 int ref_id = -1; 65 int position; 66 int end_position; 67 int basesCovered() { return end_position - position; } 68 Bin bin; 69 bool is_unmapped; 70 71 private char[256] _name_buf; // by spec., name length is <= 255 chars 72 string name; 73 74 VirtualOffset start_virtual_offset; 75 VirtualOffset end_virtual_offset; 76 } 77 78 PreviousRead _prev_read; 79 80 Stream _stream; 81 int _n_refs; 82 83 ulong _no_coord = 0; 84 // metadata for each reference 85 ulong _beg_vo = -1UL; 86 ulong _end_vo = 0; 87 ulong _unmapped = 0; 88 ulong _mapped = 0; 89 90 bool _first_read = true; 91 92 // map: bin ID -> array of chunks 93 Chunk[][uint] _chunks; 94 95 VirtualOffset _current_chunk_beg; // start of current chunk 96 97 // no metadata for empty references 98 void writeEmptyReference() { 99 _stream.write(cast(int)0); // n_bins 100 _stream.write(cast(int)0); // n_intv 101 } 102 103 void updateLastReadInfo(ref BamReadBlock read) { 104 with (_prev_read) { 105 ref_id = read.ref_id; 106 position = read.position; 107 end_position = position + read.basesCovered(); 108 bin = read.bin; 109 is_unmapped = read.is_unmapped; 110 _name_buf[0 .. read.name.length] = read.name[]; 111 name = cast(string)_name_buf[0 .. read.name.length]; 112 start_virtual_offset = read.start_virtual_offset; 113 end_virtual_offset = read.end_virtual_offset; 114 } 115 } 116 117 void updateMetadata(ref BamReadBlock read) { 118 if (read.ref_id == -1) { 119 ++_no_coord; 120 } else { 121 if (read.is_unmapped) { 122 ++_unmapped; 123 } else { 124 ++_mapped; 125 } 126 127 if (_beg_vo == -1UL) 128 _beg_vo = cast(ulong)read.start_virtual_offset; 129 _end_vo = cast(ulong)read.end_virtual_offset; 130 } 131 } 132 133 void updateLinearIndex() { 134 assert(_prev_read.ref_id >= 0); 135 136 size_t beg, end; 137 138 if (_prev_read.is_unmapped) { 139 end = beg = toLinearIndexOffset(_prev_read.position); 140 } else { 141 beg = toLinearIndexOffset(_prev_read.position); 142 end = toLinearIndexOffset(_prev_read.position + _prev_read.basesCovered() - 1); 143 } 144 145 debug { 146 import std.stdio; 147 if (end >= _linear_index.length) { 148 writeln("beg: ", beg); 149 writeln("end: ", end); 150 writeln("pos: ", _prev_read.position); 151 writeln("bases: ", _prev_read.basesCovered()); 152 } 153 } 154 155 foreach (i; beg .. end + 1) 156 if (_linear_index[i] == 0UL) 157 _linear_index[i] = _prev_read.start_virtual_offset; 158 159 if (end + 1 > _linear_index_write_length) 160 _linear_index_write_length = end + 1; 161 } 162 163 void dumpCurrentLinearIndex() { 164 _stream.write(cast(int)_linear_index_write_length); 165 166 // 167 // There might be untouched places in linear index 168 // with virtual offset equal to zero. 169 // However, it's not a good idea to leave those zeros, 170 // since we can start lookup from the last non-zero virtual offset 171 // encountered before the untouched window. 172 // 173 VirtualOffset last_voffset = 0; 174 175 foreach (voffset; _linear_index[0 .. _linear_index_write_length]) { 176 if (voffset == 0) 177 voffset = last_voffset; 178 else 179 last_voffset = voffset; 180 _stream.write(cast(ulong)voffset); 181 } 182 } 183 184 void dumpCurrentReference() { 185 // +1 because we output dummy bin, too 186 _stream.write(cast(int)(_chunks.length + 1)); 187 188 foreach (bin_id, bin_chunks; _chunks) { 189 if (bin_chunks.length > 0) { 190 _stream.write(cast(uint)bin_id); 191 _stream.write(cast(int)bin_chunks.length); 192 foreach (chunk; bin_chunks) { 193 _stream.write(cast(ulong)chunk.beg); 194 _stream.write(cast(ulong)chunk.end); 195 } 196 } 197 } 198 _stream.write(cast(uint)37450); 199 _stream.write(cast(int)2); 200 _stream.write(cast(ulong)_beg_vo); 201 _stream.write(cast(ulong)_end_vo); 202 _stream.write(cast(ulong)_mapped); 203 _stream.write(cast(ulong)_unmapped); 204 205 dumpCurrentLinearIndex(); 206 207 // reset data 208 memset(_linear_index.ptr, 0, _linear_index.length * ulong.sizeof); 209 _linear_index_write_length = 0; 210 _chunks = null; 211 _current_chunk_beg = _prev_read.end_virtual_offset; 212 213 _beg_vo = _end_vo = cast(ulong)_current_chunk_beg; 214 _unmapped = 0; 215 _mapped = 0; 216 } 217 218 // adds chunk to the current bin (which is determined from _prev_read) 219 void updateChunks() { 220 auto current_chunk_end = _prev_read.end_virtual_offset; 221 222 auto bin_id = _prev_read.bin.id; 223 224 if (bin_id !in _chunks) 225 _chunks[bin_id] = []; 226 auto cs = _chunks[bin_id]; 227 228 bool canMergeWithPreviousChunk() { 229 assert(cs.length > 0); 230 auto last_chunk = cs[$ - 1]; 231 232 if (last_chunk.end.coffset == _current_chunk_beg.coffset) 233 return true; 234 235 return false; 236 } 237 238 if (cs.length == 0 || !canMergeWithPreviousChunk()) { 239 auto new_chunk = Chunk(_current_chunk_beg, current_chunk_end); 240 _chunks[_prev_read.bin.id] ~= new_chunk; 241 } else { 242 _chunks[_prev_read.bin.id][$ - 1].end = current_chunk_end; 243 } 244 245 _current_chunk_beg = current_chunk_end; 246 } 247 248 void checkThatBinIsCorrect(ref BamReadBlock read) { 249 if (!check_bins) 250 return; 251 auto expected = reg2bin(read.position, 252 read.position + read.basesCovered()); 253 enforce(read.bin.id == expected, 254 "Bin in read with name '" ~ read.name ~ 255 "' is set incorrectly (" ~ to!string(read.bin.id) ~ 256 " instead of expected " ~ to!string(expected) ~ ")"); 257 } 258 259 void checkThatInputIsSorted(ref BamReadBlock read) { 260 if (_first_read) return; 261 if (read.ref_id == -1) return; // unmapped 262 if (_prev_read.ref_id < read.ref_id) return; 263 264 enforce(read.ref_id == _prev_read.ref_id && 265 read.position >= _prev_read.position, 266 "BAM file is not coordinate-sorted: " ~ 267 "read '" ~ read.name ~ "' (" ~ read.ref_id.to!string ~ ":" ~ read.position.to!string ~ ")" ~ 268 " must be after read '" ~ _prev_read.name ~ "' (" ~ _prev_read.ref_id.to!string ~ ":" ~ _prev_read.position.to!string ~ ")" ~ 269 "' (at virtual offsets " ~ 270 to!string(_prev_read.start_virtual_offset) ~ ", " ~ read.start_virtual_offset.to!string ~ ")"); 271 } 272 } 273 274 /// 275 this(Stream output_stream, int number_of_references) { 276 _stream = new EndianStream(output_stream, Endian.littleEndian); 277 _n_refs = number_of_references; 278 279 size_t size = BAI_MAX_BIN_ID - BAI_MAX_NONLEAF_BIN_ID + 1; 280 _linear_index = new VirtualOffset[](size); 281 282 _stream.writeString(BAI_MAGIC); // write BAI magic string 283 _stream.write(cast(int)_n_refs); // and number of references 284 } 285 286 /// Check that bins are correct. 287 bool check_bins; 288 289 /// Add a read. The reads must be put in coordinate-sorted order. 290 void put(BamReadBlock read) { 291 checkThatInputIsSorted(read); 292 scope(exit) updateMetadata(read); 293 294 if (read.ref_id < 0) 295 return; 296 297 // start position is unavailable, skip 298 if (read.position < 0) 299 return; 300 301 if (_first_read) { 302 updateLastReadInfo(read); 303 _first_read = false; 304 _current_chunk_beg = read.start_virtual_offset; 305 306 if (read.ref_id > 0) 307 foreach (i; 0 .. read.ref_id) 308 writeEmptyReference(); 309 310 return; 311 } 312 313 checkThatBinIsCorrect(read); 314 315 // new reference, so write data for previous one(s) 316 if (read.ref_id > _prev_read.ref_id) { 317 updateLinearIndex(); 318 updateChunks(); 319 dumpCurrentReference(); 320 321 foreach (i; _prev_read.ref_id + 1 .. read.ref_id) 322 writeEmptyReference(); 323 } 324 325 if (read.ref_id == _prev_read.ref_id) { 326 updateLinearIndex(); 327 328 if (read.bin.id != _prev_read.bin.id) 329 updateChunks(); 330 } 331 332 updateLastReadInfo(read); 333 } 334 335 /// Closes the stream 336 void finish() { 337 if (!_first_read) { // at least one was processed 338 assert(_prev_read.ref_id >= 0); 339 updateLinearIndex(); 340 updateChunks(); 341 dumpCurrentReference(); 342 } 343 344 // _prev_read.ref_id == -1 if all are unmapped 345 foreach (i; _prev_read.ref_id + 1 .. _n_refs) 346 writeEmptyReference(); 347 348 _stream.write(cast(ulong)_no_coord); 349 _stream.close(); 350 } 351 } 352 353 /// Writes BAM index to the $(D stream) 354 /// 355 /// Accepts optional $(D progressBarFunc) 356 void createIndex(BamReader bam, Stream stream, bool check_bins=false, 357 void delegate(lazy float p) progressBarFunc=null) 358 { 359 auto n_refs = cast(int)bam.reference_sequences.length; 360 auto index_builder = IndexBuilder(stream, n_refs); 361 index_builder.check_bins = check_bins; 362 auto reads = bam.readsWithProgress!withOffsets(progressBarFunc); 363 foreach (read; reads) 364 index_builder.put(read); 365 index_builder.finish(); 366 }