1 /* 2 New style BAM reader. This file is part of Sambamba. 3 Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> 4 5 Sambamba is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 2 of the License, 8 or (at your option) any later version. 9 10 Sambamba is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 18 02111-1307 USA 19 20 */ 21 22 // This is a complete rewrite of Artem Tarasov's original reader. 23 24 module bio.std.experimental.hts.bam.reader; 25 26 import std.conv; 27 import core.stdc.stdio: fopen, fread, fclose; 28 import std.exception; 29 import std.file; 30 import std.format : format; 31 import std.stdio; 32 import std..string; 33 import std.typecons; 34 import std.bitmanip; 35 36 //TODO remove these dependecies 37 import bio.std.hts.bam.cigar; 38 import bio.std.hts.bam.constants; 39 40 import bio.core.utils.exception; 41 42 import bio.std.experimental.hts.bgzf; 43 import bio.std.experimental.hts.constants; 44 45 import bio.std.experimental.hts.bam.header; 46 47 template ReadFlags(alias flag) { 48 // 0x01: template having multiple segments in sequencing 49 @property bool is_paired() nothrow { return cast(bool)(flag & 0x1); } 50 // 0x2: Each segment properly aligned according to the aligner 51 @property bool is_proper_pair() nothrow { return cast(bool)(flag & 0x2); } 52 // 0x4: Segment unmapped 53 @property bool is_unmapped_raw() nothrow { return cast(bool)(flag & 0x4); } 54 @property bool is_mapped_raw() nothrow { return cast(bool)(!(flag & 0x4)); } 55 // 0x8: Next segment in template unmapped 56 @property bool mate_is_unmapped() nothrow { return cast(bool)(flag & 0x8); } 57 // 0x10: SEQ being reverse complimented 58 @property bool is_reverse_strand() nothrow { return cast(bool)(flag & 0x10); } 59 // 0x20: SEQ of the next segment in the template being reverse complemented 60 @property bool mate_is_reverse_strand() nothrow { return cast(bool)(flag & 0x20); } 61 // 0x40: The first segment in the template 62 @property bool is_first_of_pair() nothrow { return cast(bool)(flag & 0x40); } 63 // 0x80: The last segment in the template 64 @property bool is_second_of_pair() nothrow { return cast(bool)(flag & 0x80); } 65 // 0x100: Secondary segment 66 @property bool is_secondary_alignment() nothrow { return cast(bool)(flag & 0x100); } 67 // 0x200: Not passing filters, such as platform/vendor quality controls 68 @property bool is_qc_fail() { 69 assert(is_mapped_raw,to!string(this)); 70 return cast(bool)(flag & 0x200); } 71 alias is_qc_fail failed_quality_control; 72 /// 0x400: PCR or optical duplicate 73 @property bool is_duplicate() nothrow { return cast(bool)(flag & 0x400); } 74 /// 0x800: Supplementary alignment 75 @property bool is_supplementary() nothrow { return cast(bool)(flag & 0x800); } 76 @property string show_flags() { 77 string res = format("b%b-%d",flag,flag); 78 if (is_paired) res ~= " pair"; 79 if (is_proper_pair) res ~= " proper"; 80 if (is_mapped_raw) res ~= " mapped"; 81 if (is_unmapped_raw) res ~= " unmapped"; 82 if (mate_is_unmapped) res ~= " mate_unmapped"; 83 if (is_reverse_strand) res ~= " rev_strand"; 84 if (mate_is_reverse_strand) res ~= " mate_rev_strand"; 85 if (is_first_of_pair) res ~= " first_of_pair"; 86 if (is_second_of_pair) res ~= " second_of_pair"; 87 if (is_secondary_alignment) res ~= " secondary_aln"; 88 if (is_mapped_raw && is_qc_fail) res ~= " qc_fail"; 89 if (is_duplicate) res ~= " duplicate"; 90 if (is_supplementary) res ~= " suppl"; 91 return res; 92 } 93 } 94 95 template CheckMapped(alias refid) { 96 @property nothrow bool is_unmapped() { 97 return is_unmapped_raw; 98 } 99 @property bool is_mapped() { 100 debug { 101 if (is_mapped_raw) { 102 assert(refid != -1, "ref_id can not be -1 for mapped read"); // BAM spec 103 } 104 } 105 return !is_unmapped_raw; 106 } 107 } 108 109 enum Offset { 110 bin_mq_nl=0, flag_nc=4, flag=6, l_seq=8, next_refID=12, next_pos=16, tlen=20, read_name=24 111 }; 112 113 /** 114 Raw Read buffer containing unparsed data. It should be considered 115 read-only. 116 117 Current implementation is a cluct (class-struct hybrid). The _data 118 pointer is shared when ReadBlob is assigned to another variable 119 (i.e., there is a remote dependency). The advantage is that for 120 each Read data gets allocated on the heap only once. 121 122 All offsets are indexed on init (except for tags). When using 123 fields beyond refid,pos use ProcessReadBlob instead because it 124 caches values. 125 */ 126 127 struct ReadBlob { 128 RefId refid; // -1 is invalid (BAM Spec) 129 GenomePos pos; // 0 coordinate based (BAM spec) 130 private ubyte[] _data; 131 uint offset_cigar=int.max, offset_seq=int.max, offset_qual=int.max; 132 133 mixin ReadFlags!(_flag); 134 mixin CheckMapped!(refid); 135 136 /* 137 this(RefId ref_id, GenomePos read_pos, ubyte[] buf) { 138 refid = ref_id; 139 pos = read_pos; 140 _data = buf; 141 } 142 */ 143 144 @property void cleanup() { 145 destroy(_data); 146 _data = null; 147 } 148 149 // Turn ReadBlob into class-struct hybrid or a cluct ;) 150 // @disable this(this); // disable copy semantics; 151 152 @property @trusted nothrow private const T fetch(T)(uint raw_offset) { 153 ubyte[] buf = cast(ubyte[])_data[raw_offset..raw_offset+T.sizeof]; 154 return cast(const(T))buf.read!(T,Endian.littleEndian)(); 155 } 156 157 @property @trusted nothrow private const 158 uint _bin_mq_nl() { return fetch!uint(Offset.bin_mq_nl); } 159 @property @trusted nothrow private const 160 uint _flag_nc() { return fetch!uint(Offset.flag_nc); } 161 @property @trusted nothrow private const 162 int sequence_length() { return fetch!int(Offset.l_seq); } 163 @property @trusted nothrow private const 164 int _next_refID() { return fetch!int(Offset.next_refID); } 165 @property @trusted nothrow private const 166 int _next_pos() { return fetch!int(Offset.next_pos); } 167 @property @trusted nothrow private const 168 int _tlen() { return fetch!int(Offset.tlen); } // avoid using TLEN 169 @property @trusted nothrow private const 170 ushort _bin() { return _bin_mq_nl >> 16; } 171 @property @trusted nothrow private const 172 ubyte _mapq() { return (_bin_mq_nl >> 8) & 0xFF; } 173 @property @trusted nothrow private const 174 ubyte _l_read_name() { return _bin_mq_nl & 0xFF; } 175 @property @trusted nothrow private const 176 ushort _flag() { return fetch!ushort(Offset.flag); } 177 @property @trusted nothrow private const 178 ushort _n_cigar_op() { return _flag_nc & 0xFFFF; } 179 @property @trusted nothrow private const 180 uint _read_name_offset() { return Offset.read_name; } 181 @property @trusted nothrow private 182 uint _cigar_offset() { 183 if (offset_cigar == int.max) 184 offset_cigar = Offset.read_name + cast(uint)(_l_read_name * char.sizeof); 185 return offset_cigar; 186 } 187 @property @trusted nothrow private 188 uint _seq_offset() { 189 if (offset_seq == int.max) 190 offset_seq = _cigar_offset + cast(uint)(_n_cigar_op * uint.sizeof); 191 return offset_seq; 192 } 193 @property @trusted nothrow private 194 uint _qual_offset() { 195 if (offset_qual == int.max) 196 offset_qual = _seq_offset + (sequence_length + 1)/2; 197 return offset_qual; 198 } 199 @property @trusted nothrow private 200 uint _tags_offset() { return _qual_offset + sequence_length; } 201 @property @trusted nothrow private 202 ubyte[] read_name() { return _data[_read_name_offset.._cigar_offset]; } 203 @property @trusted nothrow private 204 ubyte[] raw_cigar() { return _data[_cigar_offset.._seq_offset]; } 205 @property @trusted nothrow private 206 ubyte[] raw_sequence() { return _data[_seq_offset.._qual_offset]; } 207 208 alias sequence_length _l_seq; 209 alias _mapq mapping_quality; // MAPQ 210 211 string toString() { 212 return "<** " ~ ReadBlob.stringof ~ " (data size " ~ to!string(_data.length) ~ ") " ~ to!string(refid) ~ ":" ~ to!string(pos) ~ " length " ~ to!string(sequence_length) ~ " flags " ~ show_flags() ~ ">"; 213 } 214 215 } 216 217 /** 218 ProcessReadBlob provides a caching mechanism for ReadBlob fields. Use 219 this when you need to access field/elements multiple times. Note 220 that ProcessReadBlob becomes invalid when ReadBlob goes out of scope. 221 */ 222 struct ProcessReadBlob { 223 private Nullable!ReadBlob _read2; 224 Nullable!int sequence_length2; 225 private Nullable!string sequence2, read_name2; 226 private Nullable!CigarOperations cigar2; 227 private Nullable!GenomePos consumed_reference_bases2; 228 229 mixin ReadFlags!(_flag); 230 mixin CheckMapped!(ref_id); 231 232 this(Nullable!ReadBlob _r) { 233 _read2 = _r; 234 } 235 236 @property void cleanup() { 237 _read2.cleanup; 238 } 239 240 @property nothrow bool isNull() { 241 return _read2.isNull; 242 } 243 244 @property RefId ref_id() { 245 enforce(_read2.is_mapped,"Trying to get ref_id an unmapped read " ~ to!string(_read2)); 246 return _read2.refid; 247 } 248 249 @property RefId raw_ref_id() { 250 return _read2.refid; 251 } 252 253 @property nothrow uint _flag_nc() { 254 return _read2._flag_nc; 255 } 256 257 @property nothrow ushort _flag() { 258 return _read2._flag; 259 } 260 261 alias ref_id refid; 262 263 /// Get the start position on the reference sequence (better use 264 /// start_loc), i.e., the first base that gets consumed in the 265 /// CIGAR. 266 @property GenomePos start_pos() { 267 assert(_read2.is_mapped,"Trying to get pos on an unmapped read"); // BAM spec 268 asserte(_read2.pos < GenomePos.max); 269 return cast(GenomePos)_read2.pos; 270 } 271 272 @property GenomePos raw_start_pos() { 273 return cast(GenomePos)_read2.pos; 274 } 275 276 /// Get the end position on the reference sequence (better use end_loc) 277 @property GenomePos end_pos() { 278 assert(sequence_length > 0, "Trying to get end_pos on an empty read sequence"); 279 assert(!consumed_reference_bases.isNull); 280 return start_pos + consumed_reference_bases; 281 } 282 283 @property GenomePos raw_end_pos() { 284 return raw_start_pos + consumed_reference_bases; 285 } 286 287 @property GenomeLocation start_loc() { 288 return GenomeLocation(ref_id,start_pos); 289 } 290 291 @property GenomeLocation end_loc() { 292 return GenomeLocation(ref_id,end_pos); 293 } 294 295 @property @trusted MappingQuality mapping_quality() { // MAPQ 296 assert(_read2.is_mapped,"Trying to get MAPQ on an unmapped read"); // BAM spec 297 return MappingQuality(_read2.mapping_quality); 298 } 299 300 @property @trusted int tlen() { // do not use 301 return _read2._tlen; 302 } 303 304 @property @trusted GenomePos sequence_length() { 305 if (sequence_length2.isNull) 306 sequence_length2 = _read2.sequence_length; 307 return sequence_length2; 308 } 309 310 /// Count and caches consumed reference bases. Uses raw_cigar to 311 /// avoid a heap allocation. 312 @property @trusted Nullable!GenomePos consumed_reference_bases() { 313 if (consumed_reference_bases2.isNull) { 314 assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec 315 assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec 316 auto raw = cast(uint[]) _read2.raw_cigar(); 317 if (raw.length==1 && raw[0] == '*') 318 return consumed_reference_bases2; // null 319 else { 320 GenomePos bases = 0; 321 for (size_t i = 0; i < raw.length; i++) { 322 auto cigarop = CigarOperation(raw[i]); 323 if (cigarop.is_reference_consuming) 324 bases += cigarop.length; 325 } 326 consumed_reference_bases2 = bases; 327 } 328 } 329 return consumed_reference_bases2; 330 } 331 332 /// Count query consumed bases. Uses raw_cigar to avoid a heap 333 /// allocation. 334 @property @trusted GenomePos consumed_query_bases() { 335 assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec 336 assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec 337 auto raw = cast(uint[]) _read2.raw_cigar(); 338 if (raw.length==1 && raw[0] == '*') 339 return consumed_reference_bases2; // null 340 else { 341 GenomePos bases = 0; 342 for (size_t i = 0; i < raw.length; i++) { 343 auto cigarop = CigarOperation(raw[i]); 344 if (cigarop.is_query_consuming) 345 bases += cigarop.length; 346 } 347 return bases; 348 } 349 } 350 351 /// Return read name as a string. If unavailable returns 352 /// null. Caches name. 353 @property Nullable!string read_name() { 354 if (read_name2.isNull) { 355 assert(_read2.is_mapped,"Trying to get RNAME on an unmapped read"); // BAM spec 356 auto raw = _read2.read_name; 357 if (raw.length == 0 || (raw.length ==1 && raw[0] == '*')) 358 return read_name2; // null 359 assert(raw.length < 255); // BAM spec 360 if (raw[raw.length-1] == 0) // strip trailing C zero 361 raw.length -= 1; 362 read_name2 = Nullable!string(cast(string)raw); 363 } 364 return read_name2; 365 } 366 367 /// Returns Cigar as an array of operations. Returns null if no 368 /// operations. Caches Cigar when there are operations. 369 @property Nullable!CigarOperations cigar() { 370 if (cigar2.isNull) { 371 assert(_read2.is_mapped,"Trying to get CIGAR on an unmapped read"); // BAM spec 372 assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec 373 auto raw = cast(uint[]) _read2.raw_cigar(); 374 if (raw.length==0 || (raw.length==1 && raw[0] == '*')) 375 return cigar2; // null 376 else { 377 auto s = new CigarOperation[raw.length]; // Heap alloc 378 s.length = 0; 379 for (size_t i = 0; i < raw.length; i++) { 380 s ~= CigarOperation(raw[i]); 381 } 382 cigar2 = s; 383 } 384 } 385 return cigar2; 386 } 387 388 /// Return human readable sequence fragment - null if 389 /// undefined. Caches sequence. 390 @property Nullable!string sequence() { 391 if (sequence2.isNull) { // is it cached in sequence2? 392 auto raw = _read2.raw_sequence(); 393 if (raw[0] == '*') { 394 assert(raw.length == 1); 395 return sequence2; // null 396 } 397 auto raw_length = (sequence_length + 1) / 2; 398 char[16] convert = "=ACMGRSVTWYHKDBN"; 399 string s; 400 s.reserve(sequence_length); // Heap alloc 401 for (size_t i = 0; i < sequence_length; i++) { 402 auto is_odd = i % 2; 403 auto nuc = (is_odd ? raw[i/2] & 0b00001111 : (raw[i/2] & 0b11110000) >> 4); 404 s ~= convert[nuc]; 405 } 406 sequence2 = s; 407 } 408 return sequence2; 409 } 410 411 @property ubyte[] toBlob() { 412 return _read2._data; 413 } 414 415 @property string posString() { 416 return (is_mapped ? to!string(ref_id) ~ ":" ~ to!string(start_pos) : "unmapped"); 417 } 418 419 string toString() { 420 // return "<** " ~ ProcessReadBlob.stringof ~ ") " ~ to!string(_read2.refid) ~ ":" ~ to!string(_read2.pos) ~ " length " ~ to!string(sequence_length) ~ ">"; 421 return _read2.get.toString(); 422 } 423 424 } 425 426 /** 427 BamReader2 is used for foreach loops 428 */ 429 430 struct BamReadBlobs { 431 BgzfStream stream; 432 BamHeader header; 433 434 this(string fn) { 435 stream = BgzfStream(fn); 436 } 437 438 int opApply(scope int delegate(ref ReadBlob) dg) { 439 fetch_bam_header(header, stream); 440 // parse the reads 441 while (!stream.eof()) { 442 immutable block_size = stream.read!int(); 443 immutable refid = stream.read!int(); 444 immutable pos = stream.read!int(); 445 446 ubyte[] data = new ubyte[block_size-2*int.sizeof]; // Heap alloc FIXME 447 auto read = ReadBlob(refid,pos,stream.read(data)); 448 dg(read); 449 } 450 return 0; 451 } 452 } 453 454 /** 455 Read streamer - use on single thread only 456 */ 457 458 // import core.memory : pureMalloc; 459 460 struct BamReadBlobStream { 461 BgzfStream stream; 462 BamHeader header; 463 Nullable!ReadBlob readbuf; // points to current read 464 ubyte[] data; // in sync with readbuf 465 466 this(string fn) { 467 stream = BgzfStream(fn); 468 fetch_bam_header(header, stream); 469 if (!empty) 470 popFront(); // preload front 471 } 472 473 bool empty() @property { 474 return stream.eof(); 475 } 476 477 // Returns first read available. If past eof returns null. 478 Nullable!ReadBlob front() { 479 return readbuf; 480 } 481 482 void popFront() { 483 asserte(!empty); // should have been checked for 484 immutable block_size = stream.read!int(); 485 immutable refid = stream.read!int(); 486 immutable pos = stream.read!int(); 487 488 // void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness 489 data = new ubyte[block_size-2*int.sizeof]; 490 readbuf = ReadBlob(refid,pos,stream.read(data)); 491 assert(readbuf._data.ptr == data.ptr); 492 } 493 494 } 495 496 /** 497 Reader - use on single thread only 498 499 This one provides peek support. Peek looks one read ahead in the read stream. 500 */ 501 502 struct BamBlobReader { 503 BgzfStream stream; 504 BamHeader header; 505 Nullable!ReadBlob peekbuf; // points to current read 506 // ubyte[] data; // in sync with peekbuf 507 508 this(string fn) { 509 stream = BgzfStream(fn); 510 fetch_bam_header(header, stream); 511 } 512 513 bool empty() @property { 514 return peekbuf.isNull && stream.eof(); 515 } 516 517 Nullable!ReadBlob peek() { 518 if (peekbuf.isNull && !empty) 519 fetch(); 520 return peekbuf; 521 } 522 523 /// Fetches the next read. If the peekbuf is not empty return that 524 /// first and reset peekbuf. 525 Nullable!ReadBlob fetch() { 526 if (!peekbuf.isNull) { 527 auto readbuf = peekbuf; 528 peekbuf = Nullable!ReadBlob(); 529 return readbuf; 530 } 531 asserte(!empty); // should have been checked for 532 immutable block_size = stream.read!int(); 533 immutable refid = stream.read!int(); 534 immutable pos = stream.read!int(); 535 536 // void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness 537 auto data = new ubyte[block_size-2*int.sizeof]; 538 peekbuf = ReadBlob(refid,pos,stream.read(data)); 539 return peekbuf; 540 } 541 542 /// Returns the next matching read. Otherwise null 543 /// 544 /// Example: 545 /// 546 /// auto readbuf = ProcessReadBlob(stream.read_if!ProcessReadBlob((r) => !remove && r.is_mapped)); 547 /* 548 Nullable!ReadBlob read_if(R)(bool delegate(R r) is_match) { 549 while(!empty()) { 550 auto readbuf = read(); 551 if (is_match(R(readbuf))) 552 return readbuf; 553 else 554 return Nullable!ReadBlob(); 555 } 556 return Nullable!ReadBlob(); 557 } 558 */ 559 }