1 /* 2 New style BAM reader. This file is part of Sambamba. 3 Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> 4 5 Sambamba is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 2 of the License, 8 or (at your option) any later version. 9 10 Sambamba is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 18 02111-1307 USA 19 20 */ 21 22 // This is a complete rewrite of Artem Tarasov's original reader. 23 24 module bio2.bam.reader; 25 26 import std.conv; 27 import core.stdc.stdio: fopen, fread, fclose; 28 import std.exception; 29 import std.file; 30 import std.format : format; 31 import std.stdio; 32 import std.string; 33 import std.typecons; 34 import std.bitmanip; 35 36 import bio.bam.cigar; 37 import bio.bam.constants; 38 import bio.core.utils.exception; 39 40 import bio2.bgzf; 41 import bio2.constants; 42 43 import bio2.bam.header; 44 45 template ReadFlags(alias flag) { 46 @property bool is_paired() nothrow { return cast(bool)(flag & 0x1); } 47 /// Each segment properly aligned according to the aligner 48 @property bool is_proper_pair() nothrow { return cast(bool)(flag & 0x2); } 49 @property bool is_unmapped_raw() nothrow { return cast(bool)(flag & 0x4); } 50 @property bool is_mapped_raw() nothrow { return cast(bool)(!(flag & 0x4)); } 51 @property bool mate_is_unmapped() nothrow { return cast(bool)(flag & 0x8); } 52 @property bool is_reverse_strand() nothrow { return cast(bool)(flag & 0x10); } 53 @property bool mate_is_reverse_strand() nothrow { return cast(bool)(flag & 0x20); } 54 @property bool is_first_of_pair() nothrow { return cast(bool)(flag & 0x40); } 55 @property bool is_second_of_pair() nothrow { return cast(bool)(flag & 0x80); } 56 @property bool is_secondary_alignment() nothrow { return cast(bool)(flag & 0x100); } 57 @property bool is_qc_fail() { 58 assert(is_mapped_raw,to!string(this)); 59 return cast(bool)(flag & 0x200); } 60 alias is_qc_fail failed_quality_control; 61 /// PCR or optical duplicate 62 @property bool is_duplicate() nothrow { return cast(bool)(flag & 0x400); } 63 /// Supplementary alignment 64 @property bool is_supplementary() nothrow { return cast(bool)(flag & 0x800); } 65 @property string show_flags() { 66 string res = format("b%b-%d",flag,flag); 67 if (is_paired) res ~= " pair"; 68 if (is_proper_pair) res ~= " proper"; 69 if (is_mapped_raw) res ~= " mapped"; 70 if (is_unmapped_raw) res ~= " unmapped"; 71 if (mate_is_unmapped) res ~= " mate_unmapped"; 72 if (is_reverse_strand) res ~= " rev_strand"; 73 if (mate_is_reverse_strand) res ~= " mate_rev_strand"; 74 if (is_first_of_pair) res ~= " first_of_pair"; 75 if (is_second_of_pair) res ~= " second_of_pair"; 76 if (is_secondary_alignment) res ~= " secondary_aln"; 77 if (is_mapped_raw && is_qc_fail) res ~= " qc_fail"; 78 if (is_duplicate) res ~= " duplicate"; 79 if (is_supplementary) res ~= " suppl"; 80 return res; 81 } 82 } 83 84 template CheckMapped(alias refid) { 85 @property nothrow bool is_unmapped() { 86 return is_unmapped_raw; 87 } 88 @property bool is_mapped() { 89 debug { 90 if (is_mapped_raw) { 91 assert(refid != -1, "ref_id can not be -1 for mapped read"); // BAM spec 92 } 93 } 94 return !is_unmapped_raw; 95 } 96 } 97 98 enum Offset { 99 bin_mq_nl=0, flag_nc=4, flag=6, l_seq=8, next_refID=12, next_pos=16, tlen=20, read_name=24 100 }; 101 102 /** 103 Raw Read buffer containing unparsed data. It should be considered 104 read-only. 105 106 Current implementation is a cluct (class-struct hybrid). The _data 107 pointer is shared when ReadBlob is assigned to another variable 108 (i.e., there is a remote dependency). The advantage is that for 109 each Read data gets allocated on the heap only once. 110 111 All offsets are indexed on init (except for tags). When using 112 fields beyond refid,pos use ProcessReadBlob instead because it 113 caches values. 114 */ 115 116 struct ReadBlob { 117 RefId refid; // -1 is invalid (BAM Spec) 118 GenomePos pos; // 0 coordinate based (BAM spec) 119 private ubyte[] _data; 120 uint offset_cigar=int.max, offset_seq=int.max, offset_qual=int.max; 121 122 mixin ReadFlags!(_flag); 123 mixin CheckMapped!(refid); 124 125 /* 126 this(RefId ref_id, GenomePos read_pos, ubyte[] buf) { 127 refid = ref_id; 128 pos = read_pos; 129 _data = buf; 130 } 131 */ 132 133 @property void cleanup() { 134 delete _data; 135 _data = null; 136 } 137 138 // Turn ReadBlob into class-struct hybrid or a cluct ;) 139 // @disable this(this); // disable copy semantics; 140 141 @property @trusted nothrow private const T fetch(T)(uint raw_offset) { 142 ubyte[] buf = cast(ubyte[])_data[raw_offset..raw_offset+T.sizeof]; 143 return cast(const(T))buf.read!(T,Endian.littleEndian)(); 144 } 145 146 @property @trusted nothrow private const 147 uint _bin_mq_nl() { return fetch!uint(Offset.bin_mq_nl); } 148 @property @trusted nothrow private const 149 uint _flag_nc() { return fetch!uint(Offset.flag_nc); } 150 @property @trusted nothrow private const 151 int sequence_length() { return fetch!int(Offset.l_seq); } 152 @property @trusted nothrow private const 153 int _next_refID() { return fetch!int(Offset.next_refID); } 154 @property @trusted nothrow private const 155 int _next_pos() { return fetch!int(Offset.next_pos); } 156 @property @trusted nothrow private const 157 int _tlen() { return fetch!int(Offset.tlen); } // avoid using TLEN 158 @property @trusted nothrow private const 159 ushort _bin() { return _bin_mq_nl >> 16; } 160 @property @trusted nothrow private const 161 ubyte _mapq() { return (_bin_mq_nl >> 8) & 0xFF; } 162 @property @trusted nothrow private const 163 ubyte _l_read_name() { return _bin_mq_nl & 0xFF; } 164 @property @trusted nothrow private const 165 ushort _flag() { return fetch!ushort(Offset.flag); } 166 @property @trusted nothrow private const 167 ushort _n_cigar_op() { return _flag_nc & 0xFFFF; } 168 @property @trusted nothrow private const 169 uint _read_name_offset() { return Offset.read_name; } 170 @property @trusted nothrow private 171 uint _cigar_offset() { 172 if (offset_cigar == int.max) 173 offset_cigar = Offset.read_name + cast(uint)(_l_read_name * char.sizeof); 174 return offset_cigar; 175 } 176 @property @trusted nothrow private 177 uint _seq_offset() { 178 if (offset_seq == int.max) 179 offset_seq = _cigar_offset + cast(uint)(_n_cigar_op * uint.sizeof); 180 return offset_seq; 181 } 182 @property @trusted nothrow private 183 uint _qual_offset() { 184 if (offset_qual == int.max) 185 offset_qual = _seq_offset + (sequence_length + 1)/2; 186 return offset_qual; 187 } 188 @property @trusted nothrow private 189 uint _tags_offset() { return _qual_offset + sequence_length; } 190 @property @trusted nothrow private 191 ubyte[] read_name() { return _data[_read_name_offset.._cigar_offset]; } 192 @property @trusted nothrow private 193 ubyte[] raw_cigar() { return _data[_cigar_offset.._seq_offset]; } 194 @property @trusted nothrow private 195 ubyte[] raw_sequence() { return _data[_seq_offset.._qual_offset]; } 196 197 alias sequence_length _l_seq; 198 alias _mapq mapping_quality; // MAPQ 199 200 string toString() { 201 return "<** " ~ ReadBlob.stringof ~ " (data size " ~ to!string(_data.length) ~ ") " ~ to!string(refid) ~ ":" ~ to!string(pos) ~ " length " ~ to!string(sequence_length) ~ " flags " ~ show_flags() ~ ">"; 202 } 203 204 } 205 206 /** 207 ProcessReadBlob provides a caching mechanism for ReadBlob fields. Use 208 this when you need to access field/elements multiple times. Note 209 that ProcessReadBlob becomes invalid when ReadBlob goes out of scope. 210 */ 211 struct ProcessReadBlob { 212 private Nullable!ReadBlob _read2; 213 Nullable!int sequence_length2; 214 private Nullable!string sequence2, read_name2; 215 private Nullable!CigarOperations cigar2; 216 private Nullable!GenomePos consumed_reference_bases2; 217 218 mixin ReadFlags!(_flag); 219 mixin CheckMapped!(ref_id); 220 221 this(Nullable!ReadBlob _r) { 222 _read2 = _r; 223 } 224 225 @property void cleanup() { 226 _read2.cleanup; 227 } 228 229 @property nothrow bool isNull() { 230 return _read2.isNull; 231 } 232 233 @property RefId ref_id() { 234 enforce(_read2.is_mapped,"Trying to get ref_id an unmapped read " ~ to!string(_read2)); 235 return _read2.refid; 236 } 237 238 @property RefId raw_ref_id() { 239 return _read2.refid; 240 } 241 242 @property nothrow uint _flag_nc() { 243 return _read2._flag_nc; 244 } 245 246 @property nothrow ushort _flag() { 247 return _read2._flag; 248 } 249 250 alias ref_id refid; 251 252 /// Get the start position on the reference sequence (better use 253 /// start_loc), i.e., the first base that gets consumed in the 254 /// CIGAR. 255 @property GenomePos start_pos() { 256 assert(_read2.is_mapped,"Trying to get pos on an unmapped read"); // BAM spec 257 asserte(_read2.pos < GenomePos.max); 258 return cast(GenomePos)_read2.pos; 259 } 260 261 @property GenomePos raw_start_pos() { 262 return cast(GenomePos)_read2.pos; 263 } 264 265 /// Get the end position on the reference sequence (better use end_loc) 266 @property GenomePos end_pos() { 267 assert(sequence_length > 0, "Trying to get end_pos on an empty read sequence"); 268 assert(!consumed_reference_bases.isNull); 269 return start_pos + consumed_reference_bases; 270 } 271 272 @property GenomePos raw_end_pos() { 273 return raw_start_pos + consumed_reference_bases; 274 } 275 276 @property GenomeLocation start_loc() { 277 return GenomeLocation(ref_id,start_pos); 278 } 279 280 @property GenomeLocation end_loc() { 281 return GenomeLocation(ref_id,end_pos); 282 } 283 284 @property @trusted MappingQuality mapping_quality() { // MAPQ 285 assert(_read2.is_mapped,"Trying to get MAPQ on an unmapped read"); // BAM spec 286 return MappingQuality(_read2.mapping_quality); 287 } 288 289 @property @trusted int tlen() { // do not use 290 return _read2._tlen; 291 } 292 293 @property @trusted GenomePos sequence_length() { 294 if (sequence_length2.isNull) 295 sequence_length2 = _read2.sequence_length; 296 return sequence_length2; 297 } 298 299 /// Count and caches consumed reference bases. Uses raw_cigar to 300 /// avoid a heap allocation. 301 @property @trusted Nullable!GenomePos consumed_reference_bases() { 302 if (consumed_reference_bases2.isNull) { 303 assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec 304 assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec 305 auto raw = cast(uint[]) _read2.raw_cigar(); 306 if (raw.length==1 && raw[0] == '*') 307 return consumed_reference_bases2; // null 308 else { 309 GenomePos bases = 0; 310 for (size_t i = 0; i < raw.length; i++) { 311 auto cigarop = CigarOperation(raw[i]); 312 if (cigarop.is_reference_consuming) 313 bases += cigarop.length; 314 } 315 consumed_reference_bases2 = bases; 316 } 317 } 318 return consumed_reference_bases2; 319 } 320 321 /// Count query consumed bases. Uses raw_cigar to avoid a heap 322 /// allocation. 323 @property @trusted GenomePos consumed_query_bases() { 324 assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec 325 assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec 326 auto raw = cast(uint[]) _read2.raw_cigar(); 327 if (raw.length==1 && raw[0] == '*') 328 return consumed_reference_bases2; // null 329 else { 330 GenomePos bases = 0; 331 for (size_t i = 0; i < raw.length; i++) { 332 auto cigarop = CigarOperation(raw[i]); 333 if (cigarop.is_query_consuming) 334 bases += cigarop.length; 335 } 336 return bases; 337 } 338 } 339 340 /// Return read name as a string. If unavailable returns 341 /// null. Caches name. 342 @property Nullable!string read_name() { 343 if (read_name2.isNull) { 344 assert(_read2.is_mapped,"Trying to get RNAME on an unmapped read"); // BAM spec 345 auto raw = _read2.read_name; 346 if (raw.length == 0 || (raw.length ==1 && raw[0] == '*')) 347 return read_name2; // null 348 assert(raw.length < 255); // BAM spec 349 if (raw[raw.length-1] == 0) // strip trailing C zero 350 raw.length -= 1; 351 read_name2 = Nullable!string(cast(string)raw); 352 } 353 return read_name2; 354 } 355 356 /// Returns Cigar as an array of operations. Returns null if no 357 /// operations. Caches Cigar when there are operations. 358 @property Nullable!CigarOperations cigar() { 359 if (cigar2.isNull) { 360 assert(_read2.is_mapped,"Trying to get CIGAR on an unmapped read"); // BAM spec 361 assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec 362 auto raw = cast(uint[]) _read2.raw_cigar(); 363 if (raw.length==0 || (raw.length==1 && raw[0] == '*')) 364 return cigar2; // null 365 else { 366 auto s = new CigarOperation[raw.length]; // Heap alloc 367 s.length = 0; 368 for (size_t i = 0; i < raw.length; i++) { 369 s ~= CigarOperation(raw[i]); 370 } 371 cigar2 = s; 372 } 373 } 374 return cigar2; 375 } 376 377 /// Return human readable sequence fragment - null if 378 /// undefined. Caches sequence. 379 @property Nullable!string sequence() { 380 if (sequence2.isNull) { // is it cached in sequence2? 381 auto raw = _read2.raw_sequence(); 382 if (raw[0] == '*') { 383 assert(raw.length == 1); 384 return sequence2; // null 385 } 386 auto raw_length = (sequence_length + 1) / 2; 387 char[16] convert = "=ACMGRSVTWYHKDBN"; 388 string s; 389 s.reserve(sequence_length); // Heap alloc 390 for (size_t i = 0; i < sequence_length; i++) { 391 auto is_odd = i % 2; 392 auto nuc = (is_odd ? raw[i/2] & 0b00001111 : (raw[i/2] & 0b11110000) >> 4); 393 s ~= convert[nuc]; 394 } 395 sequence2 = s; 396 } 397 return sequence2; 398 } 399 400 @property ubyte[] toBlob() { 401 return _read2._data; 402 } 403 404 @property string posString() { 405 return (is_mapped ? to!string(ref_id) ~ ":" ~ to!string(start_pos) : "unmapped"); 406 } 407 408 string toString() { 409 // return "<** " ~ ProcessReadBlob.stringof ~ ") " ~ to!string(_read2.refid) ~ ":" ~ to!string(_read2.pos) ~ " length " ~ to!string(sequence_length) ~ ">"; 410 return _read2.get.toString(); 411 } 412 413 } 414 415 /** 416 BamReader2 is used for foreach loops 417 */ 418 419 struct BamReadBlobs { 420 BgzfStream stream; 421 BamHeader header; 422 423 this(string fn) { 424 stream = BgzfStream(fn); 425 } 426 427 int opApply(scope int delegate(ref ReadBlob) dg) { 428 fetch_bam_header(header, stream); 429 // parse the reads 430 while (!stream.eof()) { 431 immutable block_size = stream.read!int(); 432 immutable refid = stream.read!int(); 433 immutable pos = stream.read!int(); 434 435 ubyte[] data = new ubyte[block_size-2*int.sizeof]; // Heap alloc FIXME 436 auto read = ReadBlob(refid,pos,stream.read(data)); 437 dg(read); 438 } 439 return 0; 440 } 441 } 442 443 /** 444 Read streamer - use on single thread only 445 */ 446 447 // import core.memory : pureMalloc; 448 449 struct BamReadBlobStream { 450 BgzfStream stream; 451 BamHeader header; 452 Nullable!ReadBlob readbuf; // points to current read 453 ubyte[] data; // in sync with readbuf 454 455 this(string fn) { 456 stream = BgzfStream(fn); 457 fetch_bam_header(header, stream); 458 if (!empty) 459 popFront(); // preload front 460 } 461 462 bool empty() @property { 463 return stream.eof(); 464 } 465 466 // Returns first read available. If past eof returns null. 467 Nullable!ReadBlob front() { 468 return readbuf; 469 } 470 471 void popFront() { 472 asserte(!empty); // should have been checked for 473 immutable block_size = stream.read!int(); 474 immutable refid = stream.read!int(); 475 immutable pos = stream.read!int(); 476 477 // void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness 478 data = new ubyte[block_size-2*int.sizeof]; 479 readbuf = ReadBlob(refid,pos,stream.read(data)); 480 assert(readbuf._data.ptr == data.ptr); 481 } 482 483 } 484 485 /** 486 Reader - use on single thread only 487 488 This one provides peek support. Peek looks one read ahead in the read stream. 489 */ 490 491 struct BamBlobReader { 492 BgzfStream stream; 493 BamHeader header; 494 Nullable!ReadBlob peekbuf; // points to current read 495 // ubyte[] data; // in sync with peekbuf 496 497 this(string fn) { 498 stream = BgzfStream(fn); 499 fetch_bam_header(header, stream); 500 } 501 502 bool empty() @property { 503 return peekbuf.isNull && stream.eof(); 504 } 505 506 Nullable!ReadBlob peek() { 507 if (peekbuf.isNull && !empty) 508 fetch(); 509 return peekbuf; 510 } 511 512 /// Fetches the next read. If the peekbuf is not empty return that 513 /// first and reset peekbuf. 514 Nullable!ReadBlob fetch() { 515 if (!peekbuf.isNull) { 516 auto readbuf = peekbuf; 517 peekbuf = Nullable!ReadBlob(); 518 return readbuf; 519 } 520 asserte(!empty); // should have been checked for 521 immutable block_size = stream.read!int(); 522 immutable refid = stream.read!int(); 523 immutable pos = stream.read!int(); 524 525 // void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness 526 auto data = new ubyte[block_size-2*int.sizeof]; 527 peekbuf = ReadBlob(refid,pos,stream.read(data)); 528 return peekbuf; 529 } 530 531 /// Returns the next matching read. Otherwise null 532 /// 533 /// Example: 534 /// 535 /// auto readbuf = ProcessReadBlob(stream.read_if!ProcessReadBlob((r) => !remove && r.is_mapped)); 536 /* 537 Nullable!ReadBlob read_if(R)(bool delegate(R r) is_match) { 538 while(!empty()) { 539 auto readbuf = read(); 540 if (is_match(R(readbuf))) 541 return readbuf; 542 else 543 return Nullable!ReadBlob(); 544 } 545 return Nullable!ReadBlob(); 546 } 547 */ 548 }