1 /* 2 This file is part of BioD. 3 Copyright (C) 2012-2016 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 /// $(P $(D BamRead) type provides convenient interface for working with SAM/BAM records.) 25 /// 26 /// $(P All flags, tags, and fields can be accessed and modified.) 27 /// 28 /// Examples: 29 /// --------------------------- 30 /// import std.conv; 31 /// ... 32 /// assert(!read.is_unmapped); // check flag 33 /// assert(read.ref_id != -1); // access field 34 /// 35 /// int edit_distance = to!int(read["NM"]); // access tag 36 /// read["NM"] = 0; // modify tag 37 /// read["NM"] = null; // remove tag 38 /// read["NM"] = null; // no-op 39 /// 40 /// foreach (tag, value; read) // iterate over tags 41 /// writeln(tag, " ", value); // and print their keys and values 42 /// 43 /// read.sequence = "AGCAGACTACGTGTGCATAC"; // sets base qualities to 255 44 /// assert(read.base_qualities[0] == 255); 45 /// read.is_unmapped = true; // set flag 46 /// read.ref_id = -1; // set field 47 /// --------------------------- 48 module bio.std.hts.bam.read; 49 50 import bio.core.base; 51 import bio.core.utils.format; 52 53 import bio.std.hts.bam.abstractreader; 54 import bio.std.hts.bam.cigar; 55 import bio.std.hts.bam.writer; 56 import bio.std.hts.bam.tagvalue; 57 import bio.std.hts.bam.bai.bin; 58 59 import bio.std.hts.bam.md.core; 60 61 import bio.std.hts.utils.array; 62 import bio.std.hts.utils.value; 63 import bio.core.utils.switchendianness; 64 65 import bio.std.hts.thirdparty.msgpack : Packer, unpack; 66 67 import std.algorithm; 68 import std.range; 69 import std.conv; 70 import std.format; 71 import std.exception; 72 import std.system; 73 import std.traits; 74 import std.array; 75 import std.bitmanip; 76 import core.stdc.stdlib; 77 78 /** 79 BAM record representation. 80 */ 81 struct BamRead { 82 83 mixin TagStorage; 84 85 /// Reference index in BAM file header 86 @property int ref_id() const nothrow { return _refID; } 87 /// ditto 88 @property void ref_id(int n) { _dup(); _refID = n; } 89 90 /// Reference sequence name ('*' for unmapped reads) 91 @property string ref_name() const nothrow { return _ref_id_to_string(ref_id); } 92 93 /// 0-based leftmost coordinate of the first matching base 94 @property int position() const nothrow { return _pos; } 95 /// ditto 96 @property void position(int n) { _dup(); _pos = n; _recalculate_bin(); } 97 98 /// Indexing bin which this read belongs to. Recalculated when position is changed. 99 @property bio.std.hts.bam.bai.bin.Bin bin() const nothrow { return Bin(_bin); } 100 101 /// Mapping quality. Equals to 255 if not available, otherwise 102 /// equals to rounded -10 * log10(P {mapping position is wrong}). 103 @property ubyte mapping_quality() const nothrow { return _mapq; } 104 /// ditto 105 @property void mapping_quality(ubyte n) { _dup(); _mapq = n; } 106 107 /// Flag bits (should be used on very rare occasions, see flag getters/setters below) 108 @property ushort flag() const nothrow { return _flag; } 109 /// ditto 110 @property void flag(ushort n) { _dup(); _flag = n; } 111 112 /// Sequence length. In fact, sequence.length can be used instead, but that might be 113 /// slower if the compiler is not smart enough to optimize away unrelated stuff. 114 @property int sequence_length() const nothrow { return _l_seq; } 115 116 /// Mate reference ID 117 @property int mate_ref_id() const nothrow { return _next_refID; } 118 /// ditto 119 @property void mate_ref_id(int n) { _dup(); _next_refID = n; } 120 121 /// Mate reference sequence name ('*' for unmapped mates) 122 @property string mate_ref_name() const nothrow { return _ref_id_to_string(_next_refID); } 123 124 /// Mate position 125 @property int mate_position() const nothrow { return _next_pos; } 126 /// ditto 127 @property void mate_position(int n) { _dup(); _next_pos = n; } 128 129 /// Template length 130 @property int template_length() const nothrow { return _tlen; } 131 /// ditto 132 @property void template_length(int n) { _dup(); _tlen = n; } 133 134 // ------------------------ FLAG GETTERS/SETTERS -------------------------------------- // 135 136 /// Template having multiple segments in sequencing 137 @property bool is_paired() const nothrow { return cast(bool)(flag & 0x1); } 138 /// ditto 139 @property void is_paired(bool b) { _setFlag( 0, b); } 140 141 /// Each segment properly aligned according to the aligner 142 @property bool proper_pair() const nothrow { return cast(bool)(flag & 0x2); } 143 /// ditto 144 @property void proper_pair(bool b) { _setFlag( 1, b); } 145 146 /// Segment unmapped 147 @property bool is_unmapped() const nothrow { return cast(bool)(flag & 0x4); } 148 /// ditto 149 @property void is_unmapped(bool b) { _setFlag( 2, b); } 150 151 /// Next segment in the template unmapped 152 @property bool mate_is_unmapped() const nothrow { return cast(bool)(flag & 0x8); } 153 /// ditto 154 @property void mate_is_unmapped(bool b) { _setFlag( 3, b); } 155 156 /// Sequence being reverse complemented 157 @property bool is_reverse_strand() const nothrow { return cast(bool)(flag & 0x10); } 158 /// ditto 159 @property void is_reverse_strand(bool b) { _setFlag( 4, b); } 160 161 /// Sequence of the next segment in the template being reversed 162 @property bool mate_is_reverse_strand() const nothrow { return cast(bool)(flag & 0x20); } 163 /// ditto 164 @property void mate_is_reverse_strand(bool b) { _setFlag( 5, b); } 165 166 /// The first segment in the template 167 @property bool is_first_of_pair() const nothrow { return cast(bool)(flag & 0x40); } 168 /// ditto 169 @property void is_first_of_pair(bool b) { _setFlag( 6, b); } 170 171 /// The last segment in the template 172 @property bool is_second_of_pair() const nothrow { return cast(bool)(flag & 0x80); } 173 /// ditto 174 @property void is_second_of_pair(bool b) { _setFlag( 7, b); } 175 176 /// Secondary alignment 177 @property bool is_secondary_alignment() const nothrow { return cast(bool)(flag & 0x100); } 178 /// ditto 179 @property void is_secondary_alignment(bool b) { _setFlag( 8, b); } 180 181 /// Not passing quality controls 182 @property bool failed_quality_control() const nothrow { return cast(bool)(flag & 0x200); } 183 /// ditto 184 @property void failed_quality_control(bool b) { _setFlag( 9, b); } 185 186 /// PCR or optical duplicate 187 @property bool is_duplicate() const nothrow { return cast(bool)(flag & 0x400); } 188 /// ditto 189 @property void is_duplicate(bool b) { _setFlag(10, b); } 190 191 /// Supplementary alignment 192 @property bool is_supplementary() const nothrow { return cast(bool)(flag & 0x800); } 193 /// ditto 194 @property void is_supplementary(bool b) { _setFlag(11, b); } 195 196 /// Convenience function, returns '+' or '-' indicating the strand. 197 @property char strand() const nothrow { 198 return is_reverse_strand ? '-' : '+'; 199 } 200 201 /// ditto 202 @property void strand(char c) { 203 enforce(c == '-' || c == '+', "Strand must be '-' or '+'"); 204 is_reverse_strand = c == '-'; 205 } 206 207 /// Read name, length must be in 1..255 interval. 208 @property string name() const nothrow { 209 // notice -1: the string is zero-terminated, so we should strip that '\0' 210 return cast(string)(_chunk[_read_name_offset .. _read_name_offset + _l_read_name - 1]); 211 } 212 213 /// ditto 214 @property void name(string new_name) { 215 enforce(new_name.length >= 1 && new_name.length <= 255, 216 "name length must be in 1-255 range"); 217 _dup(); 218 bio.std.hts.utils.array.replaceSlice(_chunk, 219 _chunk[_read_name_offset .. _read_name_offset + _l_read_name - 1], 220 cast(ubyte[])new_name); 221 _l_read_name = cast(ubyte)(new_name.length + 1); 222 } 223 224 /// List of CIGAR operations 225 @property const(CigarOperation)[] cigar() const nothrow { 226 return cast(const(CigarOperation)[])(_chunk[_cigar_offset .. _cigar_offset + 227 _n_cigar_op * CigarOperation.sizeof]); 228 } 229 230 /// ditto 231 @property void cigar(const(CigarOperation)[] c) { 232 enforce(c.length < 65536, "Too many CIGAR operations, must be <= 65535"); 233 _dup(); 234 bio.std.hts.utils.array.replaceSlice(_chunk, 235 _chunk[_cigar_offset .. _cigar_offset + _n_cigar_op * CigarOperation.sizeof], 236 cast(ubyte[])c); 237 238 _n_cigar_op = cast(ushort)(c.length); 239 240 _recalculate_bin(); 241 } 242 243 /// Extended CIGAR where M operators are replaced with =/X based 244 /// on information from MD tag. Throws if the read doesn't have MD 245 /// tag. 246 auto extended_cigar() @property const { 247 Value md = this["MD"]; 248 enforce(md.is_string); 249 return makeExtendedCigar(cigar, mdOperations(*cast(string*)(&md))); 250 } 251 252 /// The number of reference bases covered by this read. 253 /// $(BR) 254 /// Returns 0 if the read is unmapped. 255 int basesCovered() const { 256 257 if (this.is_unmapped) { 258 return 0; // actually, valid alignments should have empty cigar string 259 } 260 261 return reduce!"a + b.length"(0, filter!"a.is_reference_consuming"(cigar)); 262 } 263 264 /// Human-readable representation of CIGAR string (same as in SAM format) 265 string cigarString() const { 266 char[] str; 267 268 // guess size of resulting string 269 str.reserve(_n_cigar_op * 3); 270 271 foreach (cigar_op; cigar) { 272 str ~= to!string(cigar_op.length); 273 str ~= cigar_op.type; 274 } 275 return cast(string)str; 276 } 277 278 private @property inout(ubyte)[] raw_sequence_data() inout nothrow { 279 return _chunk[_seq_offset .. _seq_offset + (_l_seq + 1) / 2]; 280 } 281 282 /// Read-only random-access range for access to sequence data. 283 static struct SequenceResult { 284 285 private size_t _index; 286 private ubyte[] _data = void; 287 private size_t _len = void; 288 private bool _use_first_4_bits = void; 289 290 this(const(ubyte[]) data, size_t len, bool use_first_4_bits=true) { 291 _data = cast(ubyte[])data; 292 _len = len; 293 _use_first_4_bits = use_first_4_bits; 294 } 295 296 /// 297 @property bool empty() const { 298 return _index >= _len; 299 } 300 301 /// 302 @property bio.core.base.Base front() const { 303 return opIndex(0); 304 } 305 306 /// 307 @property bio.core.base.Base back() const { 308 return opIndex(_len - 1); 309 } 310 311 /* 312 I have no fucking idea why this tiny piece of code 313 does NOT get inlined by stupid DMD compiler. 314 315 Therefore I use string mixin instead. 316 (hell yeah! Back to the 90s! C macros rulez!) 317 318 private size_t _getActualPosition(size_t index) const 319 { 320 if (_use_first_4_bits) { 321 // [0 1] [2 3] [4 5] [6 7] ... 322 // | 323 // V 324 // 0 1 2 3 325 return index >> 1; 326 } else { 327 // [. 0] [1 2] [3 4] [5 6] ... 328 // | 329 // V 330 // 0 1 2 3 331 return (index >> 1) + (index & 1); 332 } 333 }*/ 334 335 private static string _getActualPosition(string index) { 336 return "((" ~ index ~") >> 1) + " ~ 337 "(_use_first_4_bits ? 0 : ((" ~ index ~ ") & 1))"; 338 } 339 340 private bool _useFirst4Bits(size_t index) const 341 { 342 auto res = index % 2 == 0; 343 if (!_use_first_4_bits) { 344 res = !res; 345 } 346 return res; 347 } 348 349 /// 350 @property SequenceResult save() const { 351 return SequenceResult(_data[mixin(_getActualPosition("_index")) .. $], 352 _len - _index, 353 _useFirst4Bits(_index)); 354 } 355 356 /// 357 SequenceResult opSlice(size_t i, size_t j) const { 358 return SequenceResult(_data[mixin(_getActualPosition("_index + i")) .. $], 359 j - i, 360 _useFirst4Bits(_index + i)); 361 } 362 363 /// 364 @property bio.core.base.Base opIndex(size_t i) const { 365 auto pos = _index + i; 366 367 if (_use_first_4_bits) 368 { 369 if (pos & 1) 370 return Base.fromInternalCode(_data[pos >> 1] & 0xF); 371 else 372 return Base.fromInternalCode(_data[pos >> 1] >> 4); 373 } 374 else 375 { 376 if (pos & 1) 377 return Base.fromInternalCode(_data[(pos >> 1) + 1] >> 4); 378 else 379 return Base.fromInternalCode(_data[pos >> 1] & 0xF); 380 } 381 382 assert(false); 383 } 384 385 /// ditto 386 @property void opIndexAssign(bio.core.base.Base base, size_t i) { 387 auto pos = _index + i; 388 389 if (_use_first_4_bits) 390 { 391 if (pos & 1) 392 _data[pos >> 1] &= 0xF0, _data[pos >> 1] |= base.internal_code; 393 else 394 _data[pos >> 1] &= 0x0F, _data[pos >> 1] |= base.internal_code << 4; 395 } 396 else 397 { 398 if (pos & 1) 399 _data[(pos >> 1) + 1] &= 0x0F, _data[(pos >> 1) + 1] |= base.internal_code << 4; 400 else 401 _data[pos >> 1] &= 0xF0, _data[pos >> 1] |= base.internal_code; 402 } 403 } 404 405 406 /// 407 void popFront() { 408 ++_index; 409 } 410 411 /// 412 void popBack() { 413 --_len; 414 } 415 416 /// 417 @property size_t length() const { 418 return _len - _index; 419 } 420 421 alias length opDollar; 422 423 void toString(scope void delegate(const(char)[]) dg) const { 424 char[256] buf = void; 425 size_t total = this.length; 426 size_t written = 0; 427 while (written < total) { 428 size_t n = min(buf.length, total - written); 429 foreach (j; 0 .. n) 430 buf[j] = opIndex(written + j).asCharacter; 431 dg(buf[0 .. n]); 432 written += n; 433 } 434 } 435 } 436 437 /// Random-access range of characters 438 @property SequenceResult sequence() const { 439 return SequenceResult(raw_sequence_data, sequence_length); 440 } 441 442 static assert(isRandomAccessRange!(ReturnType!sequence)); 443 444 /// Sets query sequence. Sets all base qualities to 255 (i.e. unknown). 445 @property void sequence(string seq) 446 { 447 _dup(); 448 449 auto raw_length = (seq.length + 1) / 2; 450 // set sequence 451 auto replacement = uninitializedArray!(ubyte[])(raw_length + seq.length); 452 replacement[raw_length .. $] = 0xFF; 453 for (size_t i = 0; i < raw_length; ++i) { 454 replacement[i] = cast(ubyte)(Base(seq[2 * i]).internal_code << 4); 455 456 if (seq.length > 2 * i + 1) 457 replacement[i] |= cast(ubyte)(Base(seq[2 * i + 1]).internal_code); 458 } 459 460 bio.std.hts.utils.array.replaceSlice(_chunk, 461 _chunk[_seq_offset .. _tags_offset], 462 replacement); 463 464 _l_seq = cast(int)seq.length; 465 } 466 467 /// Quality data (phred-based scores) 468 @property inout(ubyte)[] base_qualities() inout nothrow { 469 return _chunk[_qual_offset .. _qual_offset + _l_seq * char.sizeof]; 470 } 471 472 /// Set quality data - array length must be of the same length as the sequence. 473 @property void base_qualities(const(ubyte)[] quality) { 474 enforce(quality.length == _l_seq, "Quality data must be of the same length as sequence"); 475 _dup(); 476 _chunk[_qual_offset .. _qual_offset + _l_seq] = quality; 477 } 478 479 /* 480 Constructs the struct from memory chunk 481 */ 482 this(ubyte[] chunk, bool fix_byte_order=true) { 483 484 // Switching endianness lazily is not a good idea: 485 // 486 // 1) switching byte order is pretty fast 487 // 2) lazy switching for arrays can kill the performance, 488 // it has to be done once 489 // 3) the code will be too complicated, whereas there're 490 // not so many users of big-endian systems 491 // 492 // In summa, BAM is little-endian format, so big-endian 493 // users will suffer anyway, it's unavoidable. 494 495 _chunk = chunk; 496 this._is_slice = true; 497 498 if (fix_byte_order && std.system.endian != Endian.littleEndian) { 499 switchChunkEndianness(); 500 501 // Dealing with tags is the responsibility of TagStorage. 502 fixTagStorageByteOrder(); 503 } 504 } 505 506 // Doesn't touch tags, only fields. 507 // @@@TODO: NEEDS TESTING@@@ 508 private void switchChunkEndianness() { 509 // First 8 fields are 32-bit integers: 510 // 511 // 0) refID int 512 // 1) pos int 513 // 2) bin_mq_nl uint 514 // 3) flag_nc uint 515 // 4) l_seq int 516 // 5) next_refID int 517 // 6) next_pos int 518 // 7) tlen int 519 // ---------------------------------------------------- 520 // (after them name follows which is string) 521 // 522 switchEndianness(_chunk.ptr, 8 * uint.sizeof); 523 524 // Then we need to switch endianness of CIGAR data: 525 switchEndianness(_chunk.ptr + _cigar_offset, 526 _n_cigar_op * uint.sizeof); 527 } 528 529 private size_t calculateChunkSize(string read_name, 530 string sequence, 531 in CigarOperation[] cigar) 532 { 533 return 8 * int.sizeof 534 + (read_name.length + 1) // tailing '\0' 535 + uint.sizeof * cigar.length 536 + ubyte.sizeof * ((sequence.length + 1) / 2) 537 + ubyte.sizeof * sequence.length; 538 } 539 540 /// Construct alignment from basic information about it. 541 /// 542 /// Other fields can be set afterwards. 543 this(string read_name, // info for developers: 544 string sequence, // these 3 fields are needed 545 in CigarOperation[] cigar) // to calculate size of _chunk 546 { 547 enforce(read_name.length < 256, "Too long read name, length must be <= 255"); 548 enforce(cigar.length < 65536, "Too many CIGAR operations, must be <= 65535"); 549 550 if (this._chunk is null) { 551 this._chunk = new ubyte[calculateChunkSize(read_name, sequence, cigar)]; 552 } 553 554 this._refID = -1; // set default values 555 this._pos = -1; // according to SAM/BAM 556 this._mapq = 255; // specification 557 this._next_refID = -1; 558 this._next_pos = -1; 559 this._tlen = 0; 560 561 this._l_read_name = cast(ubyte)(read_name.length + 1); // tailing '\0' 562 this._n_cigar_op = cast(ushort)(cigar.length); 563 this._l_seq = cast(int)(sequence.length); 564 565 // now all offsets can be calculated through corresponding properties 566 567 // set default quality 568 _chunk[_qual_offset .. _qual_offset + sequence.length] = 0xFF; 569 570 // set CIGAR data 571 auto _len = cigar.length * CigarOperation.sizeof; 572 _chunk[_cigar_offset .. _cigar_offset + _len] = cast(ubyte[])(cigar); 573 574 // set read_name 575 auto _offset = _read_name_offset; 576 _chunk[_offset .. _offset + read_name.length] = cast(ubyte[])read_name; 577 _chunk[_offset + read_name.length] = cast(ubyte)'\0'; 578 579 this._is_slice = false; 580 581 this.sequence = sequence; 582 } 583 584 /// Deep copy of the record. 585 BamRead dup() @property const { 586 BamRead result; 587 result._chunk = this._chunk.dup; 588 result._is_slice = false; 589 result._modify_in_place = false; 590 result._reader = cast()_reader; 591 return result; 592 } 593 594 /// Compare two alignments, including tags 595 /// (the tags must follow in the same order for equality). 596 bool opEquals(BamRead other) const pure nothrow { 597 // don't forget about _is_slice trick 598 auto m = _cigar_offset; 599 return _chunk[0 .. m - 1] == other._chunk[0 .. m - 1] && 600 _chunk[m .. $] == other._chunk[m .. $]; 601 } 602 603 bool opEquals(const(BamRead) other) const pure nothrow { 604 return opEquals(cast()other); 605 } 606 607 /// Size of the alignment record when output to stream in BAM format. 608 /// Includes block_size as well (see SAM/BAM specification) 609 @property size_t size_in_bytes() const { 610 return int.sizeof + _chunk.length; 611 } 612 613 package void write(BamWriter writer) { 614 writer.writeInteger(cast(int)(_chunk.length)); 615 616 ubyte old_byte = _chunk[_cigar_offset - 1]; 617 _chunk[_cigar_offset - 1] = 0; 618 619 if (std.system.endian != Endian.littleEndian) { 620 switchChunkEndianness(); 621 writer.writeByteArray(_chunk[0 .. _tags_offset]); 622 switchChunkEndianness(); 623 } else { 624 writer.writeByteArray(_chunk[0 .. _tags_offset]); 625 } 626 627 _chunk[_cigar_offset - 1] = old_byte; 628 629 writeTags(writer); 630 } 631 632 /// Packs message in the following format: 633 /// $(BR) 634 /// MsgPack array with elements 635 /// $(OL 636 /// $(LI name - string) 637 /// $(LI flag - ushort) 638 /// $(LI reference sequence id - int) 639 /// $(LI leftmost mapping position (1-based) - int) 640 /// $(LI mapping quality - ubyte) 641 /// $(LI array of CIGAR operation lengths - int[]) 642 /// $(LI array of CIGAR operation types - ubyte[]) 643 /// $(LI mate reference sequence id - int) 644 /// $(LI mate position (1-based) - int) 645 /// $(LI template length - int) 646 /// $(LI segment sequence - string) 647 /// $(LI phred-base quality - ubyte[]) 648 /// $(LI tags - map: string -> value)) 649 void toMsgpack(Packer)(ref Packer packer) const { 650 packer.beginArray(13); 651 packer.pack(cast(ubyte[])name); 652 packer.pack(flag); 653 packer.pack(ref_id); 654 packer.pack(position + 1); 655 packer.pack(mapping_quality); 656 packer.pack(array(map!"a.length"(cigar))); 657 packer.pack(array(map!"a.type"(cigar))); 658 packer.pack(mate_ref_id); 659 packer.pack(mate_position); 660 packer.pack(template_length); 661 packer.pack(to!string(sequence)); 662 packer.pack(base_qualities); 663 664 packer.beginMap(tagCount()); 665 foreach (key, value; this) { 666 packer.pack(key); 667 packer.pack(value); 668 } 669 } 670 671 /// String representation. 672 /// $(BR) 673 /// Possible formats are SAM ("%s") and JSON ("%j") 674 void toString(scope void delegate(const(char)[]) sink, FormatSpec!char fmt) const { 675 if (size_in_bytes < 10000 && fmt.spec == 's') { 676 auto p = cast(char*)alloca(size_in_bytes * 5); 677 char* end = p; 678 toSam(end); 679 sink(p[0 .. end - p]); 680 } else if (size_in_bytes < 5000 && fmt.spec == 'j') { 681 auto p = cast(char*)alloca(size_in_bytes * 10 + 1000); 682 char* end = p; 683 toJson(end); 684 sink(p[0 .. end - p]); 685 } else if (fmt.spec == 's') { 686 toSam(sink); 687 } else if (fmt.spec == 'j') { 688 toJson(sink); 689 } else { 690 throw new FormatException("unknown format specifier"); 691 } 692 } 693 694 /// ditto 695 void toSam(Sink)(auto ref Sink sink) const 696 if (isSomeSink!Sink) 697 { 698 sink.write(name); 699 sink.write('\t'); 700 sink.write(flag); 701 sink.write('\t'); 702 if (ref_id == -1 || _reader is null) 703 sink.write('*'); 704 else 705 sink.write(_reader.reference_sequences[ref_id].name); 706 707 sink.write('\t'); 708 sink.write(position + 1); 709 sink.write('\t'); 710 sink.write(mapping_quality); 711 sink.write('\t'); 712 713 if (cigar.length == 0) 714 sink.write('*'); 715 else 716 foreach (op; cigar) 717 op.toSam(sink); 718 719 sink.write('\t'); 720 721 if (mate_ref_id == ref_id) { 722 if (mate_ref_id == -1) 723 sink.write("*\t"); 724 else 725 sink.write("=\t"); 726 } else { 727 if (mate_ref_id == -1 || _reader is null) { 728 sink.write("*\t"); 729 } else { 730 auto mate_name = _reader.reference_sequences[mate_ref_id].name; 731 sink.write(mate_name); 732 sink.write("\t"); 733 } 734 } 735 736 sink.write(mate_position + 1); 737 sink.write('\t'); 738 sink.write(template_length); 739 sink.write('\t'); 740 741 if (sequence_length == 0) 742 sink.write('*'); 743 else 744 foreach (char c; sequence) 745 sink.write(c); 746 sink.write('\t'); 747 748 if (base_qualities.length == 0 || base_qualities[0] == 0xFF) 749 sink.write('*'); 750 else 751 foreach (qual; base_qualities) 752 sink.write(cast(char)(qual + 33)); 753 754 foreach (k, v; this) { 755 sink.write('\t'); 756 sink.write(k); 757 sink.write(':'); 758 v.toSam(sink); 759 } 760 } 761 762 /// ditto 763 string toSam()() const { 764 return to!string(this); 765 } 766 767 /// JSON representation 768 void toJson(Sink)(auto ref Sink sink) const 769 if (isSomeSink!Sink) 770 { 771 sink.write(`{"qname":`); sink.writeJson(name); 772 sink.write(`,"flag":`); sink.write(flag); 773 774 sink.write(`,"rname":`); 775 if (ref_id == -1 || _reader is null) 776 sink.write(`"*"`); 777 else 778 sink.writeJson(_reader.reference_sequences[ref_id].name); 779 780 sink.write(`,"pos":`); sink.write(position + 1); 781 sink.write(`,"mapq":`); sink.write(mapping_quality); 782 783 sink.write(`,"cigar":"`); 784 if (cigar.empty) 785 sink.write('*'); 786 else 787 foreach (op; cigar) 788 op.toSam(sink); 789 sink.write('"'); 790 791 sink.write(`,"rnext":`); 792 if (mate_ref_id == ref_id) { 793 if (mate_ref_id == -1) 794 sink.write(`"*"`); 795 else 796 sink.write(`"="`); 797 } else if (mate_ref_id == -1 || _reader is null) { 798 sink.write(`"*"`); 799 } else { 800 sink.writeJson(_reader.reference_sequences[mate_ref_id].name); 801 } 802 803 sink.write(`,"pnext":`); sink.write(mate_position + 1); 804 sink.write(`,"tlen":`); sink.write(template_length); 805 806 sink.write(`,"seq":"`); 807 if (sequence_length == 0) 808 sink.write('*'); 809 else 810 foreach (char c; sequence) 811 sink.write(c); 812 sink.write('"'); 813 814 sink.write(`,"qual":`); 815 sink.writeJson(base_qualities); 816 817 sink.write(`,"tags":{`); 818 819 bool not_first = false; 820 foreach (k, v; this) { 821 if (not_first) 822 sink.write(','); 823 sink.writeJson(k); 824 sink.write(':'); 825 v.toJson(sink); 826 not_first = true; 827 } 828 829 sink.write("}}"); 830 } 831 832 /// ditto 833 string toJson()() const { 834 auto w = appender!(char[])(); 835 toJson((const(char)[] s) { w.put(s); }); 836 return cast(string)w.data; 837 } 838 839 /// Associates read with BAM reader. This is done automatically 840 /// if this read is obtained through BamReader/Reference methods. 841 void associateWithReader(bio.std.hts.bam.abstractreader.IBamSamReader reader) { 842 _reader = reader; 843 } 844 845 /// Associated BAM/SAM reader. 846 inout(bio.std.hts.bam.abstractreader.IBamSamReader) reader() @property inout { 847 return _reader; 848 } 849 850 /// 851 bool is_slice_backed() @property const { 852 return _is_slice; 853 } 854 855 /// Raw representation of the read. Occasionally useful for dirty hacks! 856 inout(ubyte)[] raw_data() @property inout { 857 return _chunk; 858 } 859 860 /// ditto 861 void raw_data(ubyte[] data) @property { 862 _chunk = data; 863 } 864 865 package ubyte[] _chunk; // holds all the data, 866 // the access is organized via properties 867 // (see below) 868 869 private: 870 871 // by specs, name ends with '\0' 872 // let's use this byte for something useful! 873 // 874 // (Of course this places some restrictions on usage, 875 // but allows to reduce size of record.) 876 bool _is_slice() @property const { 877 return cast(bool)(_chunk[_cigar_offset - 1] & 1); 878 } 879 880 void _is_slice(bool is_slice) @property { 881 _chunk[_cigar_offset - 1] &= 0b11111110; 882 _chunk[_cigar_offset - 1] |= (is_slice ? 1 : 0); 883 } 884 885 // don't call _dup() if the record is modified 886 bool _modify_in_place() @property const { 887 return cast(bool)(_chunk[_cigar_offset - 1] & 2); 888 } 889 890 void _modify_in_place(bool in_place) @property { 891 _chunk[_cigar_offset - 1] &= 0b11111101; 892 _chunk[_cigar_offset - 1] |= (in_place ? 2 : 0); 893 } 894 895 IBamSamReader _reader; 896 897 string _ref_id_to_string(int ref_id) const nothrow { 898 if (_reader is null) 899 return "?"; 900 if (ref_id < 0) 901 return "*"; 902 return _reader.reference_sequences[ref_id].name; 903 } 904 905 // Official field names from SAM/BAM specification. 906 // For internal use only 907 @property int _refID() const nothrow { 908 return *(cast( int*)(_chunk.ptr + int.sizeof * 0)); 909 } 910 911 @property int _pos() const nothrow { 912 return *(cast( int*)(_chunk.ptr + int.sizeof * 1)); 913 } 914 915 @property uint _bin_mq_nl() const nothrow pure @system { 916 return *(cast(uint*)(_chunk.ptr + int.sizeof * 2)); 917 } 918 919 @property uint _flag_nc() const nothrow { 920 return *(cast(uint*)(_chunk.ptr + int.sizeof * 3)); 921 } 922 923 @property int _l_seq() const nothrow { 924 return *(cast( int*)(_chunk.ptr + int.sizeof * 4)); 925 } 926 927 @property int _next_refID() const nothrow { 928 return *(cast( int*)(_chunk.ptr + int.sizeof * 5)); 929 } 930 931 @property int _next_pos() const nothrow { 932 return *(cast( int*)(_chunk.ptr + int.sizeof * 6)); 933 } 934 935 @property int _tlen() const nothrow { 936 return *(cast( int*)(_chunk.ptr + int.sizeof * 7)); 937 } 938 939 // Setters, also only for internal use 940 @property void _refID(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 0)) = n; } 941 @property void _pos(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 1)) = n; } 942 @property void _bin_mq_nl(uint n) { *(cast(uint*)(_chunk.ptr + int.sizeof * 2)) = n; } 943 @property void _flag_nc(uint n) { *(cast(uint*)(_chunk.ptr + int.sizeof * 3)) = n; } 944 @property void _l_seq(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 4)) = n; } 945 @property void _next_refID(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 5)) = n; } 946 @property void _next_pos(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 6)) = n; } 947 @property void _tlen(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 7)) = n; } 948 949 // Additional useful properties, also from SAM/BAM specification 950 // 951 // The layout of bin_mq_nl and flag_nc is as follows 952 // (upper bits -------> lower bits): 953 // 954 // bin_mq_nl [ { bin (16b) } { mapping quality (8b) } { read name length (8b) } ] 955 // 956 // flag_nc [ { flag (16b) } { n_cigar_op (16b) } ] 957 // 958 @property ushort _bin() const nothrow { 959 return _bin_mq_nl >> 16; 960 } 961 @property ubyte _mapq() const nothrow { 962 return (_bin_mq_nl >> 8) & 0xFF; 963 } 964 @property ubyte _l_read_name() const nothrow pure { 965 return _bin_mq_nl & 0xFF; 966 } 967 @property ushort _flag() const nothrow { 968 return _flag_nc >> 16; 969 } 970 @property ushort _n_cigar_op() const nothrow { 971 return _flag_nc & 0xFFFF; 972 } 973 974 // Setters for those properties 975 @property void _bin(ushort n) { _bin_mq_nl = (_bin_mq_nl & 0xFFFF) | (n << 16); } 976 @property void _mapq(ubyte n) { _bin_mq_nl = (_bin_mq_nl & ~0xFF00) | (n << 8); } 977 @property void _l_read_name(ubyte n) { _bin_mq_nl = (_bin_mq_nl & ~0xFF ) | n; } 978 @property void _flag(ushort n) { _flag_nc = (_flag_nc & 0xFFFF) | (n << 16); } 979 @property void _n_cigar_op(ushort n) { _flag_nc = (_flag_nc & ~0xFFFF) | n; } 980 981 // Offsets of various arrays in bytes. 982 // Currently, are computed each time, so if speed will be an issue, 983 // they can be made fields instead of properties. 984 @property size_t _read_name_offset() const nothrow pure { 985 return 8 * int.sizeof; 986 } 987 988 @property size_t _cigar_offset() const nothrow pure { 989 return _read_name_offset + _l_read_name * char.sizeof; 990 } 991 992 @property size_t _seq_offset() const nothrow { 993 return _cigar_offset + _n_cigar_op * uint.sizeof; 994 } 995 996 @property size_t _qual_offset() const nothrow { 997 return _seq_offset + (_l_seq + 1) / 2; 998 } 999 1000 // Offset of auxiliary data 1001 @property size_t _tags_offset() const nothrow { 1002 return _qual_offset + _l_seq; 1003 } 1004 1005 // Sets n-th flag bit to boolean value b. 1006 void _setFlag(int n, bool b) { 1007 assert(n < 16); 1008 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching 1009 ushort mask = cast(ushort)(1 << n); 1010 _flag = (_flag & ~cast(int)(mask)) | ((-cast(int)b) & mask); 1011 } 1012 1013 // If _chunk is still a slice, not an array, duplicate it. 1014 // Used when some part of alignment record is modified by user. 1015 // 1016 // Basically, it's sort of copy-on-write: a lot of read-only alignments 1017 // may point to the same location, but every modified one allocates its 1018 // own chunk of memory. 1019 void _dup() { 1020 if (_is_slice && !_modify_in_place) { 1021 _chunk = _chunk.dup; 1022 _is_slice = false; 1023 } 1024 } 1025 1026 public: 1027 // Calculates bin number. 1028 void _recalculate_bin() { 1029 _bin = reg2bin(position, position + basesCovered()); 1030 } 1031 } 1032 1033 1034 /// Lazy tag storage. 1035 /// 1036 /// Provides hash-like access and opportunity to iterate 1037 /// storage like an associative array. 1038 mixin template TagStorage() { 1039 1040 // Provides access to chunk of memory which contains tags. 1041 // This way, every time _tags_offset gets updated 1042 // (due to update of cigar string/read name/sequence and memory move), 1043 // the change is reflected automatically in tag storage. 1044 private @property const(ubyte)[] _tags_chunk() const { 1045 return _chunk[_tags_offset .. $]; 1046 } 1047 1048 /// Hash-like access to tags. Time complexity is $(BIGOH number of tags). 1049 /// $(BR) 1050 /// If tag with such $(I key) is not found, returned value 'is nothing'. 1051 /// $(BR) 1052 /// If key length is different from 2, exception is thrown. 1053 /// $(BR) 1054 /// Special case when $(I value) represents nothing is used for removing tag 1055 /// (assuming that no more than one with this key is presented in the record). 1056 /// 1057 /// Examples: 1058 /// ---------------------------- 1059 /// auto v = read["NM"]; 1060 /// assert(v.is_integer); 1061 /// 1062 /// auto v = read["MN"]; 1063 /// assert(v.is_nothing); // no such tag 1064 /// 1065 /// read["NM"] = 3; // converted to bio.std.hts.bam.tagvalue.Value implicitly 1066 /// 1067 /// read["NM"] = null; // removes tag 1068 /// assert(read["NM"].is_nothing); 1069 /// ---------------------------- 1070 bio.std.hts.bam.tagvalue.Value opIndex(string key) const { 1071 enforce(key.length == 2, "Key length must be 2"); 1072 auto __tags_chunk = _tags_chunk; // _tags_chunk is evaluated lazily 1073 if (__tags_chunk.length < 4) 1074 return Value(null); 1075 1076 size_t offset = 0; 1077 while (offset + 1 < __tags_chunk.length) { 1078 if (__tags_chunk[offset .. offset + 2] == key) { 1079 offset += 2; 1080 return readValue(offset, __tags_chunk); 1081 } else { 1082 offset += 2; 1083 skipValue(offset, __tags_chunk); 1084 } 1085 } 1086 return Value(null); 1087 } 1088 1089 /// ditto 1090 void opIndexAssign(T)(T value, string key) 1091 if (is(T == Value) || __traits(compiles, GetTypeId!T)) 1092 { 1093 static if(is(T == Value)) { 1094 enforce(key.length == 2, "Key length must be 2"); 1095 auto __tags_chunk = _tags_chunk; 1096 1097 _dup(); 1098 1099 size_t offset = 0; 1100 while (offset + 1 < __tags_chunk.length) { 1101 if (__tags_chunk[offset .. offset + 2] == key) { 1102 if (value.is_nothing) { 1103 // special case - remove tag 1104 removeValueAt(offset); 1105 } else { 1106 replaceValueAt(offset + 2, value); 1107 } 1108 return; 1109 } else { 1110 offset += 2; 1111 skipValue(offset, __tags_chunk); 1112 } 1113 } 1114 1115 if (!value.is_nothing) 1116 appendTag(key, value); 1117 } else { 1118 opIndexAssign(Value(value), key); 1119 } 1120 } 1121 1122 /// Append new tag to the end, skipping check if it already exists. $(BIGOH 1) 1123 void appendTag(string key, Value value) { 1124 auto oldlen = _chunk.length; 1125 _chunk.length = _chunk.length + sizeInBytes(value) + 2 * char.sizeof; 1126 _chunk[oldlen .. oldlen + 2] = cast(ubyte[])key; 1127 emplaceValue(_chunk.ptr + oldlen + 2, value); 1128 } 1129 1130 /// Remove all tags 1131 void clearAllTags() { 1132 _chunk.length = _tags_offset; 1133 } 1134 1135 /// Number of tags. $(BIGOH number of tags) 1136 size_t tagCount() { 1137 size_t result = 0; 1138 size_t offset = 0; 1139 auto __tags_chunk = _tags_chunk; 1140 while (offset + 1 < __tags_chunk.length) { 1141 offset += 2; 1142 skipValue(offset, __tags_chunk); 1143 result += 1; 1144 } 1145 return result; 1146 } 1147 1148 // replace existing tag 1149 private void replaceValueAt(size_t offset, Value value) { 1150 // offset points to the beginning of the value 1151 auto begin = offset; 1152 auto __tags_chunk = _tags_chunk; 1153 skipValue(offset, __tags_chunk); // now offset is updated and points to the end 1154 auto end = offset; 1155 1156 prepareSlice(_chunk, __tags_chunk[begin .. end], sizeInBytes(value)); 1157 1158 emplaceValue(_chunk.ptr + _tags_offset + begin, value); 1159 } 1160 1161 // remove existing tag 1162 private void removeValueAt(size_t begin) { 1163 // offset points to the beginning of the value 1164 auto offset = begin + 2; 1165 auto __tags_chunk = _tags_chunk; 1166 skipValue(offset, __tags_chunk); 1167 auto end = offset; 1168 // this does the job (see prepareSlice code) 1169 prepareSlice(_chunk, __tags_chunk[begin .. end], 0); 1170 } 1171 1172 /// Provides opportunity to iterate over tags. 1173 int opApply(scope int delegate(const ref string k, const ref Value v) dg) const { 1174 size_t offset = 0; 1175 auto __tags_chunk = _tags_chunk; 1176 while (offset + 1 < __tags_chunk.length) { 1177 auto key = cast(string)__tags_chunk[offset .. offset + 2]; 1178 offset += 2; 1179 auto val = readValue(offset, __tags_chunk); 1180 auto res = dg(key, val); 1181 if (res != 0) { 1182 return res; 1183 } 1184 } 1185 return 0; 1186 } 1187 1188 /// Returns the number of tags. Time complexity is $(BIGOH number of tags) 1189 size_t tagCount() const { 1190 size_t res = 0; 1191 size_t offset = 0; 1192 auto __tags_chunk = _tags_chunk; 1193 while (offset + 1 < __tags_chunk.length) { 1194 offset += 2; 1195 skipValue(offset, __tags_chunk); 1196 res += 1; 1197 } 1198 return res; 1199 } 1200 1201 private void writeTags(BamWriter writer) { 1202 if (std.system.endian == Endian.littleEndian) { 1203 writer.writeByteArray(_tags_chunk[]); 1204 } else { 1205 fixTagStorageByteOrder(); 1206 writer.writeByteArray(_tags_chunk[]); 1207 fixTagStorageByteOrder(); 1208 } 1209 } 1210 1211 // Reads value which starts from (_tags_chunk.ptr + offset) address, 1212 // and updates offset to the end of value. O(1) 1213 private Value readValue(ref size_t offset, const(ubyte)[] tags_chunk) const { 1214 char type = cast(char)tags_chunk[offset++]; 1215 return readValueFromArray(type, tags_chunk, offset); 1216 } 1217 1218 // Increases offset so that it points to the next value. O(1). 1219 private void skipValue(ref size_t offset, const(ubyte)[] tags_chunk) const { 1220 char type = cast(char)tags_chunk[offset++]; 1221 if (type == 'Z' || type == 'H') { 1222 while (tags_chunk[offset++] != 0) {} 1223 } else if (type == 'B') { 1224 char elem_type = cast(char)tags_chunk[offset++]; 1225 auto length = *(cast(uint*)(tags_chunk.ptr + offset)); 1226 offset += uint.sizeof + charToSizeof(elem_type) * length; 1227 } else { 1228 offset += charToSizeof(type); 1229 } 1230 } 1231 1232 /* 1233 Intended to be used in constructor for initial endianness fixing 1234 in case the library is used on big-endian system. 1235 1236 NOT TESTED AT ALL!!! 1237 */ 1238 private void fixTagStorageByteOrder() { 1239 /* TODO: TEST ON BIG-ENDIAN SYSTEM!!! */ 1240 const(ubyte)* p = _tags_chunk.ptr; 1241 const(ubyte)* end = p + _chunk.length; 1242 while (p < end) { 1243 p += 2; // skip tag name 1244 char type = *(cast(char*)p); 1245 ++p; // skip type 1246 if (type == 'Z' || type == 'H') { 1247 while (*p != 0) { // zero-terminated 1248 ++p; // string 1249 } 1250 ++p; // skip '\0' 1251 } else if (type == 'B') { // array 1252 char elem_type = *(cast(char*)p); 1253 uint size = charToSizeof(elem_type); 1254 switchEndianness(p, uint.sizeof); 1255 uint length = *(cast(uint*)p); 1256 p += uint.sizeof; // skip length 1257 if (size != 1) { 1258 for (auto j = 0; j < length; j++) { 1259 switchEndianness(p, size); 1260 p += size; 1261 } 1262 } else { 1263 // skip 1264 p += length; 1265 } 1266 } else { 1267 uint size = charToSizeof(type); 1268 if (size != 1) { 1269 switchEndianness(p, size); 1270 p += size; 1271 } else { 1272 ++p; 1273 } 1274 } 1275 } 1276 } 1277 } 1278 1279 unittest { 1280 1281 import std.algorithm; 1282 import std.stdio; 1283 import std.math; 1284 1285 // stderr.writeln("Testing BamRead behaviour..."); 1286 auto read = BamRead("readname", 1287 "AGCTGACTACGTAATAGCCCTA", 1288 [CigarOperation(22, 'M')]); 1289 assert(read.sequence_length == 22); 1290 assert(read.cigar.length == 1); 1291 assert(read.cigarString() == "22M"); 1292 assert(read.name == "readname"); 1293 assert(equal(read.sequence(), "AGCTGACTACGTAATAGCCCTA")); 1294 1295 read.name = "anothername"; 1296 assert(read.name == "anothername"); 1297 assert(read.cigarString() == "22M"); 1298 1299 read.base_qualities = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1300 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]; 1301 assert(reduce!"a+b"(0, read.base_qualities) == 253); 1302 1303 read["RG"] = 15; 1304 assert(read["RG"] == 15); 1305 1306 read["X1"] = [1, 2, 3, 4, 5]; 1307 assert(read["X1"] == [1, 2, 3, 4, 5]); 1308 1309 read.cigar = [CigarOperation(20, 'M'), CigarOperation(2, 'X')]; 1310 assert(read.cigarString() == "20M2X"); 1311 1312 read["RG"] = cast(float)5.6; 1313 assert(approxEqual(to!float(read["RG"]), 5.6)); 1314 1315 read.sequence = "AGCTGGCTACGTAATAGCCCT"; 1316 assert(read.sequence_length == 21); 1317 assert(read.base_qualities.length == 21); 1318 assert(read.base_qualities[20] == 255); 1319 assert(equal(read.sequence(), "AGCTGGCTACGTAATAGCCCT")); 1320 assert(retro(read.sequence)[2] == 'C'); 1321 assert(retro(read.sequence)[0] == 'T'); 1322 assert(read.sequence[4] == 'G'); 1323 assert(read.sequence[0] == 'A'); 1324 assert(equal(read.sequence[0..8], "AGCTGGCT")); 1325 assert(equal(read.sequence[3..5], "TG")); 1326 assert(equal(read.sequence[3..9][1..4], "GGC")); 1327 1328 read["X1"] = 42; 1329 assert(read["X1"] == 42); 1330 1331 assert(read.tagCount() == 2); 1332 1333 read["X1"] = null; 1334 assert(read["X1"].is_nothing); 1335 assert(read.tagCount() == 1); 1336 read.sequence = "GTAAGCTGGCACTAGCAGCCT"; 1337 read.cigar = [CigarOperation(read.sequence_length, 'M')]; 1338 read["RG"] = null; 1339 read["RG"] = "readgroup1"; 1340 assert(read.tagCount() == 1); 1341 read["RG"] = null; 1342 assert(read.tagCount() == 0); 1343 1344 read.sequence[5] = Base('N'); 1345 read.sequence[6] = Base('A'); 1346 read.sequence[7] = Base('C'); 1347 read.sequence[8] = Base('G'); 1348 read.base_qualities[5] = 42; 1349 assert(read.sequence[5 .. 9].equal("NACG")); 1350 assert(read.base_qualities[5] == 42); 1351 1352 // Test MsgPack serialization/deserialization 1353 1354 { 1355 import std.typecons; 1356 static import bio.std.hts.thirdparty.msgpack; 1357 auto packer = bio.std.hts.thirdparty.msgpack.packer(Appender!(ubyte[])()); 1358 read.toMsgpack(packer); 1359 auto data = packer.stream.data; 1360 auto rec = unpack(data).via.array; 1361 assert(rec[0].as!(ubyte[]) == "anothername"); 1362 assert(rec[5].as!(int[]) == [21]); 1363 assert(rec[6].as!(ubyte[]) == ['M']); 1364 assert(rec[10].as!(ubyte[]) == to!string(read.sequence)); 1365 } 1366 1367 read.clearAllTags(); 1368 assert(read.tagCount() == 0); 1369 } 1370 1371 /// $(P BamRead wrapper which precomputes $(D end_position) = $(D position) + $(D basesCovered()).) 1372 /// 1373 /// $(P Computation of basesCovered() takes quite a few cycles. Therefore in places where this 1374 /// property is frequently accessed, it makes sense to precompute it for later use.) 1375 /// 1376 /// $(P The idea is that this should be a drop-in replacement for BamRead in algorithms, 1377 /// as the struct uses 'alias this' construction for the wrapped read.) 1378 struct EagerBamRead(R=BamRead) { 1379 /// 1380 this(R read) { 1381 this.read = read; 1382 this.end_position = read.position + read.basesCovered(); 1383 } 1384 1385 /// 1386 R read; 1387 /// 1388 alias read this; 1389 1390 /// End position on the reference, computed as position + basesCovered(). 1391 int end_position; 1392 1393 /// 1394 EagerBamRead dup() @property const { 1395 return EagerBamRead(read.dup); 1396 } 1397 } 1398 1399 static assert(is(EagerBamRead!BamRead : BamRead)); 1400 1401 /// Checks if $(D T) behaves like $(D BamRead) 1402 template isBamRead(T) 1403 { 1404 static if (is(Unqual!T : BamRead)) 1405 enum isBamRead = true; 1406 else 1407 enum isBamRead = __traits(compiles, 1408 { 1409 T t; bool p; 1410 p = t.ref_id == 1; p = t.position == 2; p = t.bin.id == 3; 1411 p = t.mapping_quality == 4; p = t.flag == 5; p = t.sequence_length == 6; 1412 p = t.mate_ref_id == 7; p = t.mate_position == 8; p = t.template_length == 9; 1413 p = t.is_paired; p = t.proper_pair; p = t.is_unmapped; 1414 p = t.mate_is_unmapped; p = t.mate_is_reverse_strand; p = t.is_first_of_pair; 1415 p = t.is_second_of_pair; p = t.is_secondary_alignment; p = t.failed_quality_control; 1416 p = t.is_duplicate; p = t.strand == '+'; p = t.name == ""; 1417 p = t.cigar[0].type == 'M'; p = t.basesCovered() > 42; p = t.cigarString() == ""; 1418 p = t.sequence[0] == 'A'; p = t.base_qualities[0] == 0; 1419 }); 1420 } 1421 1422 // Comparison function for 'queryname' sorting order based on https://github.com/samtools/htsjdk/blob/master/src/main/java/htsjdk/samtools/SAMRecordQueryNameComparator.java 1423 bool compareReadNamesAsPicard(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1424 if (isBamRead!R1 && isBamRead!R2) 1425 { 1426 if(a1.name == a2.name) 1427 { 1428 if(a1.is_paired() || a2.is_paired()) 1429 { 1430 if(!a1.is_paired()) 1431 return false; 1432 if(!a2.is_paired()) 1433 return true; 1434 1435 if(a1.is_first_of_pair() && a2.is_second_of_pair()) 1436 return true; 1437 1438 if(a1.is_second_of_pair() && a2.is_first_of_pair()) 1439 return false; 1440 1441 } 1442 1443 if(a1.strand() != a2.strand()) 1444 { 1445 return a1.strand() == '-' ? false : true; 1446 } 1447 1448 if(a1.is_secondary_alignment() != a2.is_secondary_alignment()) 1449 { 1450 return a2.is_secondary_alignment(); 1451 } 1452 1453 if(a1.is_supplementary() != a2.is_supplementary()) 1454 { 1455 return a2.is_supplementary(); 1456 } 1457 1458 if(!a1["HI"].is_nothing) 1459 { 1460 if(a2["HI"].is_nothing) 1461 return true; 1462 1463 int i1 = to!int(a1["HI"]); 1464 int i2 = to!int(a2["HI"]); 1465 return i1 < i2; 1466 } 1467 else 1468 if(!a2["HI"].is_nothing) 1469 return false; 1470 } 1471 return a1.name < a2.name; 1472 } 1473 1474 bool compareReadNamesAsPicard(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1475 if (isBamRead!R1 && isSomeString!R2) 1476 { 1477 return a1.name < a2; 1478 } 1479 1480 bool compareReadNamesAsPicard(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1481 if (isSomeString!R1 && isBamRead!R2) 1482 { 1483 return a1 < a2.name; 1484 } 1485 1486 /// $(P Comparison function for 'queryname' sorting order 1487 /// (return whether first read is 'less' than second)) 1488 /// 1489 /// $(P This function can be called on: 1490 /// $(UL 1491 /// $(LI two reads) 1492 /// $(LI read and string in any order))) 1493 bool compareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1494 if (isBamRead!R1 && isBamRead!R2) 1495 { 1496 return a1.name < a2.name; 1497 } 1498 1499 bool compareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1500 if (isBamRead!R1 && isSomeString!R2) 1501 { 1502 return a1.name < a2; 1503 } 1504 1505 bool compareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1506 if (isSomeString!R1 && isBamRead!R2) 1507 { 1508 return a1 < a2.name; 1509 } 1510 1511 int mixedStrCompare(string a, string b) { 1512 import std.ascii : isDigit; 1513 while (!a.empty && !b.empty) { 1514 if (a.front.isDigit && b.front.isDigit) { 1515 // skip zeros 1516 int za, zb; 1517 while (!a.empty && a.front == '0') { ++za; a.popFront(); } 1518 while (!b.empty && b.front == '0') { ++zb; b.popFront(); } 1519 1520 // skip equal digits 1521 while (!a.empty && !b.empty && a.front.isDigit && a.front == b.front) { 1522 a.popFront(); 1523 b.popFront(); 1524 } 1525 1526 if (!a.empty && !b.empty && a.front.isDigit && b.front.isDigit) { 1527 // the number of leading digits in each string is non-zero 1528 size_t i = 0, maxi = min(a.length, b.length); 1529 while (i < maxi && a[i].isDigit && b[i].isDigit) ++i; 1530 if (i < a.length && a[i].isDigit) return 1; // a contains more digits 1531 if (i < b.length && b[i].isDigit) return -1; // b contains more digits 1532 // the counts are equal, compare first digits 1533 return cast(byte)a.front - cast(byte)b.front; 1534 } else if (!a.empty && a.front.isDigit) return 1; 1535 else if (!b.empty && b.front.isDigit) return -1; 1536 else if (za != zb) return za - zb; // order by the number of leading zeros 1537 } else { 1538 // lexicographical comparison for non-digits 1539 if (a.front != b.front) return cast(byte)a.front - cast(byte)b.front; 1540 a.popFront(); b.popFront(); 1541 } 1542 } 1543 return (!a.empty) ? 1 : (!b.empty) ? -1 : 0; 1544 } 1545 1546 /// $(P Comparison function for 'queryname' sorting order as in Samtools 1547 /// (returns whether first read is 'less' than second in a 'mixed' order, 1548 /// i.e. numbers inside the strings are compared by their integer value)) 1549 /// 1550 /// $(P This function can be called on: 1551 /// $(UL 1552 /// $(LI two reads) 1553 /// $(LI read and string in any order))) 1554 bool mixedCompareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1555 if (isBamRead!R1 && isBamRead!R2) 1556 { 1557 return mixedStrCompare(a1.name, a2.name) < 0; 1558 } 1559 1560 bool mixedCompareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1561 if (isBamRead!R1 && isSomeString!R2) 1562 { 1563 return mixedStrCompare(a1.name, a2) < 0; 1564 } 1565 1566 bool mixedCompareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1567 if (isSomeString!R1 && isBamRead!R2) 1568 { 1569 return mixedStrCompare(a1, a2.name) < 0; 1570 } 1571 1572 unittest { 1573 assert(mixedStrCompare("BC0123", "BC01234") < 0); 1574 assert(mixedStrCompare("BC0123", "BC0123Z") < 0); 1575 assert(mixedStrCompare("BC01234", "BC01234") == 0); 1576 assert(mixedStrCompare("BC0123DEF45", "BC01234DEF45") < 0); 1577 assert(mixedStrCompare("BC01236DEF45", "BC01234DEF45") > 0); 1578 assert(mixedStrCompare("BC012", "BC0012") < 0); 1579 assert(mixedStrCompare("BC0012DE0034", "BC0012DE34") > 0); 1580 assert(mixedStrCompare("BC12DE0034", "BC012DE34") < 0); 1581 assert(mixedStrCompare("1235", "1234") > 0); 1582 } 1583 1584 // small utility function to get the value of the HI tag and return '0' 1585 // if it is not defined 1586 private int getHI(R)(auto ref R r) 1587 if (isBamRead!R) 1588 { 1589 auto v = r["HI"]; 1590 if (v.is_nothing) 1591 return 0; 1592 return to!int(v); 1593 } 1594 1595 /// $(P Comparison function for 'queryname' sorting order setting mates 1596 /// of the same alignments adjacent with the first mate coming before 1597 /// the second mate) 1598 bool compareReadNamesAndMates(R1, R2)(const auto ref R1 r1, const auto ref R2 r2) 1599 if (isBamRead!R1 && isBamRead!R2) 1600 { 1601 if (r1.name == r2.name) { 1602 if (getHI(r1) == getHI(r2)) 1603 return r1.flag() < r2.flag(); 1604 return getHI(r1) < getHI(r2); 1605 } 1606 return r1.name < r2.name; 1607 } 1608 1609 /// $(P Comparison function for 'queryname' sorting order as in Samtools 1610 /// setting mates of the same alignments adjacent with the first mate 1611 /// coming before the second mate) 1612 bool mixedCompareReadNamesAndMates(R1, R2)(const auto ref R1 r1, const auto ref R2 r2) 1613 if (isBamRead!R1 && isBamRead!R2) 1614 { 1615 if (mixedStrCompare(r1.name, r2.name) == 0) { 1616 if (getHI(r1) == getHI(r2)) 1617 return r1.flag() < r2.flag(); 1618 return getHI(r1) < getHI(r2); 1619 } 1620 return mixedStrCompare(r1.name, r2.name) < 0; 1621 } 1622 1623 /// $(P Comparison function for 'coordinate' sorting order 1624 /// (returns whether first read is 'less' than second)) 1625 /// 1626 /// $(P This function can be called on: 1627 /// $(UL 1628 /// $(LI two reads (in this case, reference IDs are also taken into account)) 1629 /// $(LI read and integer in any order))) 1630 1631 /// This function takes read direction into account (used for original samtools style sorting) 1632 bool compareCoordinatesAndStrand(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1633 if (isBamRead!R1 && isBamRead!R2) 1634 { 1635 if (a1.ref_id == -1) return false; // unmapped reads should be last 1636 if (a2.ref_id == -1) return true; 1637 if (a1.ref_id < a2.ref_id) return true; 1638 if (a1.ref_id > a2.ref_id) return false; 1639 if (a1.position < a2.position) return true; 1640 if (a1.position > a2.position) return false; 1641 return !a1.is_reverse_strand && a2.is_reverse_strand; 1642 } 1643 1644 bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1645 if (isBamRead!R1 && isBamRead!R2) 1646 { 1647 if (a1.ref_id == -1) return false; // unmapped reads should be last 1648 if (a2.ref_id == -1) return true; 1649 if (a1.ref_id < a2.ref_id) return true; 1650 if (a1.ref_id > a2.ref_id) return false; 1651 return (a1.position < a2.position); 1652 } 1653 1654 bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1655 if (isBamRead!R1 && isIntegral!R2) 1656 { 1657 return a1.position < a2; 1658 } 1659 1660 bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2) 1661 if (isIntegral!R1 && isBamRead!R2) 1662 { 1663 return a1 < a2.position; 1664 } 1665 1666 static assert(isTwoWayCompatible!(compareReadNames, BamRead, string)); 1667 static assert(isTwoWayCompatible!(compareCoordinates, BamRead, int)); 1668 1669 /// Allows modification of the read in-place even if it's slice-backed. 1670 struct UniqueRead(R) { 1671 R read; 1672 alias read this; 1673 1674 this(R read) { 1675 this.read = read; 1676 this.read._modify_in_place = true; 1677 } 1678 1679 ~this() { 1680 this.read._modify_in_place = false; 1681 } 1682 } 1683 1684 /// ditto 1685 auto assumeUnique(R)(auto ref R read) if (isBamRead!R) { 1686 return UniqueRead!R(read); 1687 }