bio.std.hts.bam.read source code

1 /*
2     This file is part of BioD.
3     Copyright (C) 2012-2016    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11 
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14 
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 /// $(P $(D BamRead) type provides convenient interface for working with SAM/BAM records.)
25 ///
26 /// $(P All flags, tags, and fields can be accessed and modified.)
27 ///
28 /// Examples:
29 /// ---------------------------
30 /// import std.conv;
31 /// ...
32 /// assert(!read.is_unmapped);              // check flag
33 /// assert(read.ref_id != -1);              // access field
34 ///
35 /// int edit_distance = to!int(read["NM"]); // access tag
36 /// read["NM"] = 0;                         // modify tag
37 /// read["NM"] = null;                      // remove tag
38 /// read["NM"] = null;                      // no-op
39 ///
40 /// foreach (tag, value; read)              // iterate over tags
41 ///     writeln(tag, " ", value);           // and print their keys and values
42 ///
43 /// read.sequence = "AGCAGACTACGTGTGCATAC"; // sets base qualities to 255
44 /// assert(read.base_qualities[0] == 255);
45 /// read.is_unmapped = true;                // set flag
46 /// read.ref_id = -1;                       // set field
47 /// ---------------------------
48 module bio.std.hts.bam.read;
49 
50 import bio.core.base;
51 import bio.core.utils.format;
52 
53 import bio.std.hts.bam.abstractreader;
54 import bio.std.hts.bam.cigar;
55 import bio.std.hts.bam.writer;
56 import bio.std.hts.bam.tagvalue;
57 import bio.std.hts.bam.bai.bin;
58 
59 import bio.std.hts.bam.md.core;
60 
61 import bio.std.hts.utils.array;
62 import bio.std.hts.utils.value;
63 import bio.core.utils.switchendianness;
64 
65 import bio.std.hts.thirdparty.msgpack : Packer, unpack;
66 
67 import std.algorithm;
68 import std.range;
69 import std.conv;
70 import std.format;
71 import std.exception;
72 import std.system;
73 import std.traits;
74 import std.array;
75 import std.bitmanip;
76 import core.stdc.stdlib;
77 
78 /**
79   BAM record representation.
80 */
81 struct BamRead {
82 
83     mixin TagStorage;
84 
85     /// Reference index in BAM file header
86     @property    int ref_id()           const nothrow { return _refID; }
87     /// ditto
88     @property   void ref_id(int n)                    { _dup(); _refID = n; }
89 
90     /// Reference sequence name ('*' for unmapped reads)
91     @property string ref_name()         const nothrow { return _ref_id_to_string(ref_id); }
92 
93     /// 0-based leftmost coordinate of the first matching base
94     @property    int position()         const nothrow { return _pos; }
95     /// ditto
96     @property   void position(int n)                  { _dup(); _pos = n; _recalculate_bin(); }
97 
98     /// Indexing bin which this read belongs to. Recalculated when position is changed.
99     @property    bio.std.hts.bam.bai.bin.Bin bin()              const nothrow { return Bin(_bin); }
100 
101     /// Mapping quality. Equals to 255 if not available, otherwise
102     /// equals to rounded -10 * log10(P {mapping position is wrong}).
103     @property  ubyte mapping_quality()  const nothrow { return _mapq; }
104     /// ditto
105     @property   void mapping_quality(ubyte n)         { _dup(); _mapq = n; }
106 
107     /// Flag bits (should be used on very rare occasions, see flag getters/setters below)
108     @property ushort flag()             const nothrow { return _flag; }
109     /// ditto
110     @property   void flag(ushort n)                   { _dup(); _flag = n; }
111 
112     /// Sequence length. In fact, sequence.length can be used instead, but that might be
113     /// slower if the compiler is not smart enough to optimize away unrelated stuff.
114     @property    int sequence_length()  const nothrow { return _l_seq; }
115 
116     /// Mate reference ID
117     @property    int mate_ref_id()      const nothrow { return _next_refID; }
118     /// ditto
119     @property   void mate_ref_id(int n)               { _dup(); _next_refID = n; }
120 
121     /// Mate reference sequence name ('*' for unmapped mates)
122     @property string mate_ref_name()    const nothrow { return _ref_id_to_string(_next_refID); }
123 
124     /// Mate position
125     @property    int mate_position()    const nothrow { return _next_pos; }
126     /// ditto
127     @property   void mate_position(int n)             { _dup(); _next_pos = n; }
128 
129     /// Template length
130     @property    int template_length()  const nothrow { return _tlen; }
131     /// ditto
132     @property   void template_length(int n)           { _dup(); _tlen = n; }
133 
134     // ------------------------ FLAG GETTERS/SETTERS -------------------------------------- //
135 
136     /// Template having multiple segments in sequencing
137     @property bool is_paired()                const nothrow { return cast(bool)(flag & 0x1); }
138     /// ditto
139     @property void is_paired(bool b)                { _setFlag( 0, b); }
140 
141     /// Each segment properly aligned according to the aligner
142     @property bool proper_pair()              const nothrow { return cast(bool)(flag & 0x2); }
143     /// ditto
144     @property void proper_pair(bool b)              { _setFlag( 1, b); }
145 
146     /// Segment unmapped
147     @property bool is_unmapped()              const nothrow { return cast(bool)(flag & 0x4); }
148     /// ditto
149     @property void is_unmapped(bool b)              { _setFlag( 2, b); }
150 
151     /// Next segment in the template unmapped
152     @property bool mate_is_unmapped()         const nothrow { return cast(bool)(flag & 0x8); }
153     /// ditto
154     @property void mate_is_unmapped(bool b)         { _setFlag( 3, b); }
155 
156     /// Sequence being reverse complemented
157     @property bool is_reverse_strand()        const nothrow { return cast(bool)(flag & 0x10); }
158     /// ditto
159     @property void is_reverse_strand(bool b)        { _setFlag( 4, b); }
160 
161     /// Sequence of the next segment in the template being reversed
162     @property bool mate_is_reverse_strand()   const nothrow { return cast(bool)(flag & 0x20); }
163     /// ditto
164     @property void mate_is_reverse_strand(bool b)   { _setFlag( 5, b); }
165 
166     /// The first segment in the template
167     @property bool is_first_of_pair()         const nothrow { return cast(bool)(flag & 0x40); }
168     /// ditto
169     @property void is_first_of_pair(bool b)         { _setFlag( 6, b); }
170 
171     /// The last segment in the template
172     @property bool is_second_of_pair()        const nothrow { return cast(bool)(flag & 0x80); }
173     /// ditto
174     @property void is_second_of_pair(bool b)        { _setFlag( 7, b); }
175 
176     /// Secondary alignment
177     @property bool is_secondary_alignment()   const nothrow { return cast(bool)(flag & 0x100); }
178     /// ditto
179     @property void is_secondary_alignment(bool b)   { _setFlag( 8, b); }
180 
181     /// Not passing quality controls
182     @property bool failed_quality_control()   const nothrow { return cast(bool)(flag & 0x200); }
183     /// ditto
184     @property void failed_quality_control(bool b)   { _setFlag( 9, b); }
185 
186     /// PCR or optical duplicate
187     @property bool is_duplicate()             const nothrow { return cast(bool)(flag & 0x400); }
188     /// ditto
189     @property void is_duplicate(bool b)             { _setFlag(10, b); }
190 
191     /// Supplementary alignment
192     @property bool is_supplementary()         const nothrow { return cast(bool)(flag & 0x800); }
193     /// ditto
194     @property void is_supplementary(bool b)         { _setFlag(11, b); }
195 
196     /// Convenience function, returns '+' or '-' indicating the strand.
197     @property char strand() const nothrow {
198         return is_reverse_strand ? '-' : '+';
199     }
200 
201     /// ditto
202     @property void strand(char c) {
203         enforce(c == '-' || c == '+', "Strand must be '-' or '+'");
204         is_reverse_strand = c == '-';
205     }
206 
207     /// Read name, length must be in 1..255 interval.
208     @property string name() const nothrow {
209         // notice -1: the string is zero-terminated, so we should strip that '\0'
210         return cast(string)(_chunk[_read_name_offset .. _read_name_offset + _l_read_name - 1]);
211     }
212 
213     /// ditto
214     @property void name(string new_name) {
215         enforce(new_name.length >= 1 && new_name.length <= 255,
216                 "name length must be in 1-255 range");
217         _dup();
218         bio.std.hts.utils.array.replaceSlice(_chunk,
219                  _chunk[_read_name_offset .. _read_name_offset + _l_read_name - 1],
220                  cast(ubyte[])new_name);
221         _l_read_name = cast(ubyte)(new_name.length + 1);
222     }
223 
224     /// List of CIGAR operations
225     @property const(CigarOperation)[] cigar() const nothrow {
226         return cast(const(CigarOperation)[])(_chunk[_cigar_offset .. _cigar_offset +
227                                              _n_cigar_op * CigarOperation.sizeof]);
228     }
229 
230     /// ditto
231     @property void cigar(const(CigarOperation)[] c) {
232         enforce(c.length < 65536, "Too many CIGAR operations, must be <= 65535");
233         _dup();
234         bio.std.hts.utils.array.replaceSlice(_chunk,
235              _chunk[_cigar_offset .. _cigar_offset + _n_cigar_op * CigarOperation.sizeof],
236              cast(ubyte[])c);
237 
238         _n_cigar_op = cast(ushort)(c.length);
239 
240         _recalculate_bin();
241     }
242 
243     /// Extended CIGAR where M operators are replaced with =/X based
244     /// on information from MD tag. Throws if the read doesn't have MD
245     /// tag.
246     auto extended_cigar() @property const {
247         Value md = this["MD"];
248         enforce(md.is_string);
249         return makeExtendedCigar(cigar, mdOperations(*cast(string*)(&md)));
250     }
251 
252     /// The number of reference bases covered by this read.
253     /// $(BR)
254     /// Returns 0 if the read is unmapped.
255     int basesCovered() const {
256 
257         if (this.is_unmapped) {
258             return 0; // actually, valid alignments should have empty cigar string
259         }
260 
261         return reduce!"a + b.length"(0, filter!"a.is_reference_consuming"(cigar));
262     }
263 
264     /// Human-readable representation of CIGAR string (same as in SAM format)
265     string cigarString() const {
266         char[] str;
267 
268         // guess size of resulting string
269         str.reserve(_n_cigar_op * 3);
270 
271         foreach (cigar_op; cigar) {
272             str ~= to!string(cigar_op.length);
273             str ~= cigar_op.type;
274         }
275         return cast(string)str;
276     }
277 
278     private @property inout(ubyte)[] raw_sequence_data() inout nothrow {
279         return _chunk[_seq_offset .. _seq_offset + (_l_seq + 1) / 2];
280     }
281 
282     /// Read-only random-access range for access to sequence data.
283     static struct SequenceResult {
284 
285         private size_t _index;
286         private ubyte[] _data = void;
287         private size_t _len = void;
288         private bool _use_first_4_bits = void;
289 
290         this(const(ubyte[]) data, size_t len, bool use_first_4_bits=true) {
291             _data = cast(ubyte[])data;
292             _len = len;
293             _use_first_4_bits = use_first_4_bits;
294         }
295 
296         ///
297         @property bool empty() const {
298             return _index >= _len;
299         }
300 
301         ///
302         @property bio.core.base.Base front() const {
303             return opIndex(0);
304         }
305 
306         ///
307         @property bio.core.base.Base back() const {
308             return opIndex(_len - 1);
309         }
310 
311         /*
312         I have no fucking idea why this tiny piece of code
313         does NOT get inlined by stupid DMD compiler.
314 
315         Therefore I use string mixin instead.
316         (hell yeah! Back to the 90s! C macros rulez!)
317 
318         private size_t _getActualPosition(size_t index) const
319         {
320             if (_use_first_4_bits) {
321                 // [0 1] [2 3] [4 5] [6 7] ...
322                 //            |
323                 //            V
324                 //   0     1     2     3
325                 return index >> 1;
326             } else {
327                 // [. 0] [1 2] [3 4] [5 6] ...
328                 //            |
329                 //            V
330                 //   0     1     2     3
331                 return (index >> 1) + (index & 1);
332             }
333         }*/
334 
335         private static string _getActualPosition(string index) {
336             return "((" ~ index ~") >> 1) + " ~
337                    "(_use_first_4_bits ? 0 : ((" ~ index ~ ") & 1))";
338         }
339 
340         private bool _useFirst4Bits(size_t index) const
341         {
342             auto res = index % 2 == 0;
343             if (!_use_first_4_bits) {
344                 res = !res;
345             }
346             return res;
347         }
348 
349         ///
350         @property SequenceResult save() const {
351             return SequenceResult(_data[mixin(_getActualPosition("_index")) .. $],
352                                   _len - _index,
353                                   _useFirst4Bits(_index));
354         }
355 
356         ///
357         SequenceResult opSlice(size_t i, size_t j) const {
358             return SequenceResult(_data[mixin(_getActualPosition("_index + i")) .. $],
359                                   j - i,
360                                   _useFirst4Bits(_index + i));
361         }
362 
363         ///
364         @property bio.core.base.Base opIndex(size_t i) const {
365             auto pos = _index + i;
366 
367             if (_use_first_4_bits)
368             {
369                 if (pos & 1)
370                     return Base.fromInternalCode(_data[pos >> 1] & 0xF);
371                 else
372                     return Base.fromInternalCode(_data[pos >> 1] >> 4);
373             }
374             else
375             {
376                 if (pos & 1)
377                     return Base.fromInternalCode(_data[(pos >> 1) + 1] >> 4);
378                 else
379                     return Base.fromInternalCode(_data[pos >> 1] & 0xF);
380             }
381 
382             assert(false);
383         }
384 
385         /// ditto
386         @property void opIndexAssign(bio.core.base.Base base, size_t i) {
387             auto pos = _index + i;
388 
389             if (_use_first_4_bits)
390             {
391                 if (pos & 1)
392                     _data[pos >> 1] &= 0xF0, _data[pos >> 1] |= base.internal_code;
393                 else
394                     _data[pos >> 1] &= 0x0F, _data[pos >> 1] |= base.internal_code << 4;
395             }
396             else
397             {
398                 if (pos & 1)
399                     _data[(pos >> 1) + 1] &= 0x0F, _data[(pos >> 1) + 1] |= base.internal_code << 4;
400                 else
401                     _data[pos >> 1] &= 0xF0, _data[pos >> 1] |= base.internal_code;
402             }
403         }
404 
405 
406         ///
407         void popFront() {
408             ++_index;
409         }
410 
411         ///
412         void popBack() {
413             --_len;
414         }
415 
416         ///
417         @property size_t length() const {
418             return _len - _index;
419         }
420 
421         alias length opDollar;
422 
423         void toString(scope void delegate(const(char)[]) dg) const {
424             char[256] buf = void;
425             size_t total = this.length;
426             size_t written = 0;
427             while (written < total) {
428                 size_t n = min(buf.length, total - written);
429                 foreach (j; 0 .. n)
430                     buf[j] = opIndex(written + j).asCharacter;
431                 dg(buf[0 .. n]);
432                 written += n;
433             }
434         }
435     }
436 
437     /// Random-access range of characters
438     @property SequenceResult sequence() const {
439         return SequenceResult(raw_sequence_data, sequence_length);
440     }
441 
442     static assert(isRandomAccessRange!(ReturnType!sequence));
443 
444     /// Sets query sequence. Sets all base qualities to 255 (i.e. unknown).
445     @property void sequence(string seq)
446     {
447         _dup();
448 
449         auto raw_length = (seq.length + 1) / 2;
450         // set sequence
451         auto replacement = uninitializedArray!(ubyte[])(raw_length + seq.length);
452         replacement[raw_length .. $] = 0xFF;
453         for (size_t i = 0; i < raw_length; ++i) {
454             replacement[i] = cast(ubyte)(Base(seq[2 * i]).internal_code << 4);
455 
456             if (seq.length > 2 * i + 1)
457                 replacement[i] |= cast(ubyte)(Base(seq[2 * i + 1]).internal_code);
458         }
459 
460         bio.std.hts.utils.array.replaceSlice(_chunk,
461                      _chunk[_seq_offset .. _tags_offset],
462                      replacement);
463 
464         _l_seq = cast(int)seq.length;
465     }
466 
467     /// Quality data (phred-based scores)
468     @property inout(ubyte)[] base_qualities() inout nothrow {
469         return _chunk[_qual_offset .. _qual_offset + _l_seq * char.sizeof];
470     }
471 
472     /// Set quality data - array length must be of the same length as the sequence.
473     @property void base_qualities(const(ubyte)[] quality) {
474         enforce(quality.length == _l_seq, "Quality data must be of the same length as sequence");
475         _dup();
476         _chunk[_qual_offset .. _qual_offset + _l_seq] = quality;
477     }
478 
479     /*
480       Constructs the struct from memory chunk
481       */
482     this(ubyte[] chunk, bool fix_byte_order=true) {
483 
484         // Switching endianness lazily is not a good idea:
485         //
486         // 1) switching byte order is pretty fast
487         // 2) lazy switching for arrays can kill the performance,
488         //    it has to be done once
489         // 3) the code will be too complicated, whereas there're
490         //    not so many users of big-endian systems
491         //
492         // In summa, BAM is little-endian format, so big-endian
493         // users will suffer anyway, it's unavoidable.
494 
495         _chunk = chunk;
496         this._is_slice = true;
497 
498         if (fix_byte_order && std.system.endian != Endian.littleEndian) {
499             switchChunkEndianness();
500 
501             // Dealing with tags is the responsibility of TagStorage.
502             fixTagStorageByteOrder();
503         }
504     }
505 
506     // Doesn't touch tags, only fields.
507     // @@@TODO: NEEDS TESTING@@@
508     private void switchChunkEndianness() {
509         // First 8 fields are 32-bit integers:
510         //
511         // 0) refID                int
512         // 1) pos                  int
513         // 2) bin_mq_nl           uint
514         // 3) flag_nc             uint
515         // 4) l_seq                int
516         // 5) next_refID           int
517         // 6) next_pos             int
518         // 7) tlen                 int
519         // ----------------------------------------------------
520         // (after them name follows which is string)
521         //
522         switchEndianness(_chunk.ptr, 8 * uint.sizeof);
523 
524         // Then we need to switch endianness of CIGAR data:
525         switchEndianness(_chunk.ptr + _cigar_offset,
526                          _n_cigar_op * uint.sizeof);
527     }
528 
529     private size_t calculateChunkSize(string read_name,
530                                       string sequence,
531                                       in CigarOperation[] cigar)
532     {
533         return 8 * int.sizeof
534                  + (read_name.length + 1) // tailing '\0'
535                  + uint.sizeof * cigar.length
536                  + ubyte.sizeof * ((sequence.length + 1) / 2)
537                  + ubyte.sizeof * sequence.length;
538     }
539 
540     /// Construct alignment from basic information about it.
541     ///
542     /// Other fields can be set afterwards.
543     this(string read_name,                          // info for developers:
544          string sequence,                           // these 3 fields are needed
545          in CigarOperation[] cigar)                 // to calculate size of _chunk
546     {
547         enforce(read_name.length < 256, "Too long read name, length must be <= 255");
548         enforce(cigar.length < 65536, "Too many CIGAR operations, must be <= 65535");
549 
550         if (this._chunk is null) {
551             this._chunk = new ubyte[calculateChunkSize(read_name, sequence, cigar)];
552         }
553 
554         this._refID      =  -1;         // set default values
555         this._pos        =  -1;         // according to SAM/BAM
556         this._mapq       = 255;         // specification
557         this._next_refID =  -1;
558         this._next_pos   =  -1;
559         this._tlen       =   0;
560 
561         this._l_read_name = cast(ubyte)(read_name.length + 1); // tailing '\0'
562         this._n_cigar_op  = cast(ushort)(cigar.length);
563         this._l_seq       = cast(int)(sequence.length);
564 
565         // now all offsets can be calculated through corresponding properties
566 
567         // set default quality
568         _chunk[_qual_offset .. _qual_offset + sequence.length] = 0xFF;
569 
570         // set CIGAR data
571         auto _len = cigar.length * CigarOperation.sizeof;
572         _chunk[_cigar_offset .. _cigar_offset + _len] = cast(ubyte[])(cigar);
573 
574         // set read_name
575         auto _offset = _read_name_offset;
576         _chunk[_offset .. _offset + read_name.length] = cast(ubyte[])read_name;
577         _chunk[_offset + read_name.length] = cast(ubyte)'\0';
578 
579         this._is_slice = false;
580 
581         this.sequence = sequence;
582     }
583 
584     /// Deep copy of the record.
585     BamRead dup() @property const {
586         BamRead result;
587         result._chunk = this._chunk.dup;
588         result._is_slice = false;
589         result._modify_in_place = false;
590         result._reader = cast()_reader;
591         return result;
592     }
593 
594     /// Compare two alignments, including tags
595     /// (the tags must follow in the same order for equality).
596     bool opEquals(BamRead other) const pure nothrow {
597         // don't forget about _is_slice trick
598         auto m = _cigar_offset;
599         return _chunk[0 .. m - 1] == other._chunk[0 .. m - 1] &&
600                _chunk[m .. $] == other._chunk[m .. $];
601     }
602 
603     bool opEquals(const(BamRead) other) const pure nothrow {
604         return opEquals(cast()other);
605     }
606 
607     /// Size of the alignment record when output to stream in BAM format.
608     /// Includes block_size as well (see SAM/BAM specification)
609     @property size_t size_in_bytes() const {
610         return int.sizeof + _chunk.length;
611     }
612 
613     package void write(BamWriter writer) {
614         writer.writeInteger(cast(int)(_chunk.length));
615 
616         ubyte old_byte = _chunk[_cigar_offset - 1];
617         _chunk[_cigar_offset - 1] = 0;
618 
619         if (std.system.endian != Endian.littleEndian) {
620             switchChunkEndianness();
621             writer.writeByteArray(_chunk[0 .. _tags_offset]);
622             switchChunkEndianness();
623         } else {
624             writer.writeByteArray(_chunk[0 .. _tags_offset]);
625         }
626 
627         _chunk[_cigar_offset - 1] = old_byte;
628 
629         writeTags(writer);
630     }
631 
632     /// Packs message in the following format:
633     /// $(BR)
634     /// MsgPack array with elements
635     ///   $(OL
636     ///     $(LI name - string)
637     ///     $(LI flag - ushort)
638     ///     $(LI reference sequence id - int)
639     ///     $(LI leftmost mapping position (1-based) - int)
640     ///     $(LI mapping quality - ubyte)
641     ///     $(LI array of CIGAR operation lengths - int[])
642     ///     $(LI array of CIGAR operation types - ubyte[])
643     ///     $(LI mate reference sequence id - int)
644     ///     $(LI mate position (1-based) - int)
645     ///     $(LI template length - int)
646     ///     $(LI segment sequence - string)
647     ///     $(LI phred-base quality - ubyte[])
648     ///     $(LI tags - map: string -> value))
649     void toMsgpack(Packer)(ref Packer packer) const {
650         packer.beginArray(13);
651         packer.pack(cast(ubyte[])name);
652         packer.pack(flag);
653         packer.pack(ref_id);
654         packer.pack(position + 1);
655         packer.pack(mapping_quality);
656         packer.pack(array(map!"a.length"(cigar)));
657         packer.pack(array(map!"a.type"(cigar)));
658         packer.pack(mate_ref_id);
659         packer.pack(mate_position);
660         packer.pack(template_length);
661         packer.pack(to!string(sequence));
662         packer.pack(base_qualities);
663 
664         packer.beginMap(tagCount());
665         foreach (key, value; this) {
666             packer.pack(key);
667             packer.pack(value);
668         }
669     }
670 
671     /// String representation.
672     /// $(BR)
673     /// Possible formats are SAM ("%s") and JSON ("%j")
674     void toString(scope void delegate(const(char)[]) sink, FormatSpec!char fmt) const {
675         if (size_in_bytes < 10000 && fmt.spec == 's') {
676             auto p = cast(char*)alloca(size_in_bytes * 5);
677             char* end = p;
678             toSam(end);
679             sink(p[0 .. end - p]);
680         } else if (size_in_bytes < 5000 && fmt.spec == 'j') {
681             auto p = cast(char*)alloca(size_in_bytes * 10 + 1000);
682             char* end = p;
683             toJson(end);
684             sink(p[0 .. end - p]);
685         } else if (fmt.spec == 's') {
686             toSam(sink);
687         } else if (fmt.spec == 'j') {
688             toJson(sink);
689         } else {
690             throw new FormatException("unknown format specifier");
691         }
692     }
693 
694     /// ditto
695     void toSam(Sink)(auto ref Sink sink) const
696         if (isSomeSink!Sink)
697     {
698         sink.write(name);
699         sink.write('\t');
700         sink.write(flag);
701         sink.write('\t');
702         if (ref_id == -1 || _reader is null)
703             sink.write('*');
704         else
705             sink.write(_reader.reference_sequences[ref_id].name);
706 
707         sink.write('\t');
708         sink.write(position + 1);
709         sink.write('\t');
710         sink.write(mapping_quality);
711         sink.write('\t');
712 
713         if (cigar.length == 0)
714             sink.write('*');
715         else
716             foreach (op; cigar)
717                 op.toSam(sink);
718 
719         sink.write('\t');
720 
721         if (mate_ref_id == ref_id) {
722             if (mate_ref_id == -1)
723                 sink.write("*\t");
724             else
725                 sink.write("=\t");
726         } else {
727             if (mate_ref_id == -1 || _reader is null) {
728                 sink.write("*\t");
729             } else {
730                 auto mate_name = _reader.reference_sequences[mate_ref_id].name;
731                 sink.write(mate_name);
732                 sink.write("\t");
733             }
734         }
735 
736         sink.write(mate_position + 1);
737         sink.write('\t');
738         sink.write(template_length);
739         sink.write('\t');
740 
741         if (sequence_length == 0)
742             sink.write('*');
743         else
744             foreach (char c; sequence)
745                 sink.write(c);
746         sink.write('\t');
747 
748         if (base_qualities.length == 0 || base_qualities[0] == 0xFF)
749             sink.write('*');
750         else
751             foreach (qual; base_qualities)
752                 sink.write(cast(char)(qual + 33));
753 
754         foreach (k, v; this) {
755             sink.write('\t');
756             sink.write(k);
757             sink.write(':');
758             v.toSam(sink);
759         }
760     }
761 
762     /// ditto
763     string toSam()() const {
764         return to!string(this);
765     }
766 
767     /// JSON representation
768     void toJson(Sink)(auto ref Sink sink) const
769         if (isSomeSink!Sink)
770     {
771         sink.write(`{"qname":`); sink.writeJson(name);
772         sink.write(`,"flag":`); sink.write(flag);
773 
774         sink.write(`,"rname":`);
775         if (ref_id == -1 || _reader is null)
776             sink.write(`"*"`);
777         else
778             sink.writeJson(_reader.reference_sequences[ref_id].name);
779 
780         sink.write(`,"pos":`); sink.write(position + 1);
781         sink.write(`,"mapq":`); sink.write(mapping_quality);
782 
783         sink.write(`,"cigar":"`);
784         if (cigar.empty)
785             sink.write('*');
786         else
787             foreach (op; cigar)
788                 op.toSam(sink);
789         sink.write('"');
790 
791         sink.write(`,"rnext":`);
792         if (mate_ref_id == ref_id) {
793             if (mate_ref_id == -1)
794                 sink.write(`"*"`);
795             else
796                 sink.write(`"="`);
797         } else if (mate_ref_id == -1 || _reader is null) {
798             sink.write(`"*"`);
799         } else {
800             sink.writeJson(_reader.reference_sequences[mate_ref_id].name);
801         }
802 
803         sink.write(`,"pnext":`); sink.write(mate_position + 1);
804         sink.write(`,"tlen":`); sink.write(template_length);
805 
806         sink.write(`,"seq":"`);
807         if (sequence_length == 0)
808             sink.write('*');
809         else
810             foreach (char c; sequence)
811                 sink.write(c);
812         sink.write('"');
813 
814         sink.write(`,"qual":`);
815         sink.writeJson(base_qualities);
816 
817         sink.write(`,"tags":{`);
818 
819         bool not_first = false;
820         foreach (k, v; this) {
821             if (not_first)
822                 sink.write(',');
823             sink.writeJson(k);
824             sink.write(':');
825             v.toJson(sink);
826             not_first = true;
827         }
828 
829         sink.write("}}");
830     }
831 
832     /// ditto
833     string toJson()() const {
834         auto w = appender!(char[])();
835         toJson((const(char)[] s) { w.put(s); });
836         return cast(string)w.data;
837     }
838 
839     /// Associates read with BAM reader. This is done automatically
840     /// if this read is obtained through BamReader/Reference methods.
841     void associateWithReader(bio.std.hts.bam.abstractreader.IBamSamReader reader) {
842         _reader = reader;
843     }
844 
845     /// Associated BAM/SAM reader.
846     inout(bio.std.hts.bam.abstractreader.IBamSamReader) reader() @property inout {
847         return _reader;
848     }
849 
850     ///
851     bool is_slice_backed() @property const {
852         return _is_slice;
853     }
854 
855     /// Raw representation of the read. Occasionally useful for dirty hacks!
856     inout(ubyte)[] raw_data() @property inout {
857         return _chunk;
858     }
859 
860     /// ditto
861     void raw_data(ubyte[] data) @property {
862         _chunk = data;
863     }
864 
865     package ubyte[] _chunk; // holds all the data,
866                     // the access is organized via properties
867                     // (see below)
868 
869 private:
870 
871     // by specs, name ends with '\0'
872     // let's use this byte for something useful!
873     //
874     // (Of course this places some restrictions on usage,
875     //  but allows to reduce size of record.)
876     bool _is_slice() @property const {
877         return cast(bool)(_chunk[_cigar_offset - 1] & 1);
878     }
879 
880     void _is_slice(bool is_slice) @property {
881         _chunk[_cigar_offset - 1] &= 0b11111110;
882         _chunk[_cigar_offset - 1] |= (is_slice ? 1 : 0);
883     }
884 
885     // don't call _dup() if the record is modified
886     bool _modify_in_place() @property const {
887         return cast(bool)(_chunk[_cigar_offset - 1] & 2);
888     }
889 
890     void _modify_in_place(bool in_place) @property {
891         _chunk[_cigar_offset - 1] &= 0b11111101;
892         _chunk[_cigar_offset - 1] |= (in_place ? 2 : 0);
893     }
894 
895     IBamSamReader _reader;
896 
897     string _ref_id_to_string(int ref_id) const nothrow {
898         if (_reader is null)
899             return "?";
900         if (ref_id < 0)
901             return "*";
902         return _reader.reference_sequences[ref_id].name;
903     }
904 
905     // Official field names from SAM/BAM specification.
906     // For internal use only
907     @property  int _refID()      const nothrow {
908         return *(cast( int*)(_chunk.ptr + int.sizeof * 0));
909     }
910 
911     @property  int _pos()        const nothrow {
912         return *(cast( int*)(_chunk.ptr + int.sizeof * 1));
913     }
914 
915     @property uint _bin_mq_nl()  const nothrow pure @system {
916         return *(cast(uint*)(_chunk.ptr + int.sizeof * 2));
917     }
918 
919     @property uint _flag_nc()    const nothrow {
920         return *(cast(uint*)(_chunk.ptr + int.sizeof * 3));
921     }
922 
923     @property  int _l_seq()      const nothrow {
924         return *(cast( int*)(_chunk.ptr + int.sizeof * 4));
925     }
926 
927     @property  int _next_refID() const nothrow {
928         return *(cast( int*)(_chunk.ptr + int.sizeof * 5));
929     }
930 
931     @property  int _next_pos()   const nothrow {
932         return *(cast( int*)(_chunk.ptr + int.sizeof * 6));
933     }
934 
935     @property  int _tlen()       const nothrow {
936         return *(cast( int*)(_chunk.ptr + int.sizeof * 7));
937     }
938 
939     // Setters, also only for internal use
940     @property void _refID(int n)       { *(cast( int*)(_chunk.ptr + int.sizeof * 0)) = n; }
941     @property void _pos(int n)         { *(cast( int*)(_chunk.ptr + int.sizeof * 1)) = n; }
942     @property void _bin_mq_nl(uint n)  { *(cast(uint*)(_chunk.ptr + int.sizeof * 2)) = n; }
943     @property void _flag_nc(uint n)    { *(cast(uint*)(_chunk.ptr + int.sizeof * 3)) = n; }
944     @property void _l_seq(int n)       { *(cast( int*)(_chunk.ptr + int.sizeof * 4)) = n; }
945     @property void _next_refID(int n)  { *(cast( int*)(_chunk.ptr + int.sizeof * 5)) = n; }
946     @property void _next_pos(int n)    { *(cast( int*)(_chunk.ptr + int.sizeof * 6)) = n; }
947     @property void _tlen(int n)        { *(cast( int*)(_chunk.ptr + int.sizeof * 7)) = n; }
948 
949     // Additional useful properties, also from SAM/BAM specification
950     //
951     //             The layout of bin_mq_nl and flag_nc is as follows
952     //                     (upper bits -------> lower bits):
953     //
954     // bin_mq_nl [ { bin (16b) }  { mapping quality (8b) } { read name length (8b) } ]
955     //
956     // flag_nc   [ { flag (16b) } { n_cigar_op (16b) } ]
957     //
958     @property ushort _bin()         const nothrow {
959         return _bin_mq_nl >> 16;
960     }
961     @property  ubyte _mapq()        const nothrow {
962         return (_bin_mq_nl >> 8) & 0xFF;
963     }
964     @property  ubyte _l_read_name() const nothrow pure {
965         return _bin_mq_nl & 0xFF;
966     }
967     @property ushort _flag()        const nothrow {
968         return _flag_nc >> 16;
969     }
970     @property ushort _n_cigar_op()  const nothrow {
971         return _flag_nc & 0xFFFF;
972     }
973 
974     // Setters for those properties
975     @property void _bin(ushort n)         { _bin_mq_nl = (_bin_mq_nl &  0xFFFF) | (n << 16); }
976     @property void _mapq(ubyte n)         { _bin_mq_nl = (_bin_mq_nl & ~0xFF00) | (n << 8); }
977     @property void _l_read_name(ubyte n)  { _bin_mq_nl = (_bin_mq_nl & ~0xFF  ) | n; }
978     @property void _flag(ushort n)        { _flag_nc   = (_flag_nc   &  0xFFFF) | (n << 16); }
979     @property void _n_cigar_op(ushort n)  { _flag_nc   = (_flag_nc   & ~0xFFFF) | n; }
980 
981     // Offsets of various arrays in bytes.
982     // Currently, are computed each time, so if speed will be an issue,
983     // they can be made fields instead of properties.
984     @property size_t _read_name_offset() const nothrow pure {
985         return 8 * int.sizeof;
986     }
987 
988     @property size_t _cigar_offset()     const nothrow pure {
989         return _read_name_offset + _l_read_name * char.sizeof;
990     }
991 
992     @property size_t _seq_offset()       const nothrow {
993         return _cigar_offset + _n_cigar_op * uint.sizeof;
994     }
995 
996     @property size_t _qual_offset()      const nothrow {
997         return _seq_offset + (_l_seq + 1) / 2;
998     }
999 
1000     // Offset of auxiliary data
1001     @property size_t _tags_offset()      const nothrow {
1002         return _qual_offset + _l_seq;
1003     }
1004 
1005     // Sets n-th flag bit to boolean value b.
1006     void _setFlag(int n, bool b) {
1007         assert(n < 16);
1008         // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching
1009         ushort mask = cast(ushort)(1 << n);
1010         _flag = (_flag & ~cast(int)(mask)) | ((-cast(int)b) & mask);
1011     }
1012 
1013     // If _chunk is still a slice, not an array, duplicate it.
1014     // Used when some part of alignment record is modified by user.
1015     //
1016     // Basically, it's sort of copy-on-write: a lot of read-only alignments
1017     // may point to the same location, but every modified one allocates its
1018     // own chunk of memory.
1019     void _dup() {
1020         if (_is_slice && !_modify_in_place) {
1021             _chunk = _chunk.dup;
1022             _is_slice = false;
1023         }
1024     }
1025 
1026 public:
1027     // Calculates bin number.
1028     void _recalculate_bin() {
1029         _bin = reg2bin(position, position + basesCovered());
1030     }
1031 }
1032 
1033 
1034 /// Lazy tag storage.
1035 ///
1036 ///   Provides hash-like access and opportunity to iterate
1037 ///   storage like an associative array.
1038 mixin template TagStorage() {
1039 
1040     // Provides access to chunk of memory which contains tags.
1041     // This way, every time _tags_offset gets updated
1042     // (due to update of cigar string/read name/sequence and memory move),
1043     // the change is reflected automatically in tag storage.
1044     private @property const(ubyte)[] _tags_chunk() const  {
1045         return _chunk[_tags_offset .. $];
1046     }
1047 
1048     /// Hash-like access to tags. Time complexity is $(BIGOH number of tags).
1049     /// $(BR)
1050     /// If tag with such $(I key) is not found, returned value 'is nothing'.
1051     /// $(BR)
1052     /// If key length is different from 2, exception is thrown.
1053     /// $(BR)
1054     /// Special case when $(I value) represents nothing is used for removing tag
1055     /// (assuming that no more than one with this key is presented in the record).
1056     ///
1057     /// Examples:
1058     /// ----------------------------
1059     /// auto v = read["NM"];
1060     /// assert(v.is_integer);
1061     ///
1062     /// auto v = read["MN"];
1063     /// assert(v.is_nothing); // no such tag
1064     ///
1065     /// read["NM"] = 3; // converted to bio.std.hts.bam.tagvalue.Value implicitly
1066     ///
1067     /// read["NM"] = null; // removes tag
1068     /// assert(read["NM"].is_nothing);
1069     /// ----------------------------
1070     bio.std.hts.bam.tagvalue.Value opIndex(string key) const {
1071         enforce(key.length == 2, "Key length must be 2");
1072         auto __tags_chunk = _tags_chunk; // _tags_chunk is evaluated lazily
1073         if (__tags_chunk.length < 4)
1074             return Value(null);
1075 
1076        size_t offset = 0;
1077        while (offset + 1 < __tags_chunk.length) {
1078            if (__tags_chunk[offset .. offset + 2] == key) {
1079                offset += 2;
1080                return readValue(offset, __tags_chunk);
1081            } else {
1082                offset += 2;
1083                skipValue(offset, __tags_chunk);
1084            }
1085        }
1086        return Value(null);
1087     }
1088 
1089     /// ditto
1090     void opIndexAssign(T)(T value, string key)
1091         if (is(T == Value) || __traits(compiles, GetTypeId!T))
1092     {
1093         static if(is(T == Value)) {
1094             enforce(key.length == 2, "Key length must be 2");
1095             auto __tags_chunk = _tags_chunk;
1096 
1097             _dup();
1098 
1099             size_t offset = 0;
1100             while (offset + 1 < __tags_chunk.length) {
1101                 if (__tags_chunk[offset .. offset + 2] == key) {
1102                     if (value.is_nothing) {
1103                         // special case - remove tag
1104                         removeValueAt(offset);
1105                     } else {
1106                         replaceValueAt(offset + 2, value);
1107                     }
1108                     return;
1109                 } else {
1110                     offset += 2;
1111                     skipValue(offset, __tags_chunk);
1112                 }
1113             }
1114 
1115             if (!value.is_nothing)
1116                 appendTag(key, value);
1117         } else {
1118             opIndexAssign(Value(value), key);
1119         }
1120     }
1121 
1122     /// Append new tag to the end, skipping check if it already exists. $(BIGOH 1)
1123     void appendTag(string key, Value value) {
1124         auto oldlen = _chunk.length;
1125         _chunk.length = _chunk.length + sizeInBytes(value) + 2 * char.sizeof;
1126         _chunk[oldlen .. oldlen + 2] = cast(ubyte[])key;
1127         emplaceValue(_chunk.ptr + oldlen + 2, value);
1128     }
1129 
1130     /// Remove all tags
1131     void clearAllTags() {
1132         _chunk.length = _tags_offset;
1133     }
1134 
1135     /// Number of tags. $(BIGOH number of tags)
1136     size_t tagCount() {
1137         size_t result = 0;
1138         size_t offset = 0;
1139         auto __tags_chunk = _tags_chunk;
1140         while (offset + 1 < __tags_chunk.length) {
1141             offset += 2;
1142             skipValue(offset, __tags_chunk);
1143             result += 1;
1144         }
1145         return result;
1146     }
1147 
1148     // replace existing tag
1149     private void replaceValueAt(size_t offset, Value value) {
1150         // offset points to the beginning of the value
1151         auto begin = offset;
1152         auto __tags_chunk = _tags_chunk;
1153         skipValue(offset, __tags_chunk); // now offset is updated and points to the end
1154         auto end = offset;
1155 
1156         prepareSlice(_chunk, __tags_chunk[begin .. end], sizeInBytes(value));
1157 
1158         emplaceValue(_chunk.ptr + _tags_offset + begin, value);
1159     }
1160 
1161     // remove existing tag
1162     private void removeValueAt(size_t begin) {
1163         // offset points to the beginning of the value
1164         auto offset = begin + 2;
1165         auto __tags_chunk = _tags_chunk;
1166         skipValue(offset, __tags_chunk);
1167         auto end = offset;
1168         // this does the job (see prepareSlice code)
1169         prepareSlice(_chunk, __tags_chunk[begin .. end], 0);
1170     }
1171 
1172     ///  Provides opportunity to iterate over tags.
1173     int opApply(scope int delegate(const ref string k, const ref Value v) dg) const {
1174         size_t offset = 0;
1175         auto __tags_chunk = _tags_chunk;
1176         while (offset + 1 < __tags_chunk.length) {
1177             auto key = cast(string)__tags_chunk[offset .. offset + 2];
1178             offset += 2;
1179             auto val = readValue(offset, __tags_chunk);
1180             auto res = dg(key, val);
1181             if (res != 0) {
1182                 return res;
1183             }
1184         }
1185         return 0;
1186     }
1187 
1188     /// Returns the number of tags. Time complexity is $(BIGOH number of tags)
1189     size_t tagCount() const {
1190         size_t res = 0;
1191         size_t offset = 0;
1192         auto __tags_chunk = _tags_chunk;
1193         while (offset + 1 < __tags_chunk.length) {
1194             offset += 2;
1195             skipValue(offset, __tags_chunk);
1196             res += 1;
1197         }
1198         return res;
1199     }
1200 
1201     private void writeTags(BamWriter writer) {
1202         if (std.system.endian == Endian.littleEndian) {
1203             writer.writeByteArray(_tags_chunk[]);
1204         } else {
1205             fixTagStorageByteOrder();
1206             writer.writeByteArray(_tags_chunk[]);
1207             fixTagStorageByteOrder();
1208         }
1209     }
1210 
1211     // Reads value which starts from (_tags_chunk.ptr + offset) address,
1212     // and updates offset to the end of value. O(1)
1213     private Value readValue(ref size_t offset, const(ubyte)[] tags_chunk) const {
1214         char type = cast(char)tags_chunk[offset++];
1215         return readValueFromArray(type, tags_chunk, offset);
1216     }
1217 
1218     // Increases offset so that it points to the next value. O(1).
1219     private void skipValue(ref size_t offset, const(ubyte)[] tags_chunk) const {
1220         char type = cast(char)tags_chunk[offset++];
1221         if (type == 'Z' || type == 'H') {
1222             while (tags_chunk[offset++] != 0) {}
1223         } else if (type == 'B') {
1224             char elem_type = cast(char)tags_chunk[offset++];
1225             auto length = *(cast(uint*)(tags_chunk.ptr + offset));
1226             offset += uint.sizeof + charToSizeof(elem_type) * length;
1227         } else {
1228             offset += charToSizeof(type);
1229         }
1230     }
1231 
1232     /*
1233       Intended to be used in constructor for initial endianness fixing
1234       in case the library is used on big-endian system.
1235 
1236       NOT TESTED AT ALL!!!
1237     */
1238     private void fixTagStorageByteOrder() {
1239         /* TODO: TEST ON BIG-ENDIAN SYSTEM!!! */
1240         const(ubyte)* p = _tags_chunk.ptr;
1241         const(ubyte)* end = p + _chunk.length;
1242         while (p < end) {
1243             p += 2; // skip tag name
1244             char type = *(cast(char*)p);
1245             ++p; // skip type
1246             if (type == 'Z' || type == 'H') {
1247                 while (*p != 0) { // zero-terminated
1248                     ++p;          // string
1249                 }
1250                 ++p; // skip '\0'
1251             } else if (type == 'B') { // array
1252                 char elem_type = *(cast(char*)p);
1253                 uint size = charToSizeof(elem_type);
1254                 switchEndianness(p, uint.sizeof);
1255                 uint length = *(cast(uint*)p);
1256                 p += uint.sizeof; // skip length
1257                 if (size != 1) {
1258                     for (auto j = 0; j < length; j++) {
1259                         switchEndianness(p, size);
1260                         p += size;
1261                     }
1262                 } else {
1263                     // skip
1264                     p += length;
1265                 }
1266             } else {
1267                 uint size = charToSizeof(type);
1268                 if (size != 1) {
1269                     switchEndianness(p, size);
1270                     p += size;
1271                 } else {
1272                     ++p;
1273                 }
1274             }
1275         }
1276     }
1277 }
1278 
1279 unittest {
1280 
1281     import std.algorithm;
1282     import std.stdio;
1283     import std.math;
1284 
1285     // stderr.writeln("Testing BamRead behaviour...");
1286     auto read = BamRead("readname",
1287                         "AGCTGACTACGTAATAGCCCTA",
1288                         [CigarOperation(22, 'M')]);
1289     assert(read.sequence_length == 22);
1290     assert(read.cigar.length == 1);
1291     assert(read.cigarString() == "22M");
1292     assert(read.name == "readname");
1293     assert(equal(read.sequence(), "AGCTGACTACGTAATAGCCCTA"));
1294 
1295     read.name = "anothername";
1296     assert(read.name == "anothername");
1297     assert(read.cigarString() == "22M");
1298 
1299     read.base_qualities = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1300                            13, 14, 15, 16, 17, 18, 19, 20, 21, 22];
1301     assert(reduce!"a+b"(0, read.base_qualities) == 253);
1302 
1303     read["RG"] = 15;
1304     assert(read["RG"] == 15);
1305 
1306     read["X1"] = [1, 2, 3, 4, 5];
1307     assert(read["X1"] == [1, 2, 3, 4, 5]);
1308 
1309     read.cigar = [CigarOperation(20, 'M'), CigarOperation(2, 'X')];
1310     assert(read.cigarString() == "20M2X");
1311 
1312     read["RG"] = cast(float)5.6;
1313     assert(approxEqual(to!float(read["RG"]), 5.6));
1314 
1315     read.sequence = "AGCTGGCTACGTAATAGCCCT";
1316     assert(read.sequence_length == 21);
1317     assert(read.base_qualities.length == 21);
1318     assert(read.base_qualities[20] == 255);
1319     assert(equal(read.sequence(), "AGCTGGCTACGTAATAGCCCT"));
1320     assert(retro(read.sequence)[2] == 'C');
1321     assert(retro(read.sequence)[0] == 'T');
1322     assert(read.sequence[4] == 'G');
1323     assert(read.sequence[0] == 'A');
1324     assert(equal(read.sequence[0..8], "AGCTGGCT"));
1325     assert(equal(read.sequence[3..5], "TG"));
1326     assert(equal(read.sequence[3..9][1..4], "GGC"));
1327 
1328     read["X1"] = 42;
1329     assert(read["X1"] == 42);
1330 
1331     assert(read.tagCount() == 2);
1332 
1333     read["X1"] = null;
1334     assert(read["X1"].is_nothing);
1335     assert(read.tagCount() == 1);
1336     read.sequence = "GTAAGCTGGCACTAGCAGCCT";
1337     read.cigar = [CigarOperation(read.sequence_length, 'M')];
1338     read["RG"] = null;
1339     read["RG"] = "readgroup1";
1340     assert(read.tagCount() == 1);
1341     read["RG"] = null;
1342     assert(read.tagCount() == 0);
1343 
1344     read.sequence[5] = Base('N');
1345     read.sequence[6] = Base('A');
1346     read.sequence[7] = Base('C');
1347     read.sequence[8] = Base('G');
1348     read.base_qualities[5] = 42;
1349     assert(read.sequence[5 .. 9].equal("NACG"));
1350     assert(read.base_qualities[5] == 42);
1351 
1352     // Test MsgPack serialization/deserialization
1353 
1354     {
1355     import std.typecons;
1356     static import bio.std.hts.thirdparty.msgpack;
1357     auto packer = bio.std.hts.thirdparty.msgpack.packer(Appender!(ubyte[])());
1358     read.toMsgpack(packer);
1359     auto data = packer.stream.data;
1360     auto rec = unpack(data).via.array;
1361     assert(rec[0].as!(ubyte[]) == "anothername");
1362     assert(rec[5].as!(int[]) == [21]);
1363     assert(rec[6].as!(ubyte[]) == ['M']);
1364     assert(rec[10].as!(ubyte[]) == to!string(read.sequence));
1365     }
1366 
1367     read.clearAllTags();
1368     assert(read.tagCount() == 0);
1369 }
1370 
1371 /// $(P BamRead wrapper which precomputes $(D end_position) = $(D position) + $(D basesCovered()).)
1372 ///
1373 /// $(P Computation of basesCovered() takes quite a few cycles. Therefore in places where this
1374 /// property is frequently accessed, it makes sense to precompute it for later use.)
1375 ///
1376 /// $(P The idea is that this should be a drop-in replacement for BamRead in algorithms,
1377 /// as the struct uses 'alias this' construction for the wrapped read.)
1378 struct EagerBamRead(R=BamRead) {
1379     ///
1380     this(R read) {
1381         this.read = read;
1382         this.end_position = read.position + read.basesCovered();
1383     }
1384 
1385     ///
1386     R read;
1387     ///
1388     alias read this;
1389 
1390     /// End position on the reference, computed as position + basesCovered().
1391     int end_position;
1392 
1393     ///
1394     EagerBamRead dup() @property const {
1395         return EagerBamRead(read.dup);
1396     }
1397 }
1398 
1399 static assert(is(EagerBamRead!BamRead : BamRead));
1400 
1401 /// Checks if $(D T) behaves like $(D BamRead)
1402 template isBamRead(T)
1403 {
1404     static if (is(Unqual!T : BamRead))
1405         enum isBamRead = true;
1406     else
1407         enum isBamRead = __traits(compiles,
1408         {
1409             T t; bool p;
1410             p = t.ref_id == 1;          p = t.position == 2;          p = t.bin.id == 3;
1411             p = t.mapping_quality == 4; p = t.flag == 5;              p = t.sequence_length == 6;
1412             p = t.mate_ref_id == 7;     p = t.mate_position == 8;     p = t.template_length == 9;
1413             p = t.is_paired;            p = t.proper_pair;            p = t.is_unmapped;
1414             p = t.mate_is_unmapped;     p = t.mate_is_reverse_strand; p = t.is_first_of_pair;
1415             p = t.is_second_of_pair;    p = t.is_secondary_alignment; p = t.failed_quality_control;
1416             p = t.is_duplicate;         p = t.strand == '+';          p = t.name == "";
1417             p = t.cigar[0].type == 'M'; p = t.basesCovered() > 42;    p = t.cigarString() == "";
1418             p = t.sequence[0] == 'A';   p = t.base_qualities[0] == 0;
1419         });
1420 }
1421 
1422 // Comparison function for 'queryname' sorting order based on https://github.com/samtools/htsjdk/blob/master/src/main/java/htsjdk/samtools/SAMRecordQueryNameComparator.java
1423 bool compareReadNamesAsPicard(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1424     if (isBamRead!R1 && isBamRead!R2)
1425  {
1426     if(a1.name == a2.name)
1427     {
1428         if(a1.is_paired() || a2.is_paired())
1429         {
1430             if(!a1.is_paired())
1431                 return false;
1432             if(!a2.is_paired())
1433                 return true;
1434 
1435             if(a1.is_first_of_pair() && a2.is_second_of_pair())
1436                 return true;
1437 
1438             if(a1.is_second_of_pair() && a2.is_first_of_pair())
1439                 return false;
1440                 
1441         }
1442 
1443         if(a1.strand() != a2.strand())
1444         {
1445             return a1.strand() == '-' ? false : true;
1446         }
1447 
1448         if(a1.is_secondary_alignment() != a2.is_secondary_alignment())
1449         {
1450             return a2.is_secondary_alignment();
1451         }
1452 
1453         if(a1.is_supplementary() != a2.is_supplementary())
1454         {
1455             return a2.is_supplementary();
1456         }
1457 
1458         if(!a1["HI"].is_nothing)
1459         {
1460                 if(a2["HI"].is_nothing)
1461                         return true;
1462 
1463                 int i1 = to!int(a1["HI"]);
1464                 int i2 = to!int(a2["HI"]);
1465                 return i1 < i2;
1466         }
1467         else
1468         if(!a2["HI"].is_nothing)
1469                 return false;
1470     }
1471     return a1.name < a2.name;
1472 }
1473 
1474 bool compareReadNamesAsPicard(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1475     if (isBamRead!R1 && isSomeString!R2)
1476 {
1477     return a1.name < a2;
1478 }
1479 
1480 bool compareReadNamesAsPicard(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1481     if (isSomeString!R1 && isBamRead!R2)
1482 {
1483     return a1 < a2.name;
1484 }
1485 
1486 /// $(P Comparison function for 'queryname' sorting order
1487 /// (return whether first read is 'less' than second))
1488 ///
1489 /// $(P This function can be called on:
1490 ///   $(UL
1491 ///     $(LI two reads)
1492 ///     $(LI read and string in any order)))
1493 bool compareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1494     if (isBamRead!R1 && isBamRead!R2)
1495 {
1496     return a1.name < a2.name;
1497 }
1498 
1499 bool compareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1500     if (isBamRead!R1 && isSomeString!R2)
1501 {
1502     return a1.name < a2;
1503 }
1504 
1505 bool compareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1506     if (isSomeString!R1 && isBamRead!R2)
1507 {
1508     return a1 < a2.name;
1509 }
1510 
1511 int mixedStrCompare(string a, string b) {
1512   import std.ascii : isDigit;
1513   while (!a.empty && !b.empty) {
1514     if (a.front.isDigit && b.front.isDigit) {
1515       // skip zeros
1516       int za, zb;
1517       while (!a.empty && a.front == '0') { ++za; a.popFront(); }
1518       while (!b.empty && b.front == '0') { ++zb; b.popFront(); }
1519 
1520       // skip equal digits
1521       while (!a.empty && !b.empty && a.front.isDigit && a.front == b.front) {
1522         a.popFront();
1523         b.popFront();
1524       }
1525 
1526       if (!a.empty && !b.empty && a.front.isDigit && b.front.isDigit) {
1527         // the number of leading digits in each string is non-zero
1528         size_t i = 0, maxi = min(a.length, b.length);
1529         while (i < maxi && a[i].isDigit && b[i].isDigit) ++i;
1530         if (i < a.length && a[i].isDigit) return 1; // a contains more digits
1531         if (i < b.length && b[i].isDigit) return -1; // b contains more digits
1532         // the counts are equal, compare first digits
1533         return cast(byte)a.front - cast(byte)b.front;
1534       } else if (!a.empty && a.front.isDigit) return 1;
1535         else if (!b.empty && b.front.isDigit) return -1;
1536         else if (za != zb) return za - zb;  // order by the number of leading zeros
1537     } else {
1538       // lexicographical comparison for non-digits
1539       if (a.front != b.front) return cast(byte)a.front - cast(byte)b.front;
1540       a.popFront(); b.popFront();
1541     }
1542   }
1543   return (!a.empty) ? 1 : (!b.empty) ? -1 : 0;
1544 }
1545 
1546 /// $(P Comparison function for 'queryname' sorting order as in Samtools
1547 /// (returns whether first read is 'less' than second in a 'mixed' order,
1548 ///  i.e. numbers inside the strings are compared by their integer value))
1549 ///
1550 /// $(P This function can be called on:
1551 ///   $(UL
1552 ///     $(LI two reads)
1553 ///     $(LI read and string in any order)))
1554 bool mixedCompareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1555     if (isBamRead!R1 && isBamRead!R2)
1556 {
1557     return mixedStrCompare(a1.name, a2.name) < 0;
1558 }
1559 
1560 bool mixedCompareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1561     if (isBamRead!R1 && isSomeString!R2)
1562 {
1563     return mixedStrCompare(a1.name, a2) < 0;
1564 }
1565 
1566 bool mixedCompareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1567     if (isSomeString!R1 && isBamRead!R2)
1568 {
1569     return mixedStrCompare(a1, a2.name) < 0;
1570 }
1571 
1572 unittest {
1573   assert(mixedStrCompare("BC0123", "BC01234") < 0);
1574   assert(mixedStrCompare("BC0123", "BC0123Z") < 0);
1575   assert(mixedStrCompare("BC01234", "BC01234") == 0);
1576   assert(mixedStrCompare("BC0123DEF45", "BC01234DEF45") < 0);
1577   assert(mixedStrCompare("BC01236DEF45", "BC01234DEF45") > 0);
1578   assert(mixedStrCompare("BC012", "BC0012") < 0);
1579   assert(mixedStrCompare("BC0012DE0034", "BC0012DE34") > 0);
1580   assert(mixedStrCompare("BC12DE0034", "BC012DE34") < 0);
1581   assert(mixedStrCompare("1235", "1234") > 0);
1582 }
1583 
1584 // small utility function to get the value of the HI tag and return '0' 
1585 // if it is not defined
1586 private int getHI(R)(auto ref R r) 
1587     if (isBamRead!R)
1588 {
1589         auto v = r["HI"];
1590         if (v.is_nothing)
1591             return 0;
1592         return to!int(v);
1593 }
1594 
1595 /// $(P Comparison function for 'queryname' sorting order setting mates
1596 /// of the same alignments adjacent with the first mate coming before
1597 /// the second mate)
1598 bool compareReadNamesAndMates(R1, R2)(const auto ref R1 r1, const auto ref R2 r2) 
1599     if (isBamRead!R1 && isBamRead!R2)
1600 {
1601     if (r1.name == r2.name) {
1602         if (getHI(r1) == getHI(r2))
1603             return r1.flag() < r2.flag();
1604         return getHI(r1) < getHI(r2);
1605     }
1606     return r1.name < r2.name;
1607 }
1608 
1609 /// $(P Comparison function for 'queryname' sorting order as in Samtools
1610 /// setting mates of the same alignments adjacent with the first mate 
1611 /// coming before the second mate)
1612 bool mixedCompareReadNamesAndMates(R1, R2)(const auto ref R1 r1, const auto ref R2 r2)
1613     if (isBamRead!R1 && isBamRead!R2) 
1614 {
1615     if (mixedStrCompare(r1.name, r2.name) == 0) {
1616         if (getHI(r1) == getHI(r2))
1617             return r1.flag() < r2.flag();
1618         return getHI(r1) < getHI(r2);    
1619     }
1620     return mixedStrCompare(r1.name, r2.name) < 0;
1621 }
1622 
1623 /// $(P Comparison function for 'coordinate' sorting order
1624 /// (returns whether first read is 'less' than second))
1625 ///
1626 /// $(P This function can be called on:
1627 ///   $(UL
1628 ///     $(LI two reads (in this case, reference IDs are also taken into account))
1629 ///     $(LI read and integer in any order)))
1630 
1631 /// This function takes read direction into account (used for original samtools style sorting)
1632 bool compareCoordinatesAndStrand(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1633     if (isBamRead!R1 && isBamRead!R2)
1634 {
1635     if (a1.ref_id == -1) return false; // unmapped reads should be last
1636     if (a2.ref_id == -1) return true;
1637     if (a1.ref_id < a2.ref_id) return true;
1638     if (a1.ref_id > a2.ref_id) return false;
1639     if (a1.position < a2.position) return true;
1640     if (a1.position > a2.position) return false;
1641     return !a1.is_reverse_strand && a2.is_reverse_strand;
1642 }
1643 
1644 bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1645     if (isBamRead!R1 && isBamRead!R2)
1646 {
1647     if (a1.ref_id == -1) return false; // unmapped reads should be last
1648     if (a2.ref_id == -1) return true;
1649     if (a1.ref_id < a2.ref_id) return true;
1650     if (a1.ref_id > a2.ref_id) return false;
1651     return (a1.position < a2.position);
1652 }
1653 
1654 bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1655     if (isBamRead!R1 && isIntegral!R2)
1656 {
1657     return a1.position < a2;
1658 }
1659 
1660 bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
1661     if (isIntegral!R1 && isBamRead!R2)
1662 {
1663     return a1 < a2.position;
1664 }
1665 
1666 static assert(isTwoWayCompatible!(compareReadNames, BamRead, string));
1667 static assert(isTwoWayCompatible!(compareCoordinates, BamRead, int));
1668 
1669 /// Allows modification of the read in-place even if it's slice-backed.
1670 struct UniqueRead(R) {
1671     R read;
1672     alias read this;
1673 
1674     this(R read) {
1675         this.read = read;
1676         this.read._modify_in_place = true;
1677     }
1678 
1679     ~this() {
1680         this.read._modify_in_place = false;
1681     }
1682 }
1683 
1684 /// ditto
1685 auto assumeUnique(R)(auto ref R read) if (isBamRead!R) {
1686     return UniqueRead!R(read);
1687 }