1 /*
2     New style BAM reader. This file is part of Sambamba.
3     Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>
4 
5     Sambamba is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published
7     by the Free Software Foundation; either version 2 of the License,
8     or (at your option) any later version.
9 
10     Sambamba is distributed in the hope that it will be useful, but
11     WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13     General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program; if not, write to the Free Software
17     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18     02111-1307 USA
19 
20 */
21 
22 // This is a complete rewrite of Artem Tarasov's original reader.
23 
24 module bio2.bam.reader;
25 
26 import std.conv;
27 import core.stdc.stdio: fopen, fread, fclose;
28 import std.exception;
29 import std.file;
30 import std.format : format;
31 import std.stdio;
32 import std.string;
33 import std.typecons;
34 import std.bitmanip;
35 
36 import bio.bam.cigar;
37 import bio.bam.constants;
38 import bio.core.utils.exception;
39 
40 import bio2.bgzf;
41 import bio2.constants;
42 
43 import bio2.bam.header;
44 
45 template ReadFlags(alias flag) {
46   @property bool is_paired()                nothrow { return cast(bool)(flag & 0x1); }
47   /// Each segment properly aligned according to the aligner
48   @property bool is_proper_pair()           nothrow { return cast(bool)(flag & 0x2); }
49   @property bool is_unmapped_raw()          nothrow { return cast(bool)(flag & 0x4); }
50   @property bool is_mapped_raw()            nothrow { return cast(bool)(!(flag & 0x4)); }
51   @property bool mate_is_unmapped()         nothrow { return cast(bool)(flag & 0x8); }
52   @property bool is_reverse_strand()        nothrow { return cast(bool)(flag & 0x10); }
53   @property bool mate_is_reverse_strand()   nothrow { return cast(bool)(flag & 0x20); }
54   @property bool is_first_of_pair()         nothrow { return cast(bool)(flag & 0x40); }
55   @property bool is_second_of_pair()        nothrow { return cast(bool)(flag & 0x80); }
56   @property bool is_secondary_alignment()   nothrow { return cast(bool)(flag & 0x100); }
57   @property bool is_qc_fail() {
58     assert(is_mapped_raw,to!string(this));
59     return cast(bool)(flag & 0x200); }
60   alias is_qc_fail failed_quality_control;
61   /// PCR or optical duplicate
62   @property bool is_duplicate()             nothrow { return cast(bool)(flag & 0x400); }
63   /// Supplementary alignment
64   @property bool is_supplementary()         nothrow { return cast(bool)(flag & 0x800); }
65   @property string show_flags() {
66     string res = format("b%b-%d",flag,flag);
67     if (is_paired) res ~= " pair";
68     if (is_proper_pair) res ~= " proper";
69     if (is_mapped_raw) res ~= " mapped";
70     if (is_unmapped_raw) res ~= " unmapped";
71     if (mate_is_unmapped) res ~= " mate_unmapped";
72     if (is_reverse_strand) res ~= " rev_strand";
73     if (mate_is_reverse_strand) res ~= " mate_rev_strand";
74     if (is_first_of_pair) res ~= " first_of_pair";
75     if (is_second_of_pair) res ~= " second_of_pair";
76     if (is_secondary_alignment) res ~= " secondary_aln";
77     if (is_mapped_raw && is_qc_fail) res ~= " qc_fail";
78     if (is_duplicate) res ~= " duplicate";
79     if (is_supplementary) res ~= " suppl";
80     return res;
81   }
82 }
83 
84 template CheckMapped(alias refid) {
85   @property nothrow bool is_unmapped() {
86     return is_unmapped_raw;
87   }
88   @property bool is_mapped() {
89     debug {
90       if (is_mapped_raw) {
91         assert(refid != -1, "ref_id can not be -1 for mapped read");  // BAM spec
92       }
93     }
94     return !is_unmapped_raw;
95   }
96 }
97 
98 enum Offset {
99   bin_mq_nl=0, flag_nc=4, flag=6, l_seq=8, next_refID=12, next_pos=16, tlen=20, read_name=24
100 };
101 
102 /**
103    Raw Read buffer containing unparsed data. It should be considered
104    read-only.
105 
106    Current implementation is a cluct (class-struct hybrid). The _data
107    pointer is shared when ReadBlob is assigned to another variable
108    (i.e., there is a remote dependency). The advantage is that for
109    each Read data gets allocated on the heap only once.
110 
111    All offsets are indexed on init (except for tags).  When using
112    fields beyond refid,pos use ProcessReadBlob instead because it
113    caches values.
114 */
115 
116 struct ReadBlob {
117   RefId refid;   // -1 is invalid (BAM Spec)
118   GenomePos pos; // 0 coordinate based (BAM spec)
119   private ubyte[] _data;
120   uint offset_cigar=int.max, offset_seq=int.max, offset_qual=int.max;
121 
122   mixin ReadFlags!(_flag);
123   mixin CheckMapped!(refid);
124 
125   /*
126   this(RefId ref_id, GenomePos read_pos, ubyte[] buf) {
127     refid = ref_id;
128     pos = read_pos;
129     _data = buf;
130   }
131   */
132 
133   @property void cleanup() {
134     delete _data;
135     _data = null;
136   }
137 
138   // Turn ReadBlob into class-struct hybrid or a cluct ;)
139   // @disable this(this); // disable copy semantics;
140 
141   @property @trusted nothrow private const T fetch(T)(uint raw_offset) {
142     ubyte[] buf = cast(ubyte[])_data[raw_offset..raw_offset+T.sizeof];
143     return cast(const(T))buf.read!(T,Endian.littleEndian)();
144   }
145 
146   @property @trusted nothrow private const
147   uint _bin_mq_nl()        { return fetch!uint(Offset.bin_mq_nl); }
148   @property @trusted nothrow private const
149   uint _flag_nc()          { return fetch!uint(Offset.flag_nc); }
150   @property @trusted nothrow private const
151   int sequence_length()    { return fetch!int(Offset.l_seq); }
152   @property @trusted nothrow private const
153   int _next_refID()        { return fetch!int(Offset.next_refID); }
154   @property @trusted nothrow private const
155   int _next_pos()          { return fetch!int(Offset.next_pos); }
156   @property @trusted nothrow private const
157   int _tlen()              { return fetch!int(Offset.tlen); } // avoid using TLEN
158   @property @trusted nothrow private const
159   ushort _bin()            { return _bin_mq_nl >> 16; }
160   @property @trusted nothrow private const
161   ubyte _mapq()            { return (_bin_mq_nl >> 8) & 0xFF; }
162   @property @trusted nothrow private const
163   ubyte _l_read_name()     { return _bin_mq_nl & 0xFF; }
164   @property @trusted nothrow private const
165   ushort _flag()          { return fetch!ushort(Offset.flag); }
166   @property @trusted nothrow private const
167   ushort _n_cigar_op()     { return _flag_nc & 0xFFFF; }
168   @property @trusted nothrow private const
169   uint _read_name_offset() { return Offset.read_name; }
170   @property @trusted nothrow private
171   uint _cigar_offset()       {
172     if (offset_cigar == int.max)
173       offset_cigar = Offset.read_name + cast(uint)(_l_read_name * char.sizeof);
174     return offset_cigar;
175   }
176   @property @trusted nothrow private
177   uint _seq_offset()       {
178     if (offset_seq == int.max)
179       offset_seq = _cigar_offset + cast(uint)(_n_cigar_op * uint.sizeof);
180     return offset_seq;
181   }
182   @property @trusted nothrow private
183   uint _qual_offset()      {
184     if (offset_qual == int.max)
185       offset_qual = _seq_offset + (sequence_length + 1)/2;
186     return offset_qual;
187   }
188   @property @trusted nothrow private
189   uint _tags_offset()      { return _qual_offset + sequence_length; }
190   @property @trusted nothrow private
191   ubyte[] read_name()      { return _data[_read_name_offset.._cigar_offset]; }
192   @property @trusted nothrow private
193   ubyte[] raw_cigar()      { return _data[_cigar_offset.._seq_offset]; }
194   @property @trusted nothrow private
195   ubyte[] raw_sequence()   { return _data[_seq_offset.._qual_offset]; }
196 
197   alias sequence_length _l_seq;
198   alias _mapq mapping_quality; // MAPQ
199 
200   string toString() {
201     return "<** " ~ ReadBlob.stringof ~ " (data size " ~ to!string(_data.length) ~ ") " ~ to!string(refid) ~ ":" ~ to!string(pos) ~ " length " ~ to!string(sequence_length) ~ " flags " ~ show_flags() ~ ">";
202   }
203 
204 }
205 
206 /**
207    ProcessReadBlob provides a caching mechanism for ReadBlob fields. Use
208    this when you need to access field/elements multiple times. Note
209    that ProcessReadBlob becomes invalid when ReadBlob goes out of scope.
210 */
211 struct ProcessReadBlob {
212   private Nullable!ReadBlob _read2;
213   Nullable!int sequence_length2;
214   private Nullable!string sequence2, read_name2;
215   private Nullable!CigarOperations cigar2;
216   private Nullable!GenomePos consumed_reference_bases2;
217 
218   mixin ReadFlags!(_flag);
219   mixin CheckMapped!(ref_id);
220 
221   this(Nullable!ReadBlob _r) {
222     _read2 = _r;
223   }
224 
225   @property void cleanup() {
226     _read2.cleanup;
227   }
228 
229   @property nothrow bool isNull() {
230     return _read2.isNull;
231   }
232 
233   @property RefId ref_id() {
234     enforce(_read2.is_mapped,"Trying to get ref_id an unmapped read " ~ to!string(_read2));
235     return _read2.refid;
236   }
237 
238   @property RefId raw_ref_id() {
239     return _read2.refid;
240   }
241 
242   @property nothrow uint _flag_nc() {
243     return _read2._flag_nc;
244   }
245 
246   @property nothrow ushort _flag() {
247     return _read2._flag;
248   }
249 
250   alias ref_id refid;
251 
252   /// Get the start position on the reference sequence (better use
253   /// start_loc), i.e., the first base that gets consumed in the
254   /// CIGAR.
255   @property GenomePos start_pos() {
256     assert(_read2.is_mapped,"Trying to get pos on an unmapped read"); // BAM spec
257     asserte(_read2.pos < GenomePos.max);
258     return cast(GenomePos)_read2.pos;
259   }
260 
261   @property GenomePos raw_start_pos() {
262     return cast(GenomePos)_read2.pos;
263   }
264 
265   /// Get the end position on the reference sequence (better use end_loc)
266   @property GenomePos end_pos() {
267     assert(sequence_length > 0, "Trying to get end_pos on an empty read sequence");
268     assert(!consumed_reference_bases.isNull);
269     return start_pos + consumed_reference_bases;
270   }
271 
272   @property GenomePos raw_end_pos() {
273     return raw_start_pos + consumed_reference_bases;
274   }
275 
276   @property GenomeLocation start_loc() {
277     return GenomeLocation(ref_id,start_pos);
278   }
279 
280   @property GenomeLocation end_loc() {
281     return GenomeLocation(ref_id,end_pos);
282   }
283 
284   @property @trusted MappingQuality mapping_quality() { // MAPQ
285     assert(_read2.is_mapped,"Trying to get MAPQ on an unmapped read"); // BAM spec
286     return MappingQuality(_read2.mapping_quality);
287   }
288 
289   @property @trusted int tlen() { // do not use
290     return _read2._tlen;
291   }
292 
293   @property @trusted GenomePos sequence_length() {
294     if (sequence_length2.isNull)
295       sequence_length2 = _read2.sequence_length;
296     return sequence_length2;
297   }
298 
299   /// Count and caches consumed reference bases. Uses raw_cigar to
300   /// avoid a heap allocation.
301   @property @trusted Nullable!GenomePos consumed_reference_bases() {
302     if (consumed_reference_bases2.isNull) {
303       assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
304       assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
305       auto raw = cast(uint[]) _read2.raw_cigar();
306       if (raw.length==1 && raw[0] == '*')
307         return consumed_reference_bases2; // null
308       else {
309         GenomePos bases = 0;
310         for (size_t i = 0; i < raw.length; i++) {
311           auto cigarop = CigarOperation(raw[i]);
312           if (cigarop.is_reference_consuming)
313             bases += cigarop.length;
314         }
315         consumed_reference_bases2 = bases;
316       }
317     }
318     return consumed_reference_bases2;
319   }
320 
321   /// Count query consumed bases. Uses raw_cigar to avoid a heap
322   /// allocation.
323   @property @trusted GenomePos consumed_query_bases() {
324     assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
325     assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
326     auto raw = cast(uint[]) _read2.raw_cigar();
327     if (raw.length==1 && raw[0] == '*')
328       return consumed_reference_bases2; // null
329     else {
330       GenomePos bases = 0;
331       for (size_t i = 0; i < raw.length; i++) {
332         auto cigarop = CigarOperation(raw[i]);
333         if (cigarop.is_query_consuming)
334           bases += cigarop.length;
335       }
336       return bases;
337     }
338   }
339 
340   /// Return read name as a string. If unavailable returns
341   /// null. Caches name.
342   @property Nullable!string read_name() {
343     if (read_name2.isNull) {
344       assert(_read2.is_mapped,"Trying to get RNAME on an unmapped read"); // BAM spec
345       auto raw = _read2.read_name;
346       if (raw.length == 0 || (raw.length ==1 && raw[0] == '*'))
347         return read_name2; // null
348       assert(raw.length < 255); // BAM spec
349       if (raw[raw.length-1] == 0) // strip trailing C zero
350         raw.length -= 1;
351       read_name2 = Nullable!string(cast(string)raw);
352     }
353     return read_name2;
354   }
355 
356   /// Returns Cigar as an array of operations. Returns null if no
357   /// operations. Caches Cigar when there are operations.
358   @property Nullable!CigarOperations cigar() {
359     if (cigar2.isNull) {
360       assert(_read2.is_mapped,"Trying to get CIGAR on an unmapped read"); // BAM spec
361       assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
362       auto raw = cast(uint[]) _read2.raw_cigar();
363       if (raw.length==0 || (raw.length==1 && raw[0] == '*'))
364         return cigar2; // null
365       else {
366         auto s = new CigarOperation[raw.length]; // Heap alloc
367         s.length = 0;
368         for (size_t i = 0; i < raw.length; i++) {
369           s ~= CigarOperation(raw[i]);
370         }
371         cigar2 = s;
372       }
373     }
374     return cigar2;
375   }
376 
377   /// Return human readable sequence fragment - null if
378   /// undefined. Caches sequence.
379   @property Nullable!string sequence() {
380     if (sequence2.isNull) { // is it cached in sequence2?
381       auto raw = _read2.raw_sequence();
382       if (raw[0] == '*') {
383         assert(raw.length == 1);
384         return sequence2; // null
385       }
386       auto raw_length = (sequence_length + 1) / 2;
387       char[16] convert = "=ACMGRSVTWYHKDBN";
388       string s;
389       s.reserve(sequence_length); // Heap alloc
390       for (size_t i = 0; i < sequence_length; i++) {
391         auto is_odd = i % 2;
392         auto nuc = (is_odd ? raw[i/2] & 0b00001111 : (raw[i/2] & 0b11110000) >> 4);
393         s ~= convert[nuc];
394       }
395       sequence2 = s;
396     }
397     return sequence2;
398   }
399 
400   @property ubyte[] toBlob() {
401     return _read2._data;
402   }
403 
404   @property string posString() {
405     return (is_mapped ? to!string(ref_id) ~ ":" ~ to!string(start_pos) : "unmapped");
406   }
407 
408   string toString() {
409     // return "<** " ~ ProcessReadBlob.stringof ~ ") " ~ to!string(_read2.refid) ~ ":" ~ to!string(_read2.pos) ~ " length " ~ to!string(sequence_length) ~ ">";
410     return _read2.get.toString();
411   }
412 
413 }
414 
415 /**
416    BamReader2 is used for foreach loops
417 */
418 
419 struct BamReadBlobs {
420   BgzfStream stream;
421   BamHeader header;
422 
423   this(string fn) {
424     stream = BgzfStream(fn);
425   }
426 
427   int opApply(scope int delegate(ref ReadBlob) dg) {
428     fetch_bam_header(header, stream);
429     // parse the reads
430     while (!stream.eof()) {
431       immutable block_size = stream.read!int();
432       immutable refid = stream.read!int();
433       immutable pos = stream.read!int();
434 
435       ubyte[] data = new ubyte[block_size-2*int.sizeof]; // Heap alloc FIXME
436       auto read = ReadBlob(refid,pos,stream.read(data));
437       dg(read);
438     }
439     return 0;
440   }
441 }
442 
443 /**
444    Read streamer - use on single thread only
445 */
446 
447 // import core.memory : pureMalloc;
448 
449 struct BamReadBlobStream {
450   BgzfStream stream;
451   BamHeader header;
452   Nullable!ReadBlob readbuf; // points to current read
453   ubyte[] data; // in sync with readbuf
454 
455   this(string fn) {
456     stream = BgzfStream(fn);
457     fetch_bam_header(header, stream);
458     if (!empty)
459       popFront(); // preload front
460   }
461 
462   bool empty() @property {
463     return stream.eof();
464   }
465 
466   // Returns first read available. If past eof returns null.
467   Nullable!ReadBlob front() {
468     return readbuf;
469   }
470 
471   void popFront() {
472     asserte(!empty); // should have been checked for
473     immutable block_size = stream.read!int();
474     immutable refid = stream.read!int();
475     immutable pos = stream.read!int();
476 
477     // void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
478     data = new ubyte[block_size-2*int.sizeof];
479     readbuf = ReadBlob(refid,pos,stream.read(data));
480     assert(readbuf._data.ptr == data.ptr);
481   }
482 
483 }
484 
485 /**
486    Reader - use on single thread only
487 
488    This one provides peek support. Peek looks one read ahead in the read stream.
489 */
490 
491 struct BamBlobReader {
492   BgzfStream stream;
493   BamHeader header;
494   Nullable!ReadBlob peekbuf; // points to current read
495   // ubyte[] data; // in sync with peekbuf
496 
497   this(string fn) {
498     stream = BgzfStream(fn);
499     fetch_bam_header(header, stream);
500   }
501 
502   bool empty() @property {
503     return peekbuf.isNull && stream.eof();
504   }
505 
506   Nullable!ReadBlob peek() {
507     if (peekbuf.isNull && !empty)
508       fetch();
509     return peekbuf;
510   }
511 
512   /// Fetches the next read. If the peekbuf is not empty return that
513   /// first and reset peekbuf.
514   Nullable!ReadBlob fetch() {
515     if (!peekbuf.isNull) {
516       auto readbuf = peekbuf;
517       peekbuf = Nullable!ReadBlob();
518       return readbuf;
519     }
520     asserte(!empty); // should have been checked for
521     immutable block_size = stream.read!int();
522     immutable refid = stream.read!int();
523     immutable pos = stream.read!int();
524 
525     // void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
526     auto data = new ubyte[block_size-2*int.sizeof];
527     peekbuf = ReadBlob(refid,pos,stream.read(data));
528     return peekbuf;
529   }
530 
531   /// Returns the next matching read. Otherwise null
532   ///
533   /// Example:
534   ///
535   ///    auto readbuf = ProcessReadBlob(stream.read_if!ProcessReadBlob((r) => !remove && r.is_mapped));
536   /*
537   Nullable!ReadBlob read_if(R)(bool delegate(R r) is_match) {
538     while(!empty()) {
539       auto readbuf = read();
540       if (is_match(R(readbuf)))
541         return readbuf;
542       else
543         return Nullable!ReadBlob();
544     }
545     return Nullable!ReadBlob();
546   }
547   */
548 }