1 /*
2     New style BAM reader. This file is part of Sambamba.
3     Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>
4 
5     Sambamba is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published
7     by the Free Software Foundation; either version 2 of the License,
8     or (at your option) any later version.
9 
10     Sambamba is distributed in the hope that it will be useful, but
11     WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13     General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program; if not, write to the Free Software
17     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18     02111-1307 USA
19 
20 */
21 
22 // This is a complete rewrite of Artem Tarasov's original reader.
23 
24 module bio.std.experimental.hts.bam.reader;
25 
26 import std.conv;
27 import core.stdc.stdio: fopen, fread, fclose;
28 import std.exception;
29 import std.file;
30 import std.format : format;
31 import std.stdio;
32 import std..string;
33 import std.typecons;
34 import std.bitmanip;
35 
36 //TODO remove these dependecies 
37 import bio.std.hts.bam.cigar;
38 import bio.std.hts.bam.constants;
39 
40 import bio.core.utils.exception;
41 
42 import bio.std.experimental.hts.bgzf;
43 import bio.std.experimental.hts.constants;
44 
45 import bio.std.experimental.hts.bam.header;
46 
47 template ReadFlags(alias flag) {
48   // 0x01: template having multiple segments in sequencing
49   @property bool is_paired()                nothrow { return cast(bool)(flag & 0x1); }
50   // 0x2: Each segment properly aligned according to the aligner
51   @property bool is_proper_pair()           nothrow { return cast(bool)(flag & 0x2); }
52   // 0x4: Segment unmapped
53   @property bool is_unmapped_raw()          nothrow { return cast(bool)(flag & 0x4); }
54   @property bool is_mapped_raw()            nothrow { return cast(bool)(!(flag & 0x4)); }
55   // 0x8: Next segment in template unmapped
56   @property bool mate_is_unmapped()         nothrow { return cast(bool)(flag & 0x8); }
57   // 0x10: SEQ being reverse complimented
58   @property bool is_reverse_strand()        nothrow { return cast(bool)(flag & 0x10); }
59   // 0x20: SEQ of the next segment in the template being reverse complemented
60   @property bool mate_is_reverse_strand()   nothrow { return cast(bool)(flag & 0x20); }
61   // 0x40: The first segment in the template
62   @property bool is_first_of_pair()         nothrow { return cast(bool)(flag & 0x40); }
63   // 0x80: The last segment in the template
64   @property bool is_second_of_pair()        nothrow { return cast(bool)(flag & 0x80); }
65   // 0x100: Secondary segment
66   @property bool is_secondary_alignment()   nothrow { return cast(bool)(flag & 0x100); }
67   // 0x200: Not passing filters, such as platform/vendor quality controls
68   @property bool is_qc_fail() {
69     assert(is_mapped_raw,to!string(this));
70     return cast(bool)(flag & 0x200); }
71   alias is_qc_fail failed_quality_control;
72   /// 0x400: PCR or optical duplicate
73   @property bool is_duplicate()             nothrow { return cast(bool)(flag & 0x400); }
74   /// 0x800: Supplementary alignment
75   @property bool is_supplementary()         nothrow { return cast(bool)(flag & 0x800); }
76   @property string show_flags() {
77     string res = format("b%b-%d",flag,flag);
78     if (is_paired) res ~= " pair";
79     if (is_proper_pair) res ~= " proper";
80     if (is_mapped_raw) res ~= " mapped";
81     if (is_unmapped_raw) res ~= " unmapped";
82     if (mate_is_unmapped) res ~= " mate_unmapped";
83     if (is_reverse_strand) res ~= " rev_strand";
84     if (mate_is_reverse_strand) res ~= " mate_rev_strand";
85     if (is_first_of_pair) res ~= " first_of_pair";
86     if (is_second_of_pair) res ~= " second_of_pair";
87     if (is_secondary_alignment) res ~= " secondary_aln";
88     if (is_mapped_raw && is_qc_fail) res ~= " qc_fail";
89     if (is_duplicate) res ~= " duplicate";
90     if (is_supplementary) res ~= " suppl";
91     return res;
92   }
93 }
94 
95 template CheckMapped(alias refid) {
96   @property nothrow bool is_unmapped() {
97     return is_unmapped_raw;
98   }
99   @property bool is_mapped() {
100     debug {
101       if (is_mapped_raw) {
102         assert(refid != -1, "ref_id can not be -1 for mapped read");  // BAM spec
103       }
104     }
105     return !is_unmapped_raw;
106   }
107 }
108 
109 enum Offset {
110   bin_mq_nl=0, flag_nc=4, flag=6, l_seq=8, next_refID=12, next_pos=16, tlen=20, read_name=24
111 };
112 
113 /**
114    Raw Read buffer containing unparsed data. It should be considered
115    read-only.
116 
117    Current implementation is a cluct (class-struct hybrid). The _data
118    pointer is shared when ReadBlob is assigned to another variable
119    (i.e., there is a remote dependency). The advantage is that for
120    each Read data gets allocated on the heap only once.
121 
122    All offsets are indexed on init (except for tags).  When using
123    fields beyond refid,pos use ProcessReadBlob instead because it
124    caches values.
125 */
126 
127 struct ReadBlob {
128   RefId refid;   // -1 is invalid (BAM Spec)
129   GenomePos pos; // 0 coordinate based (BAM spec)
130   private ubyte[] _data;
131   uint offset_cigar=int.max, offset_seq=int.max, offset_qual=int.max;
132 
133   mixin ReadFlags!(_flag);
134   mixin CheckMapped!(refid);
135 
136   /*
137   this(RefId ref_id, GenomePos read_pos, ubyte[] buf) {
138     refid = ref_id;
139     pos = read_pos;
140     _data = buf;
141   }
142   */
143 
144   @property void cleanup() {
145     destroy(_data);
146     _data = null;
147   }
148 
149   // Turn ReadBlob into class-struct hybrid or a cluct ;)
150   // @disable this(this); // disable copy semantics;
151 
152   @property @trusted nothrow private const T fetch(T)(uint raw_offset) {
153     ubyte[] buf = cast(ubyte[])_data[raw_offset..raw_offset+T.sizeof];
154     return cast(const(T))buf.read!(T,Endian.littleEndian)();
155   }
156 
157   @property @trusted nothrow private const
158   uint _bin_mq_nl()        { return fetch!uint(Offset.bin_mq_nl); }
159   @property @trusted nothrow private const
160   uint _flag_nc()          { return fetch!uint(Offset.flag_nc); }
161   @property @trusted nothrow private const
162   int sequence_length()    { return fetch!int(Offset.l_seq); }
163   @property @trusted nothrow private const
164   int _next_refID()        { return fetch!int(Offset.next_refID); }
165   @property @trusted nothrow private const
166   int _next_pos()          { return fetch!int(Offset.next_pos); }
167   @property @trusted nothrow private const
168   int _tlen()              { return fetch!int(Offset.tlen); } // avoid using TLEN
169   @property @trusted nothrow private const
170   ushort _bin()            { return _bin_mq_nl >> 16; }
171   @property @trusted nothrow private const
172   ubyte _mapq()            { return (_bin_mq_nl >> 8) & 0xFF; }
173   @property @trusted nothrow private const
174   ubyte _l_read_name()     { return _bin_mq_nl & 0xFF; }
175   @property @trusted nothrow private const
176   ushort _flag()          { return fetch!ushort(Offset.flag); }
177   @property @trusted nothrow private const
178   ushort _n_cigar_op()     { return _flag_nc & 0xFFFF; }
179   @property @trusted nothrow private const
180   uint _read_name_offset() { return Offset.read_name; }
181   @property @trusted nothrow private
182   uint _cigar_offset()       {
183     if (offset_cigar == int.max)
184       offset_cigar = Offset.read_name + cast(uint)(_l_read_name * char.sizeof);
185     return offset_cigar;
186   }
187   @property @trusted nothrow private
188   uint _seq_offset()       {
189     if (offset_seq == int.max)
190       offset_seq = _cigar_offset + cast(uint)(_n_cigar_op * uint.sizeof);
191     return offset_seq;
192   }
193   @property @trusted nothrow private
194   uint _qual_offset()      {
195     if (offset_qual == int.max)
196       offset_qual = _seq_offset + (sequence_length + 1)/2;
197     return offset_qual;
198   }
199   @property @trusted nothrow private
200   uint _tags_offset()      { return _qual_offset + sequence_length; }
201   @property @trusted nothrow private
202   ubyte[] read_name()      { return _data[_read_name_offset.._cigar_offset]; }
203   @property @trusted nothrow private
204   ubyte[] raw_cigar()      { return _data[_cigar_offset.._seq_offset]; }
205   @property @trusted nothrow private
206   ubyte[] raw_sequence()   { return _data[_seq_offset.._qual_offset]; }
207 
208   alias sequence_length _l_seq;
209   alias _mapq mapping_quality; // MAPQ
210 
211   string toString() {
212     return "<** " ~ ReadBlob.stringof ~ " (data size " ~ to!string(_data.length) ~ ") " ~ to!string(refid) ~ ":" ~ to!string(pos) ~ " length " ~ to!string(sequence_length) ~ " flags " ~ show_flags() ~ ">";
213   }
214 
215 }
216 
217 /**
218    ProcessReadBlob provides a caching mechanism for ReadBlob fields. Use
219    this when you need to access field/elements multiple times. Note
220    that ProcessReadBlob becomes invalid when ReadBlob goes out of scope.
221 */
222 struct ProcessReadBlob {
223   private Nullable!ReadBlob _read2;
224   Nullable!int sequence_length2;
225   private Nullable!string sequence2, read_name2;
226   private Nullable!CigarOperations cigar2;
227   private Nullable!GenomePos consumed_reference_bases2;
228 
229   mixin ReadFlags!(_flag);
230   mixin CheckMapped!(ref_id);
231 
232   this(Nullable!ReadBlob _r) {
233     _read2 = _r;
234   }
235 
236   @property void cleanup() {
237     _read2.cleanup;
238   }
239 
240   @property nothrow bool isNull() {
241     return _read2.isNull;
242   }
243 
244   @property RefId ref_id() {
245     enforce(_read2.is_mapped,"Trying to get ref_id an unmapped read " ~ to!string(_read2));
246     return _read2.refid;
247   }
248 
249   @property RefId raw_ref_id() {
250     return _read2.refid;
251   }
252 
253   @property nothrow uint _flag_nc() {
254     return _read2._flag_nc;
255   }
256 
257   @property nothrow ushort _flag() {
258     return _read2._flag;
259   }
260 
261   alias ref_id refid;
262 
263   /// Get the start position on the reference sequence (better use
264   /// start_loc), i.e., the first base that gets consumed in the
265   /// CIGAR.
266   @property GenomePos start_pos() {
267     assert(_read2.is_mapped,"Trying to get pos on an unmapped read"); // BAM spec
268     asserte(_read2.pos < GenomePos.max);
269     return cast(GenomePos)_read2.pos;
270   }
271 
272   @property GenomePos raw_start_pos() {
273     return cast(GenomePos)_read2.pos;
274   }
275 
276   /// Get the end position on the reference sequence (better use end_loc)
277   @property GenomePos end_pos() {
278     assert(sequence_length > 0, "Trying to get end_pos on an empty read sequence");
279     assert(!consumed_reference_bases.isNull);
280     return start_pos + consumed_reference_bases;
281   }
282 
283   @property GenomePos raw_end_pos() {
284     return raw_start_pos + consumed_reference_bases;
285   }
286 
287   @property GenomeLocation start_loc() {
288     return GenomeLocation(ref_id,start_pos);
289   }
290 
291   @property GenomeLocation end_loc() {
292     return GenomeLocation(ref_id,end_pos);
293   }
294 
295   @property @trusted MappingQuality mapping_quality() { // MAPQ
296     assert(_read2.is_mapped,"Trying to get MAPQ on an unmapped read"); // BAM spec
297     return MappingQuality(_read2.mapping_quality);
298   }
299 
300   @property @trusted int tlen() { // do not use
301     return _read2._tlen;
302   }
303 
304   @property @trusted GenomePos sequence_length() {
305     if (sequence_length2.isNull)
306       sequence_length2 = _read2.sequence_length;
307     return sequence_length2;
308   }
309 
310   /// Count and caches consumed reference bases. Uses raw_cigar to
311   /// avoid a heap allocation.
312   @property @trusted Nullable!GenomePos consumed_reference_bases() {
313     if (consumed_reference_bases2.isNull) {
314       assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
315       assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
316       auto raw = cast(uint[]) _read2.raw_cigar();
317       if (raw.length==1 && raw[0] == '*')
318         return consumed_reference_bases2; // null
319       else {
320         GenomePos bases = 0;
321         for (size_t i = 0; i < raw.length; i++) {
322           auto cigarop = CigarOperation(raw[i]);
323           if (cigarop.is_reference_consuming)
324             bases += cigarop.length;
325         }
326         consumed_reference_bases2 = bases;
327       }
328     }
329     return consumed_reference_bases2;
330   }
331 
332   /// Count query consumed bases. Uses raw_cigar to avoid a heap
333   /// allocation.
334   @property @trusted GenomePos consumed_query_bases() {
335     assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
336     assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
337     auto raw = cast(uint[]) _read2.raw_cigar();
338     if (raw.length==1 && raw[0] == '*')
339       return consumed_reference_bases2; // null
340     else {
341       GenomePos bases = 0;
342       for (size_t i = 0; i < raw.length; i++) {
343         auto cigarop = CigarOperation(raw[i]);
344         if (cigarop.is_query_consuming)
345           bases += cigarop.length;
346       }
347       return bases;
348     }
349   }
350 
351   /// Return read name as a string. If unavailable returns
352   /// null. Caches name.
353   @property Nullable!string read_name() {
354     if (read_name2.isNull) {
355       assert(_read2.is_mapped,"Trying to get RNAME on an unmapped read"); // BAM spec
356       auto raw = _read2.read_name;
357       if (raw.length == 0 || (raw.length ==1 && raw[0] == '*'))
358         return read_name2; // null
359       assert(raw.length < 255); // BAM spec
360       if (raw[raw.length-1] == 0) // strip trailing C zero
361         raw.length -= 1;
362       read_name2 = Nullable!string(cast(string)raw);
363     }
364     return read_name2;
365   }
366 
367   /// Returns Cigar as an array of operations. Returns null if no
368   /// operations. Caches Cigar when there are operations.
369   @property Nullable!CigarOperations cigar() {
370     if (cigar2.isNull) {
371       assert(_read2.is_mapped,"Trying to get CIGAR on an unmapped read"); // BAM spec
372       assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
373       auto raw = cast(uint[]) _read2.raw_cigar();
374       if (raw.length==0 || (raw.length==1 && raw[0] == '*'))
375         return cigar2; // null
376       else {
377         auto s = new CigarOperation[raw.length]; // Heap alloc
378         s.length = 0;
379         for (size_t i = 0; i < raw.length; i++) {
380           s ~= CigarOperation(raw[i]);
381         }
382         cigar2 = s;
383       }
384     }
385     return cigar2;
386   }
387 
388   /// Return human readable sequence fragment - null if
389   /// undefined. Caches sequence.
390   @property Nullable!string sequence() {
391     if (sequence2.isNull) { // is it cached in sequence2?
392       auto raw = _read2.raw_sequence();
393       if (raw[0] == '*') {
394         assert(raw.length == 1);
395         return sequence2; // null
396       }
397       auto raw_length = (sequence_length + 1) / 2;
398       char[16] convert = "=ACMGRSVTWYHKDBN";
399       string s;
400       s.reserve(sequence_length); // Heap alloc
401       for (size_t i = 0; i < sequence_length; i++) {
402         auto is_odd = i % 2;
403         auto nuc = (is_odd ? raw[i/2] & 0b00001111 : (raw[i/2] & 0b11110000) >> 4);
404         s ~= convert[nuc];
405       }
406       sequence2 = s;
407     }
408     return sequence2;
409   }
410 
411   @property ubyte[] toBlob() {
412     return _read2._data;
413   }
414 
415   @property string posString() {
416     return (is_mapped ? to!string(ref_id) ~ ":" ~ to!string(start_pos) : "unmapped");
417   }
418 
419   string toString() {
420     // return "<** " ~ ProcessReadBlob.stringof ~ ") " ~ to!string(_read2.refid) ~ ":" ~ to!string(_read2.pos) ~ " length " ~ to!string(sequence_length) ~ ">";
421     return _read2.get.toString();
422   }
423 
424 }
425 
426 /**
427    BamReader2 is used for foreach loops
428 */
429 
430 struct BamReadBlobs {
431   BgzfStream stream;
432   BamHeader header;
433 
434   this(string fn) {
435     stream = BgzfStream(fn);
436   }
437 
438   int opApply(scope int delegate(ref ReadBlob) dg) {
439     fetch_bam_header(header, stream);
440     // parse the reads
441     while (!stream.eof()) {
442       immutable block_size = stream.read!int();
443       immutable refid = stream.read!int();
444       immutable pos = stream.read!int();
445 
446       ubyte[] data = new ubyte[block_size-2*int.sizeof]; // Heap alloc FIXME
447       auto read = ReadBlob(refid,pos,stream.read(data));
448       dg(read);
449     }
450     return 0;
451   }
452 }
453 
454 /**
455    Read streamer - use on single thread only
456 */
457 
458 // import core.memory : pureMalloc;
459 
460 struct BamReadBlobStream {
461   BgzfStream stream;
462   BamHeader header;
463   Nullable!ReadBlob readbuf; // points to current read
464   ubyte[] data; // in sync with readbuf
465 
466   this(string fn) {
467     stream = BgzfStream(fn);
468     fetch_bam_header(header, stream);
469     if (!empty)
470       popFront(); // preload front
471   }
472 
473   bool empty() @property {
474     return stream.eof();
475   }
476 
477   // Returns first read available. If past eof returns null.
478   Nullable!ReadBlob front() {
479     return readbuf;
480   }
481 
482   void popFront() {
483     asserte(!empty); // should have been checked for
484     immutable block_size = stream.read!int();
485     immutable refid = stream.read!int();
486     immutable pos = stream.read!int();
487 
488     // void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
489     data = new ubyte[block_size-2*int.sizeof];
490     readbuf = ReadBlob(refid,pos,stream.read(data));
491     assert(readbuf._data.ptr == data.ptr);
492   }
493 
494 }
495 
496 /**
497    Reader - use on single thread only
498 
499    This one provides peek support. Peek looks one read ahead in the read stream.
500 */
501 
502 struct BamBlobReader {
503   BgzfStream stream;
504   BamHeader header;
505   Nullable!ReadBlob peekbuf; // points to current read
506   // ubyte[] data; // in sync with peekbuf
507 
508   this(string fn) {
509     stream = BgzfStream(fn);
510     fetch_bam_header(header, stream);
511   }
512 
513   bool empty() @property {
514     return peekbuf.isNull && stream.eof();
515   }
516 
517   Nullable!ReadBlob peek() {
518     if (peekbuf.isNull && !empty)
519       fetch();
520     return peekbuf;
521   }
522 
523   /// Fetches the next read. If the peekbuf is not empty return that
524   /// first and reset peekbuf.
525   Nullable!ReadBlob fetch() {
526     if (!peekbuf.isNull) {
527       auto readbuf = peekbuf;
528       peekbuf = Nullable!ReadBlob();
529       return readbuf;
530     }
531     asserte(!empty); // should have been checked for
532     immutable block_size = stream.read!int();
533     immutable refid = stream.read!int();
534     immutable pos = stream.read!int();
535 
536     // void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
537     auto data = new ubyte[block_size-2*int.sizeof];
538     peekbuf = ReadBlob(refid,pos,stream.read(data));
539     return peekbuf;
540   }
541 
542   /// Returns the next matching read. Otherwise null
543   ///
544   /// Example:
545   ///
546   ///    auto readbuf = ProcessReadBlob(stream.read_if!ProcessReadBlob((r) => !remove && r.is_mapped));
547   /*
548   Nullable!ReadBlob read_if(R)(bool delegate(R r) is_match) {
549     while(!empty()) {
550       auto readbuf = read();
551       if (is_match(R(readbuf)))
552         return readbuf;
553       else
554         return Nullable!ReadBlob();
555     }
556     return Nullable!ReadBlob();
557   }
558   */
559 }