1 /*
2     This file is part of BioD.
3     Copyright (C) 2012-2016    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11 
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14 
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 
25 module bio.std.hts.bam.cigar;
26 
27 import std.algorithm;
28 import std.range;
29 import std.conv;
30 import std.format;
31 import std.exception;
32 import std.system;
33 import std.traits;
34 import std.array;
35 import std.bitmanip;
36 import core.stdc.stdlib;
37 
38 import bio.core.base;
39 import bio.core.utils.format;
40 
41 import bio.std.hts.bam.abstractreader;
42 
43 import bio.std.hts.bam.writer;
44 import bio.std.hts.bam.tagvalue;
45 import bio.std.hts.bam.bai.bin;
46 
47 import bio.std.hts.bam.md.core;
48 
49 import bio.std.hts.utils.array;
50 import bio.std.hts.utils.value;
51 import bio.core.utils.switchendianness;
52 
53 import bio.std.hts.thirdparty.msgpack : Packer, unpack;
54 
55 /**
56   Represents single CIGAR operation
57  */
58 struct CigarOperation {
59     static assert(CigarOperation.sizeof == uint.sizeof);
60     /*
61         WARNING!
62 
63       It is very essential that the size of
64       this struct is EXACTLY equal to uint.sizeof!
65 
66       The reason is to avoid copying of arrays during alignment parsing.
67 
68       Namely, when some_pointer points to raw cigar data,
69       we can just do a cast. This allows to access those data
70       directly, not doing any memory allocations.
71     */
72 
73     private uint raw; // raw data from BAM
74 
75     private static ubyte char2op(char c) {
76         switch(c) {
77             case 'M': return 0;
78             case 'I': return 1;
79             case 'D': return 2;
80             case 'N': return 3;
81             case 'S': return 4;
82             case 'H': return 5;
83             case 'P': return 6;
84             case '=': return 7;
85             case 'X': return 8;
86             default:  return 15; // 15 is used as invalid value
87         }
88     }
89 
90     /// Length must be strictly less than 2^28.
91     /// $(BR)
92     /// Operation type must be one of M, I, D, N, S, H, P, =, X.
93     this(uint length, char operation_type) {
94         enforce(length < (1<<28), "Too big length of CIGAR operation");
95         raw = (length << 4) | char2op(operation_type);
96     }
97 
98     this(uint _raw) {
99         raw = _raw;
100     }
101 
102     /// Operation length
103     uint length() @property const nothrow @nogc {
104         return raw >> 4;
105     }
106 
107     /// CIGAR operation as one of MIDNSHP=X.
108     /// Absent or invalid operation is represented by '?'
109     char type() @property const nothrow @nogc {
110         return "MIDNSHP=X????????"[raw & 0xF];
111     }
112 
113     // Each pair of bits has first bit set iff the operation is query consuming,
114     // and second bit set iff it is reference consuming.
115     //                                            X  =  P  H  S  N  D  I  M
116     private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
117 
118     /// True iff operation is one of M, =, X, I, S
119     bool is_query_consuming() @property const nothrow @nogc {
120         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
121     }
122 
123     /// True iff operation is one of M, =, X, D, N
124     bool is_reference_consuming() @property const nothrow @nogc {
125         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
126     }
127 
128     /// True iff operation is one of M, =, X
129     bool is_match_or_mismatch() @property const nothrow @nogc {
130         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
131     }
132 
133     /// True iff operation is one of 'S', 'H'
134     bool is_clipping() @property const nothrow @nogc {
135         return ((raw & 0xF) >> 1) == 2; // 4 or 5
136     }
137 
138     void toSam(Sink)(auto ref Sink sink) const
139         if (isSomeSink!Sink)
140     {
141         sink.write(length);
142         sink.write(type);
143     }
144 
145     void toString(scope void delegate(const(char)[]) sink) const {
146         toSam(sink);
147     }
148 }
149 
150 alias CigarOperation[] CigarOperations;
151 
152 bool is_unavailable(CigarOperations cigar) @property nothrow @nogc {
153   return (cigar.length == 1 && cigar[0].raw == '*');
154 }
155 
156 /// Forward range of extended CIGAR operations, with =/X instead of M
157 /// Useful for, e.g., detecting positions of mismatches.
158 struct ExtendedCigarRange(CigarOpRange, MdOpRange) {
159     static assert(isInputRange!CigarOpRange && is(Unqual!(ElementType!CigarOpRange) == CigarOperation));
160     static assert(isInputRange!MdOpRange && is(Unqual!(ElementType!MdOpRange) == MdOperation));
161 
162     private {
163         CigarOpRange _cigar;
164         MdOpRange _md_ops;
165         CigarOperation _front_cigar_op;
166         MdOperation _front_md_op;
167         uint _n_mismatches;
168         bool _empty;
169     }
170 
171     ///
172     this(CigarOpRange cigar, MdOpRange md_ops) {
173         _cigar = cigar;
174         _md_ops = md_ops;
175         fetchNextCigarOp();
176         fetchNextMdOp();
177     }
178 
179     /// Forward range primitives
180     bool empty() @property const {
181         return _empty;
182     }
183 
184     /// ditto
185     CigarOperation front() @property {
186         debug {
187             import std.stdio;
188             writeln(_front_cigar_op, " - ", _front_md_op);
189         }
190 
191         if (_front_cigar_op.type != 'M')
192             return _front_cigar_op;
193 
194         if (_n_mismatches == 0) {
195             assert(_front_md_op.is_match);
196             uint len = min(_front_md_op.match, _front_cigar_op.length);
197             return CigarOperation(len, '=');
198         }
199 
200         assert(_front_md_op.is_mismatch);
201         return CigarOperation(min(_n_mismatches, _front_cigar_op.length), 'X');
202     }
203 
204     /// ditto
205     ExtendedCigarRange save() @property {
206         typeof(return) r = void;
207         r._cigar = _cigar.save;
208         r._md_ops = _md_ops.save;
209         r._front_cigar_op = _front_cigar_op;
210         r._front_md_op = _front_md_op;
211         r._n_mismatches = _n_mismatches;
212         r._empty = _empty;
213         return r;
214     }
215 
216     /// ditto
217     void popFront() {
218         if (!_front_cigar_op.is_match_or_mismatch) {
219             if (_front_cigar_op.is_reference_consuming)
220                 fetchNextMdOp();
221             fetchNextCigarOp();
222             return;
223         }
224 
225         auto len = _front_cigar_op.length;
226         if (_n_mismatches > 0) {
227             enforce(_front_md_op.is_mismatch);
228 
229             if (len > _n_mismatches) {
230                 _front_cigar_op = CigarOperation(len - _n_mismatches, 'M');
231                 _n_mismatches = 0;
232                 fetchNextMdOp();
233             } else if (len < _n_mismatches) {
234                 _n_mismatches -= len;
235                 fetchNextCigarOp();
236             } else {
237                 fetchNextCigarOp();
238                 fetchNextMdOp();
239             }
240         } else {
241             enforce(_front_md_op.is_match);
242             auto n_matches = _front_md_op.match;
243 
244             if (len > n_matches) {
245                 _front_cigar_op = CigarOperation(len - n_matches, 'M');
246                 fetchNextMdOp();
247             } else if (len < n_matches) {
248                 _front_md_op.match -= len;
249                 fetchNextCigarOp();
250             } else {
251                 fetchNextCigarOp();
252                 fetchNextMdOp();
253             }
254         }
255     }
256 
257     private {
258         void fetchNextCigarOp() {
259             if (_cigar.empty) {
260                 _empty = true;
261                 return;
262             }
263 
264             _front_cigar_op = _cigar.front;
265             _cigar.popFront();
266         }
267 
268         void fetchNextMdOp() {
269             if (_md_ops.empty)
270                 return;
271 
272             _n_mismatches = 0;
273 
274             _front_md_op = _md_ops.front;
275             _md_ops.popFront();
276 
277             if (_front_md_op.is_mismatch) {
278                 _n_mismatches = 1;
279                 while (!_md_ops.empty && _md_ops.front.is_mismatch) {
280                     _md_ops.popFront();
281                     _n_mismatches += 1;
282                 }
283             }
284         }
285     }
286 }
287 
288 auto makeExtendedCigar(CigarOpRange, MdOpRange)(CigarOpRange cigar, MdOpRange md_ops) {
289     return ExtendedCigarRange!(CigarOpRange, MdOpRange)(cigar, md_ops);
290 }