1 /* 2 This file is part of BioD. 3 Copyright (C) 2012-2016 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 25 module bio.std.hts.bam.cigar; 26 27 import std.algorithm; 28 import std.range; 29 import std.conv; 30 import std.format; 31 import std.exception; 32 import std.system; 33 import std.traits; 34 import std.array; 35 import std.bitmanip; 36 import core.stdc.stdlib; 37 38 import bio.core.base; 39 import bio.core.utils.format; 40 41 import bio.std.hts.bam.abstractreader; 42 43 import bio.std.hts.bam.writer; 44 import bio.std.hts.bam.tagvalue; 45 import bio.std.hts.bam.bai.bin; 46 47 import bio.std.hts.bam.md.core; 48 49 import bio.std.hts.utils.array; 50 import bio.std.hts.utils.value; 51 import bio.core.utils.switchendianness; 52 53 import bio.std.hts.thirdparty.msgpack : Packer, unpack; 54 55 /** 56 Represents single CIGAR operation 57 */ 58 struct CigarOperation { 59 static assert(CigarOperation.sizeof == uint.sizeof); 60 /* 61 WARNING! 62 63 It is very essential that the size of 64 this struct is EXACTLY equal to uint.sizeof! 65 66 The reason is to avoid copying of arrays during alignment parsing. 67 68 Namely, when some_pointer points to raw cigar data, 69 we can just do a cast. This allows to access those data 70 directly, not doing any memory allocations. 71 */ 72 73 private uint raw; // raw data from BAM 74 75 private static ubyte char2op(char c) { 76 switch(c) { 77 case 'M': return 0; 78 case 'I': return 1; 79 case 'D': return 2; 80 case 'N': return 3; 81 case 'S': return 4; 82 case 'H': return 5; 83 case 'P': return 6; 84 case '=': return 7; 85 case 'X': return 8; 86 default: return 15; // 15 is used as invalid value 87 } 88 } 89 90 /// Length must be strictly less than 2^28. 91 /// $(BR) 92 /// Operation type must be one of M, I, D, N, S, H, P, =, X. 93 this(uint length, char operation_type) { 94 enforce(length < (1<<28), "Too big length of CIGAR operation"); 95 raw = (length << 4) | char2op(operation_type); 96 } 97 98 this(uint _raw) { 99 raw = _raw; 100 } 101 102 /// Operation length 103 uint length() @property const nothrow @nogc { 104 return raw >> 4; 105 } 106 107 /// CIGAR operation as one of MIDNSHP=X. 108 /// Absent or invalid operation is represented by '?' 109 char type() @property const nothrow @nogc { 110 return "MIDNSHP=X????????"[raw & 0xF]; 111 } 112 113 // Each pair of bits has first bit set iff the operation is query consuming, 114 // and second bit set iff it is reference consuming. 115 // X = P H S N D I M 116 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11; 117 118 /// True iff operation is one of M, =, X, I, S 119 bool is_query_consuming() @property const nothrow @nogc { 120 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0; 121 } 122 123 /// True iff operation is one of M, =, X, D, N 124 bool is_reference_consuming() @property const nothrow @nogc { 125 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0; 126 } 127 128 /// True iff operation is one of M, =, X 129 bool is_match_or_mismatch() @property const nothrow @nogc { 130 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3; 131 } 132 133 /// True iff operation is one of 'S', 'H' 134 bool is_clipping() @property const nothrow @nogc { 135 return ((raw & 0xF) >> 1) == 2; // 4 or 5 136 } 137 138 void toSam(Sink)(auto ref Sink sink) const 139 if (isSomeSink!Sink) 140 { 141 sink.write(length); 142 sink.write(type); 143 } 144 145 void toString(scope void delegate(const(char)[]) sink) const { 146 toSam(sink); 147 } 148 } 149 150 alias CigarOperation[] CigarOperations; 151 152 bool is_unavailable(CigarOperations cigar) @property nothrow @nogc { 153 return (cigar.length == 1 && cigar[0].raw == '*'); 154 } 155 156 /// Forward range of extended CIGAR operations, with =/X instead of M 157 /// Useful for, e.g., detecting positions of mismatches. 158 struct ExtendedCigarRange(CigarOpRange, MdOpRange) { 159 static assert(isInputRange!CigarOpRange && is(Unqual!(ElementType!CigarOpRange) == CigarOperation)); 160 static assert(isInputRange!MdOpRange && is(Unqual!(ElementType!MdOpRange) == MdOperation)); 161 162 private { 163 CigarOpRange _cigar; 164 MdOpRange _md_ops; 165 CigarOperation _front_cigar_op; 166 MdOperation _front_md_op; 167 uint _n_mismatches; 168 bool _empty; 169 } 170 171 /// 172 this(CigarOpRange cigar, MdOpRange md_ops) { 173 _cigar = cigar; 174 _md_ops = md_ops; 175 fetchNextCigarOp(); 176 fetchNextMdOp(); 177 } 178 179 /// Forward range primitives 180 bool empty() @property const { 181 return _empty; 182 } 183 184 /// ditto 185 CigarOperation front() @property { 186 debug { 187 import std.stdio; 188 writeln(_front_cigar_op, " - ", _front_md_op); 189 } 190 191 if (_front_cigar_op.type != 'M') 192 return _front_cigar_op; 193 194 if (_n_mismatches == 0) { 195 assert(_front_md_op.is_match); 196 uint len = min(_front_md_op.match, _front_cigar_op.length); 197 return CigarOperation(len, '='); 198 } 199 200 assert(_front_md_op.is_mismatch); 201 return CigarOperation(min(_n_mismatches, _front_cigar_op.length), 'X'); 202 } 203 204 /// ditto 205 ExtendedCigarRange save() @property { 206 typeof(return) r = void; 207 r._cigar = _cigar.save; 208 r._md_ops = _md_ops.save; 209 r._front_cigar_op = _front_cigar_op; 210 r._front_md_op = _front_md_op; 211 r._n_mismatches = _n_mismatches; 212 r._empty = _empty; 213 return r; 214 } 215 216 /// ditto 217 void popFront() { 218 if (!_front_cigar_op.is_match_or_mismatch) { 219 if (_front_cigar_op.is_reference_consuming) 220 fetchNextMdOp(); 221 fetchNextCigarOp(); 222 return; 223 } 224 225 auto len = _front_cigar_op.length; 226 if (_n_mismatches > 0) { 227 enforce(_front_md_op.is_mismatch); 228 229 if (len > _n_mismatches) { 230 _front_cigar_op = CigarOperation(len - _n_mismatches, 'M'); 231 _n_mismatches = 0; 232 fetchNextMdOp(); 233 } else if (len < _n_mismatches) { 234 _n_mismatches -= len; 235 fetchNextCigarOp(); 236 } else { 237 fetchNextCigarOp(); 238 fetchNextMdOp(); 239 } 240 } else { 241 enforce(_front_md_op.is_match); 242 auto n_matches = _front_md_op.match; 243 244 if (len > n_matches) { 245 _front_cigar_op = CigarOperation(len - n_matches, 'M'); 246 fetchNextMdOp(); 247 } else if (len < n_matches) { 248 _front_md_op.match -= len; 249 fetchNextCigarOp(); 250 } else { 251 fetchNextCigarOp(); 252 fetchNextMdOp(); 253 } 254 } 255 } 256 257 private { 258 void fetchNextCigarOp() { 259 if (_cigar.empty) { 260 _empty = true; 261 return; 262 } 263 264 _front_cigar_op = _cigar.front; 265 _cigar.popFront(); 266 } 267 268 void fetchNextMdOp() { 269 if (_md_ops.empty) 270 return; 271 272 _n_mismatches = 0; 273 274 _front_md_op = _md_ops.front; 275 _md_ops.popFront(); 276 277 if (_front_md_op.is_mismatch) { 278 _n_mismatches = 1; 279 while (!_md_ops.empty && _md_ops.front.is_mismatch) { 280 _md_ops.popFront(); 281 _n_mismatches += 1; 282 } 283 } 284 } 285 } 286 } 287 288 auto makeExtendedCigar(CigarOpRange, MdOpRange)(CigarOpRange cigar, MdOpRange md_ops) { 289 return ExtendedCigarRange!(CigarOpRange, MdOpRange)(cigar, md_ops); 290 }