1 /* 2 This file is part of BioD. 3 Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 module bio.std.hts.bam.md.reconstruct; 25 26 import bio.std.hts.bam.cigar; 27 import bio.std.hts.bam.read; 28 import bio.std.hts.bam.md.core; 29 30 import std.conv; 31 import std.range; 32 import std.traits; 33 import std.algorithm; 34 import std.range; 35 36 /// Reconstruct read DNA. 37 /// Returns lazy sequence. 38 auto dna(T)(T read) 39 if(isBamRead!(Unqual!T)) 40 { 41 42 debug { 43 /* 44 import std.stdio; 45 stderr.writeln("[dna] processing read ", read.name); 46 stderr.flush(); 47 */ 48 } 49 50 static struct QueryChunk(S) { 51 S sequence; 52 CigarOperation operation; 53 } 54 55 static struct QueryChunksResult(R, S) { 56 this(R ops, S seq) { 57 _seq = seq; 58 _ops = ops; 59 } 60 61 auto front() @property { 62 auto op = _ops.front; 63 return QueryChunk!S(_seq[0 .. op.length], op); 64 } 65 66 bool empty() @property { 67 return _ops.empty; 68 } 69 70 void popFront() { 71 _seq = _seq[_ops.front.length .. _seq.length]; 72 _ops.popFront(); 73 } 74 75 private R _ops; 76 private S _seq; 77 } 78 79 static auto getQueryChunksResult(R, S)(S sequence, R cigar) { 80 return QueryChunksResult!(R, S)(cigar, sequence); 81 } 82 83 // Get read sequence chunks corresponding to query-consuming operations in read.sequence 84 static auto queryChunks(ref T read) { 85 86 87 return getQueryChunksResult(read.sequence, filter!"a.is_query_consuming"(read.cigar)); 88 } 89 90 auto _read = read; 91 92 auto query_chunks = queryChunks(_read); 93 94 static struct Result(R, M) { 95 this(ref T read, R query_sequence, M md_operations) { 96 debug { 97 _initial_qseq = to!string(query_sequence); 98 } 99 _qseq = query_sequence; 100 _md = md_operations; 101 _fetchNextMdOperation(); 102 } 103 104 bool empty() @property { 105 return _empty; 106 } 107 108 /* 109 MD operations -> match(N) ? consume N characters from query 110 mismatch(C) ? consume a character from query and replace it with C 111 deletion(S) ? consume S from MD 112 */ 113 114 char front() @property { 115 final switch (_cur_md_op.type) { 116 case MdOperationType.Match: 117 return cast(char)_qseq.front; 118 case MdOperationType.Mismatch: 119 return _cur_md_op.mismatch; 120 case MdOperationType.Deletion: 121 return cast(char)_cur_md_op.deletion.front; 122 } 123 } 124 125 private void _fetchNextMdOperation() { 126 if (_md.empty) { 127 _empty = true; 128 return; 129 } 130 _cur_md_op = _md.front; 131 _md.popFront(); 132 } 133 134 private bool _qseqIsSuddenlyEmpty() { 135 if (!_qseq.empty) { 136 return false; 137 } 138 139 /* MD and CIGAR don't match */ 140 debug { 141 import std.stdio; 142 stderr.writeln("Current MD operation: ", _cur_md_op); 143 stderr.writeln("Query sequence: ", _initial_qseq); 144 } 145 146 return true; 147 } 148 149 void popFront() { 150 final switch (_cur_md_op.type) { 151 case MdOperationType.Mismatch: 152 if (_qseqIsSuddenlyEmpty()) 153 break; 154 _qseq.popFront(); 155 _fetchNextMdOperation(); 156 break; 157 case MdOperationType.Match: 158 if (_qseqIsSuddenlyEmpty()) 159 break; 160 --_cur_md_op.match; 161 _qseq.popFront(); 162 if (_cur_md_op.match == 0) { 163 _fetchNextMdOperation(); 164 } 165 break; 166 case MdOperationType.Deletion: 167 _cur_md_op.deletion.popFront(); 168 if (_cur_md_op.deletion.empty) { 169 _fetchNextMdOperation(); 170 } 171 break; 172 } 173 } 174 175 private { 176 debug { 177 string _initial_qseq; 178 } 179 R _qseq; 180 M _md; 181 182 bool _empty; 183 MdOperation _cur_md_op; 184 } 185 } 186 187 auto md = _read["MD"]; 188 string md_str; 189 if (!md.is_nothing) { 190 md_str = cast(string)_read["MD"]; 191 } 192 193 static auto getResult(R, M)(ref T read, R query, M md_ops) { 194 return Result!(R, M)(read, query, md_ops); 195 } 196 197 auto result = getResult(_read, 198 joiner(map!"a.sequence"(filter!"a.operation.is_reference_consuming"(query_chunks))), 199 mdOperations(md_str)); 200 201 debug { 202 import std.stdio; 203 if (result.empty) { 204 stderr.writeln("[dna] empty DNA!"); 205 stderr.writeln(" read name: ", read.name); 206 stderr.writeln(" read sequence: ", read.sequence); 207 stderr.writeln(" read CIGAR: ", read.cigarString()); 208 stderr.writeln(" read MD tag: ", read["MD"]); 209 stderr.flush(); 210 } 211 } 212 213 return result; 214 } 215 216 unittest { 217 218 import std.stdio; 219 // stderr.writeln("Testing reconstruction of reference from MD tags and CIGAR"); 220 221 // Test reference reconstruction from MD and CIGAR. 222 // (Tests are taken from http://davetang.org/muse/2011/01/28/perl-and-sam/) 223 224 BamRead read; 225 226 read = BamRead("r1", 227 "CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG", 228 [CigarOperation(36, 'M')]); 229 read["MD"] = "1A0C0C0C1T0C0T27"; 230 231 assert(equal(dna(read), "CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG")); 232 233 read = BamRead("r2", 234 "GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT", 235 [CigarOperation(6, 'M'), 236 CigarOperation(1, 'I'), 237 CigarOperation(29, 'M')]); 238 read["MD"] = "0C1C0C1C0T0C27"; 239 240 assert(equal(dna(read), "CACCCCTCTGACATCCGGCCTGCTCCTTCTCACAT")); 241 242 read = BamRead("r3", 243 "AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC", 244 [CigarOperation(9, 'M'), 245 CigarOperation(9, 'D'), 246 CigarOperation(27, 'M')]); 247 read["MD"] = "2G0A5^ATGATGTCA27"; 248 assert(equal(dna(read), "AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC")); 249 250 read = BamRead("r4", 251 "AGTGATGGGAGGATGTCTCGTCTGTGAGTTACAGCA", 252 [CigarOperation(2, 'M'), 253 CigarOperation(1, 'I'), 254 CigarOperation(7, 'M'), 255 CigarOperation(6, 'D'), 256 CigarOperation(26, 'M')]); 257 read["MD"] = "3C3T1^GCTCAG26"; 258 assert(equal(dna(read), "AGGCTGGTAGCTCAGGGATGTCTCGTCTGTGAGTTACAGCA")); 259 260 } 261 262 /** 263 * Returns lazy sequence of reference bases. If some bases can't be determined from reads, 264 * they are replaced with 'N'. 265 * 266 * Reads must be a range of reads aligned to the same reference sequence, sorted by leftmost 267 * coordinate. 268 * Returned reference bases start from the leftmost position of the first read, 269 * and end at the rightmost position of all the reads. 270 */ 271 auto dna(R)(R reads) 272 if (isInputRange!R && isBamRead!(Unqual!(ElementType!R))) 273 { 274 static struct Result(F) { 275 alias Unqual!(ElementType!F) Read; 276 277 this(F reads) { 278 _reads = reads; 279 if (_reads.empty) { 280 _empty = true; 281 return; 282 } 283 auto read = _reads.front; 284 _chunk = dna(read); 285 _reference_pos = read.position; 286 _reads.popFront(); 287 } 288 289 @property bool empty() { 290 return _empty; 291 } 292 293 @property char front() { 294 if (_bases_to_skip > 0) { 295 return 'N'; 296 } 297 return _chunk.front; 298 } 299 300 private void setSkipMode(ref Read read) { 301 _reads.popFront(); 302 _chunk = dna(read); 303 _bases_to_skip = read.position - _reference_pos; 304 } 305 306 void popFront() { 307 _reference_pos += 1; 308 309 if (_bases_to_skip > 0) { 310 --_bases_to_skip; 311 return; 312 } 313 314 _chunk.popFront(); 315 316 /* 317 * If current chunk is empty, get the next one. 318 * 319 * Here's the reference: 320 * .........................*....................................... 321 * _reference_pos (we are here) 322 * Last chunk ended just now: 323 * [..........] 324 * Go through subsequent reads while their leftmost position is 325 * less or equal to _reference_pos, select the one which covers 326 * more bases to the right of _reference_pos. 327 * [...............] 328 * [....] 329 * [..........] 330 * [.........] <- this one is the best 331 */ 332 if (_chunk.empty) { 333 if (_reads.empty) { 334 _empty = true; 335 return; 336 } 337 auto next_read = _reads.front; 338 if (next_read.position > _reference_pos) { 339 setSkipMode(next_read); 340 return; 341 } 342 auto best_read = next_read; 343 // read covers half-open [position .. position + basesCovered) interval 344 auto best_end_pos = best_read.basesCovered() + best_read.position; 345 bool found_good = best_end_pos > _reference_pos; 346 while (true) { 347 if (_reads.empty) { 348 if (!found_good) { 349 _empty = true; 350 return; 351 } 352 break; 353 } 354 355 auto read = _reads.front; 356 357 if (read.position > _reference_pos) { 358 if (!found_good) { 359 setSkipMode(read); 360 return; 361 } 362 break; 363 } 364 365 auto end_pos = read.basesCovered() + read.position; 366 if (end_pos > _reference_pos) { 367 found_good = true; 368 if (end_pos > best_end_pos) { 369 best_end_pos = end_pos; 370 best_read = read; 371 } 372 } 373 _reads.popFront(); 374 } 375 376 // If we're here, we've found a good read. 377 _chunk = dna(best_read); 378 debug { 379 /* 380 import std.stdio; 381 writeln("_reference_pos = ", _reference_pos, 382 "; best_read.position = ", best_read.position, 383 "; _chunk length = ", best_read.basesCovered()); 384 */ 385 } 386 // However, we need to strip some bases from the left. 387 popFrontN(_chunk, _reference_pos - best_read.position); 388 } 389 } 390 391 private size_t _bases_to_skip; 392 private size_t _reference_pos; 393 private ReturnType!(dna!Read) _chunk; 394 private bool _empty = false; 395 private F _reads; 396 } 397 398 auto nonempty = filter!"a.basesCovered() > 0"(reads); 399 return Result!(typeof(nonempty))(nonempty); 400 } 401 402 unittest { 403 404 // reads are taken from HG00110.chrom20.ILLUMINA.bwa.GBR.exome.20111114.bam 405 406 auto r1 = BamRead("r1", 407 "AGGTTTTGTGAGTGGGACAGTTGCAGCAAAACACAACCATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTA", 408 [CigarOperation(89, 'M'), 409 CigarOperation(1, 'S')]); 410 r1.position = 60246; 411 r1["MD"] = "89"; 412 413 auto r2 = BamRead("r2", 414 "TGTGAGTGGGACAGTTGCAGCAAAACACAACCATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCCAGCCCTT", 415 [CigarOperation(83, 'M'), 416 CigarOperation(7, 'S')]); 417 r2.position = 60252; 418 r2["MD"] = "82T0"; 419 420 auto r3 = BamRead("r3", 421 "CATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTAGCCCTTGCTGACTGCTGGGCAGGGAGAGAGCAGAGCT", 422 [CigarOperation(90, 'M')]); 423 r3.position = 60283; 424 r3["MD"] = "90"; 425 426 auto r4 = BamRead("r4", 427 "CCCTTGCTGACTGCTGGGCAGGGAGAGAGCAGAGCTAACTTCCTCATGGGACCTGGGTGTGTCTGATCTGTGCACACCACTATCCAACCG", 428 [CigarOperation(90, 'M')]); 429 r4.position = 60337; 430 r4["MD"] = "90"; 431 432 auto r5 = BamRead("r5", 433 "GAGGCTCCACCCTGGCCACTCTTGTGTGCACACAGCACAGCCTCTACTGCTACACCTGAGTACTTTGCCAGTGGCCTGGAAGCACTTTGT", 434 [CigarOperation(90, 'M')]); 435 r5.position = 60432; 436 r5["MD"] = "90"; 437 438 auto reads = [r1, r2, r3, r4, r5]; 439 assert(equal(dna(reads), "AGGTTTTGTGAGTGGGACAGTTGCAGCAAAACACAACCATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTAGCCCTTGCTGACTGCTGGGCAGGGAGAGAGCAGAGCTAACTTCCTCATGGGACCTGGGTGTGTCTGATCTGTGCACACCACTATCCAACCGNNNNNGAGGCTCCACCCTGGCCACTCTTGTGTGCACACAGCACAGCCTCTACTGCTACACCTGAGTACTTTGCCAGTGGCCTGGAAGCACTTTGT")); 440 }