1 /*
2     This file is part of BioD.
3     Copyright (C) 2012    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11 
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14 
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 module bio.std.hts.bam.md.reconstruct;
25 
26 import bio.std.hts.bam.cigar;
27 import bio.std.hts.bam.read;
28 import bio.std.hts.bam.md.core;
29 
30 import std.conv;
31 import std.range;
32 import std.traits;
33 import std.algorithm;
34 import std.range;
35 
36 /// Reconstruct read DNA.
37 /// Returns lazy sequence.
38 auto dna(T)(T read)
39     if(isBamRead!(Unqual!T))
40 {
41 
42     debug {
43         /*
44         import std.stdio;
45         stderr.writeln("[dna] processing read ", read.name);
46         stderr.flush();
47         */
48     }
49 
50     static struct QueryChunk(S) {
51         S sequence;
52         CigarOperation operation;
53     }
54 
55     static struct QueryChunksResult(R, S) {
56         this(R ops, S seq) {
57             _seq = seq;
58             _ops = ops;
59         }
60 
61         auto front() @property {
62             auto op = _ops.front;
63             return QueryChunk!S(_seq[0 .. op.length], op);
64         }
65 
66         bool empty() @property {
67             return _ops.empty;
68         }
69 
70         void popFront() {
71             _seq = _seq[_ops.front.length .. _seq.length];
72             _ops.popFront();
73         }
74 
75         private R _ops;
76         private S _seq;
77     }
78 
79     static auto getQueryChunksResult(R, S)(S sequence, R cigar) {
80         return QueryChunksResult!(R, S)(cigar, sequence);
81     }
82 
83     // Get read sequence chunks corresponding to query-consuming operations in read.sequence
84     static auto queryChunks(ref T read) {
85 
86 
87         return getQueryChunksResult(read.sequence, filter!"a.is_query_consuming"(read.cigar));
88     }
89 
90     auto _read = read;
91 
92     auto query_chunks = queryChunks(_read);
93 
94     static struct Result(R, M) {
95         this(ref T read, R query_sequence, M md_operations) {
96             debug {
97                 _initial_qseq = to!string(query_sequence);
98             }
99             _qseq = query_sequence;
100             _md = md_operations;
101             _fetchNextMdOperation();
102         }
103 
104         bool empty() @property {
105             return _empty;
106         }
107 
108         /*
109         MD operations -> match(N)    ? consume N characters from query
110                          mismatch(C) ? consume a character from query and replace it with C
111                          deletion(S) ? consume S from MD
112         */
113 
114         char front() @property {
115             final switch (_cur_md_op.type) {
116                 case MdOperationType.Match:
117                     return cast(char)_qseq.front;
118                 case MdOperationType.Mismatch:
119                     return _cur_md_op.mismatch;
120                 case MdOperationType.Deletion:
121                     return cast(char)_cur_md_op.deletion.front;
122             }
123         }
124 
125         private void _fetchNextMdOperation() {
126             if (_md.empty) {
127                 _empty = true;
128                 return;
129             }
130             _cur_md_op = _md.front;
131             _md.popFront();
132         }
133 
134         private bool _qseqIsSuddenlyEmpty() {
135             if (!_qseq.empty) {
136                 return false;
137             }
138 
139             /* MD and CIGAR don't match */
140             debug {
141                 import std.stdio;
142                 stderr.writeln("Current MD operation: ", _cur_md_op);
143                 stderr.writeln("Query sequence: ", _initial_qseq);
144             }
145 
146             return true;
147         }
148 
149         void popFront() {
150             final switch (_cur_md_op.type) {
151                 case MdOperationType.Mismatch:
152                     if (_qseqIsSuddenlyEmpty())
153                         break;
154                     _qseq.popFront();
155                     _fetchNextMdOperation();
156                     break;
157                 case MdOperationType.Match:
158                     if (_qseqIsSuddenlyEmpty())
159                         break;
160                     --_cur_md_op.match;
161                     _qseq.popFront();
162                     if (_cur_md_op.match == 0) {
163                         _fetchNextMdOperation();
164                     }
165                     break;
166                 case MdOperationType.Deletion:
167                     _cur_md_op.deletion.popFront();
168                     if (_cur_md_op.deletion.empty) {
169                         _fetchNextMdOperation();
170                     }
171                     break;
172             }
173         }
174 
175         private {
176             debug {
177                 string _initial_qseq;
178             }
179             R _qseq;
180             M _md;
181 
182             bool _empty;
183             MdOperation _cur_md_op;
184         }
185     }
186 
187     auto md = _read["MD"];
188     string md_str;
189     if (!md.is_nothing) {
190         md_str = cast(string)_read["MD"];
191     }
192 
193     static auto getResult(R, M)(ref T read, R query, M md_ops) {
194         return Result!(R, M)(read, query, md_ops);
195     }
196 
197     auto result =  getResult(_read,
198                              joiner(map!"a.sequence"(filter!"a.operation.is_reference_consuming"(query_chunks))),
199                              mdOperations(md_str));
200 
201     debug {
202         import std.stdio;
203         if (result.empty) {
204             stderr.writeln("[dna] empty DNA!");
205             stderr.writeln("      read name: ", read.name);
206             stderr.writeln("      read sequence: ", read.sequence);
207             stderr.writeln("      read CIGAR: ", read.cigarString());
208             stderr.writeln("      read MD tag: ", read["MD"]);
209             stderr.flush();
210         }
211     }
212 
213     return result;
214 }
215 
216 unittest {
217 
218     import std.stdio;
219     // stderr.writeln("Testing reconstruction of reference from MD tags and CIGAR");
220 
221     // Test reference reconstruction from MD and CIGAR.
222     // (Tests are taken from http://davetang.org/muse/2011/01/28/perl-and-sam/)
223 
224     BamRead read;
225 
226     read = BamRead("r1",
227                    "CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG",
228                    [CigarOperation(36, 'M')]);
229     read["MD"] = "1A0C0C0C1T0C0T27";
230 
231     assert(equal(dna(read), "CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG"));
232 
233     read = BamRead("r2",
234                    "GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT",
235                    [CigarOperation(6, 'M'),
236                     CigarOperation(1, 'I'),
237                     CigarOperation(29, 'M')]);
238     read["MD"] = "0C1C0C1C0T0C27";
239 
240     assert(equal(dna(read), "CACCCCTCTGACATCCGGCCTGCTCCTTCTCACAT"));
241 
242     read = BamRead("r3",
243                    "AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC",
244                    [CigarOperation(9, 'M'),
245                     CigarOperation(9, 'D'),
246                     CigarOperation(27, 'M')]);
247     read["MD"] = "2G0A5^ATGATGTCA27";
248     assert(equal(dna(read), "AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC"));
249 
250     read = BamRead("r4",
251                    "AGTGATGGGAGGATGTCTCGTCTGTGAGTTACAGCA",
252                    [CigarOperation(2, 'M'),
253                     CigarOperation(1, 'I'),
254                     CigarOperation(7, 'M'),
255                     CigarOperation(6, 'D'),
256                     CigarOperation(26, 'M')]);
257     read["MD"] = "3C3T1^GCTCAG26";
258     assert(equal(dna(read), "AGGCTGGTAGCTCAGGGATGTCTCGTCTGTGAGTTACAGCA"));
259 
260 }
261 
262 /**
263  * Returns lazy sequence of reference bases. If some bases can't be determined from reads,
264  * they are replaced with 'N'.
265  *
266  * Reads must be a range of reads aligned to the same reference sequence, sorted by leftmost
267  * coordinate.
268  * Returned reference bases start from the leftmost position of the first read,
269  * and end at the rightmost position of all the reads.
270  */
271 auto dna(R)(R reads)
272     if (isInputRange!R && isBamRead!(Unqual!(ElementType!R)))
273 {
274     static struct Result(F) {
275         alias Unqual!(ElementType!F) Read;
276 
277         this(F reads) {
278             _reads = reads;
279             if (_reads.empty) {
280                 _empty = true;
281                 return;
282             }
283             auto read = _reads.front;
284             _chunk = dna(read);
285             _reference_pos = read.position;
286             _reads.popFront();
287         }
288 
289         @property bool empty() {
290             return _empty;
291         }
292 
293         @property char front() {
294             if (_bases_to_skip > 0) {
295                 return 'N';
296             }
297             return _chunk.front;
298         }
299 
300         private void setSkipMode(ref Read read) {
301             _reads.popFront();
302             _chunk = dna(read);
303             _bases_to_skip = read.position - _reference_pos;
304         }
305 
306         void popFront() {
307             _reference_pos += 1;
308 
309             if (_bases_to_skip > 0) {
310                 --_bases_to_skip;
311                 return;
312             }
313 
314             _chunk.popFront();
315 
316             /*
317              * If current chunk is empty, get the next one.
318              *
319              * Here's the reference:
320              * .........................*.......................................
321              *                          _reference_pos (we are here)
322              * Last chunk ended just now:
323              *              [..........]
324              * Go through subsequent reads while their leftmost position is
325              * less or equal to _reference_pos, select the one which covers
326              * more bases to the right of _reference_pos.
327              *               [...............]
328              *                [....]
329              *                  [..........]
330              *                        [.........]  <- this one is the best
331              */
332             if (_chunk.empty) {
333                 if (_reads.empty) {
334                     _empty = true;
335                     return;
336                 }
337                 auto next_read = _reads.front;
338                 if (next_read.position > _reference_pos) {
339                     setSkipMode(next_read);
340                     return;
341                 }
342                 auto best_read = next_read;
343                 // read covers half-open [position .. position + basesCovered) interval
344                 auto best_end_pos = best_read.basesCovered() + best_read.position;
345                 bool found_good = best_end_pos > _reference_pos;
346                 while (true) {
347                     if (_reads.empty) {
348                         if (!found_good) {
349                             _empty = true;
350                             return;
351                         }
352                         break;
353                     }
354 
355                     auto read = _reads.front;
356 
357                     if (read.position > _reference_pos) {
358                         if (!found_good) {
359                             setSkipMode(read);
360                             return;
361                         }
362                         break;
363                     }
364 
365                     auto end_pos = read.basesCovered() + read.position;
366                     if (end_pos > _reference_pos) {
367                         found_good = true;
368                         if (end_pos > best_end_pos) {
369                             best_end_pos = end_pos;
370                             best_read = read;
371                         }
372                     }
373                     _reads.popFront();
374                 }
375 
376                 // If we're here, we've found a good read.
377                 _chunk = dna(best_read);
378                 debug {
379                     /*
380                     import std.stdio;
381                     writeln("_reference_pos = ", _reference_pos,
382                             "; best_read.position = ", best_read.position,
383                             "; _chunk length = ", best_read.basesCovered());
384                             */
385                 }
386                 // However, we need to strip some bases from the left.
387                 popFrontN(_chunk, _reference_pos - best_read.position);
388             }
389         }
390 
391         private size_t _bases_to_skip;
392         private size_t _reference_pos;
393         private ReturnType!(dna!Read) _chunk;
394         private bool _empty = false;
395         private F _reads;
396     }
397 
398     auto nonempty = filter!"a.basesCovered() > 0"(reads);
399     return Result!(typeof(nonempty))(nonempty);
400 }
401 
402 unittest {
403 
404     // reads are taken from HG00110.chrom20.ILLUMINA.bwa.GBR.exome.20111114.bam
405 
406     auto r1 = BamRead("r1",
407                       "AGGTTTTGTGAGTGGGACAGTTGCAGCAAAACACAACCATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTA",
408                       [CigarOperation(89, 'M'),
409                        CigarOperation(1, 'S')]);
410     r1.position = 60246;
411     r1["MD"] = "89";
412 
413     auto r2 = BamRead("r2",
414                       "TGTGAGTGGGACAGTTGCAGCAAAACACAACCATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCCAGCCCTT",
415                       [CigarOperation(83, 'M'),
416                        CigarOperation(7, 'S')]);
417     r2.position = 60252;
418     r2["MD"] = "82T0";
419 
420     auto r3 = BamRead("r3",
421                       "CATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTAGCCCTTGCTGACTGCTGGGCAGGGAGAGAGCAGAGCT",
422                       [CigarOperation(90, 'M')]);
423     r3.position = 60283;
424     r3["MD"] = "90";
425 
426     auto r4 = BamRead("r4",
427                       "CCCTTGCTGACTGCTGGGCAGGGAGAGAGCAGAGCTAACTTCCTCATGGGACCTGGGTGTGTCTGATCTGTGCACACCACTATCCAACCG",
428                       [CigarOperation(90, 'M')]);
429     r4.position = 60337;
430     r4["MD"] = "90";
431 
432     auto r5 = BamRead("r5",
433                       "GAGGCTCCACCCTGGCCACTCTTGTGTGCACACAGCACAGCCTCTACTGCTACACCTGAGTACTTTGCCAGTGGCCTGGAAGCACTTTGT",
434                       [CigarOperation(90, 'M')]);
435     r5.position = 60432;
436     r5["MD"] = "90";
437 
438     auto reads = [r1, r2, r3, r4, r5];
439     assert(equal(dna(reads), "AGGTTTTGTGAGTGGGACAGTTGCAGCAAAACACAACCATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTAGCCCTTGCTGACTGCTGGGCAGGGAGAGAGCAGAGCTAACTTCCTCATGGGACCTGGGTGTGTCTGATCTGTGCACACCACTATCCAACCGNNNNNGAGGCTCCACCCTGGCCACTCTTGTGTGCACACAGCACAGCCTCTACTGCTACACCTGAGTACTTTGCCAGTGGCCTGGAAGCACTTTGT"));
440 }