1 /*
2     This file is part of BioD.
3     Copyright (C) 2012    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11     
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14     
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 module bio.bam.iontorrent.flowcall;
25 
26 import bio.bam.tagvalue;
27 import bio.bam.iontorrent.flowindex;
28 
29 import bio.core.base;
30 import bio.core.utils.range;
31 
32 import std.array;
33 import std.typecons;
34 import std.range;
35 import std.algorithm;
36 import std.exception;
37 
38 /// Tag where flow signal intensities are stored
39 enum FlowGramTag : ubyte {
40     FZ,
41     ZM
42 }
43 
44 /// Scale of intensity values
45 float multiplier(FlowGramTag tag) {
46     return tag == FlowGramTag.FZ ? 100.0 : 256.0;
47 }
48 
49 /// Flow base call
50 struct FlowCall {
51     private {
52         short _signal_intensity;
53 
54         static assert(Base.ValueSetSize <= 16 && FlowGramTag.max < 16,
55                       "implementation of FlowCall should be changed?");
56 
57         ubyte _storage;  // tag in upper 4 bits, base in lower 4 bits
58 
59         Base _base() @property const {
60             return Base.fromInternalCode(_storage & 0xF);
61         }
62 
63         void _base(Base b) @property {
64             _storage &= 0xF0;
65             _storage |= b.internal_code;
66         }
67 
68         FlowGramTag _tag() @property const {
69             return cast(FlowGramTag)(_storage >> 4);
70         }
71 
72         void _tag(FlowGramTag tag) @property {
73             _storage &= 0xF;
74             _storage |= (cast(ubyte)tag << 4);
75         }
76 
77         this(short signal_intensity, Base b, FlowGramTag tag) {
78             _signal_intensity = signal_intensity;
79             _storage = cast(ubyte)(b.internal_code | (tag << 4));
80         }
81     }
82 
83     /// Nucleotide
84     Base base() @property const {
85         return _base;
86     }
87 
88     /// Signal intensity, normalized to homopolymer lengths
89     float intensity() @property const {
90         return _signal_intensity / multiplier(_tag);
91     }
92 
93     /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tag,
94     /// and 256.0 for ZM tag.
95     /// More efficient, because this is how intensities are stored in FZ/ZM tag.
96     short intensity_value() @property const {
97         return _signal_intensity;
98     }
99 }
100 
101 /// Flow call associated with a read
102 struct ReadFlowCall {
103     private {
104         FlowCall _fc;
105         ushort _offset;
106         ushort _called_len;
107         ushort _flow_index;
108 
109         this(Base b, short signal_intensity, ushort offset,
110                      ushort called, ushort flow_index, FlowGramTag tag)
111         {
112             _fc = FlowCall(signal_intensity, b, tag);
113             _offset = offset;
114             _called_len = called;
115             _flow_index = flow_index;
116         }
117     }
118 
119     /// Called nucleotide
120     Base base() @property const {
121         return _fc._base;
122     }
123 
124     /// Set base to its complement
125     void complement() {
126         _fc._base = _fc._base.complement;
127     }
128 
129     /// Called homopolymer length
130     ushort length() @property const {
131         return _called_len;
132     }
133 
134     /// Zero-based position of the first nucleotide in the run,      
135     /// relative to start of the read. Takes strandness into account.
136     ushort offset() @property const {
137         return _offset;
138     }
139 
140     /// Signal intensity, normalized to homopolymer lengths
141     float intensity() @property const {
142         return _fc.intensity;
143     }
144 
145     /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tags,
146     /// and 256.0 for ZM tags.
147     /// More efficient, because this is how intensities are stored in FZ/ZM tag.
148     short intensity_value() @property const {
149         return _fc._signal_intensity;
150     }
151 
152     /// Flow index (0-based)
153     size_t flow_index() @property const {
154         return _flow_index;
155     }
156 }
157 
158 /// Get flow calls from signal intensities and flow order.
159 auto flowCalls(short[] intensities, string flow_order, FlowGramTag tag) {
160     
161     static FlowCall flowCall(T)(T call) {
162         return FlowCall(call[0], Base(call[1]), call[2]);
163     }
164 
165     return map!flowCall(zip(intensities, flow_order, repeat(tag)));
166 }
167 
168 struct ReadFlowCallRange(S) 
169     if (!is(S == class))
170 {
171     private {
172         string _flow_order = void;
173         short[] _intensities = void;
174         bool _rev = void;
175         S _sequence = void;
176 
177         int _zf = void;
178         Base _current_base = void;
179         ushort _current_length = void;
180         size_t _current_flow_index;
181         ushort _current_offset;
182 
183         ushort _overlap = void;
184         
185         FlowGramTag _tag = void;
186 
187         bool _empty = false;
188 
189         // consumes next homopolymer from the sequence,
190         // and updates _current_base, _current_flow_index, 
191         // _current_length appropriately
192         void _doSetup() {
193             if (_sequence.empty) {
194                 _empty = true;
195                 return;
196             }
197 
198             _current_length = 1; 
199 
200             // setup current base and current length
201             if (!_rev) {
202                 _current_base = _sequence.front;
203                 _sequence.popFront();
204                 while (!_sequence.empty && _sequence.front == _current_base) {
205                     _sequence.popFront();
206                     ++_current_length;
207                 }
208             } else {
209                 _current_base = _sequence.back; // complement later
210                 _sequence.popBack();            // because of comparison below
211                 while (!_sequence.empty && _sequence.back == _current_base) {
212                     _sequence.popBack();
213                     ++_current_length;
214                 }
215                 _current_base = _current_base.complement;
216             }
217 
218             // setup current flow index
219             for ( ; _current_flow_index < _flow_order.length; ++_current_flow_index) {
220                 if (_flow_order[_current_flow_index] == _current_base) {
221                     break;
222                 }
223             }
224         }
225     }
226 
227     this(S seq, short[] intensities, bool reverse_strand, 
228          string flow_order, ushort first_base_overlap, int zf, FlowGramTag tag) 
229     {
230         _sequence = seq;
231         _intensities = intensities;
232         _rev = reverse_strand;
233         _flow_order = flow_order;
234         _zf = zf;
235         _overlap = first_base_overlap;
236         _tag = tag;
237 
238         if (_sequence.empty) {
239             _empty = true;
240         } else {
241             _doSetup();
242         }
243     }
244 
245     bool empty() @property const {
246         return _empty;
247     }
248 
249     ReadFlowCall front() @property const {
250         enforce(_current_flow_index < _intensities.length,
251                 "Inconsistency between FZ/ZM tag and read bases");
252 
253         auto intensity = cast(ushort)(_intensities[_current_flow_index] - _overlap);
254         ReadFlowCall rfc = void;
255         rfc._fc = FlowCall(intensity, _current_base, _tag);
256         rfc._offset = _current_offset;
257         rfc._called_len = _current_length;
258         rfc._flow_index = cast(ushort)(_current_flow_index + _zf);
259         return rfc;
260     }
261 
262     void popFront() {
263         _current_offset += _current_length;
264 
265         ++_current_flow_index;
266         _overlap = 0; // after first base it is always zero
267 
268         _doSetup();
269     }
270 
271     ReadFlowCallRange!S save() @property {
272         // bitwise copy
273         // FIXME: is it safe?
274         ReadFlowCallRange!S r = this;
275         return r;
276     }
277 }
278 
279 private ReadFlowCallRange!S readFlowCallRange(S)(S seq, short[] intensities, bool rev,
280                                                  string flow_order, ushort overlap, int zf,
281                                                  FlowGramTag tag)
282 {
283     return ReadFlowCallRange!S(seq, intensities, rev, flow_order, overlap, zf, tag);
284 }
285 
286 
287 /// Get read flow calls. Takes ZF tag and strandness into account.
288 ///
289 /// Tag name is an optional argument because it is not standard and will likely
290 /// be changed in the future (there was a proposal on samtools mailing list
291 /// to introduce standard FB tag).
292 auto readFlowCalls(R)(R read, string flow_order, string key_sequence, string tag="ZF") {
293 
294     auto zf = cast(int)read[tag];
295     auto fz_value = read["FZ"];
296     auto zm_value = read["ZM"];
297 
298     enforce(!(fz_value.is_nothing && zm_value.is_nothing),
299             "Neither FZ nor ZM tag is presented in a mapped read");
300 
301     auto fg_tag = fz_value.is_nothing ? FlowGramTag.ZM : FlowGramTag.FZ;
302 
303     short[] flow_int = *cast(short[]*)(fg_tag == FlowGramTag.ZM ? &zm_value : &fz_value);
304 
305     flow_order = flow_order[zf .. $];
306     auto intensities = flow_int[zf .. $];
307 
308     // key sequence is required because its last base can overlap with first called base
309     ushort overlap = 0;
310 
311     Base5 base = read.is_reverse_strand ? read.sequence.back.complement : read.sequence.front;
312     foreach_reverse (c; key_sequence) {
313         if (c != base)
314             break;
315         overlap += cast(int)(multiplier(fg_tag));
316     }
317 
318     return readFlowCallRange(read.sequence, intensities, read.is_reverse_strand,
319                              flow_order, overlap, zf, fg_tag);
320 }