1 /*
2     This file is part of BioD.
3     Copyright (C) 2012    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11     
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14     
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 module bio.std.hts.iontorrent.flowcall;
25 import bio.std.hts.bam.tagvalue;
26 import bio.std.hts.iontorrent.flowindex;
27 
28 import bio.core.base;
29 import bio.core.utils.range;
30 
31 import std.array;
32 import std.typecons;
33 import std.range;
34 import std.algorithm;
35 import std.exception;
36 
37 /// Tag where flow signal intensities are stored
38 enum FlowGramTag : ubyte {
39     FZ,
40     ZM
41 }
42 
43 /// Scale of intensity values
44 float multiplier(FlowGramTag tag) {
45     return tag == FlowGramTag.FZ ? 100.0 : 256.0;
46 }
47 
48 /// Flow base call
49 struct FlowCall {
50     private {
51         short _signal_intensity;
52 
53         static assert(Base.ValueSetSize <= 16 && FlowGramTag.max < 16,
54                       "implementation of FlowCall should be changed?");
55 
56         ubyte _storage;  // tag in upper 4 bits, base in lower 4 bits
57 
58         Base _base() @property const {
59             return Base.fromInternalCode(_storage & 0xF);
60         }
61 
62         void _base(Base b) @property {
63             _storage &= 0xF0;
64             _storage |= b.internal_code;
65         }
66 
67         FlowGramTag _tag() @property const {
68             return cast(FlowGramTag)(_storage >> 4);
69         }
70 
71         void _tag(FlowGramTag tag) @property {
72             _storage &= 0xF;
73             _storage |= (cast(ubyte)tag << 4);
74         }
75 
76         this(short signal_intensity, Base b, FlowGramTag tag) {
77             _signal_intensity = signal_intensity;
78             _storage = cast(ubyte)(b.internal_code | (tag << 4));
79         }
80     }
81 
82     /// Nucleotide
83     Base base() @property const {
84         return _base;
85     }
86 
87     /// Signal intensity, normalized to homopolymer lengths
88     float intensity() @property const {
89         return _signal_intensity / multiplier(_tag);
90     }
91 
92     /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tag,
93     /// and 256.0 for ZM tag.
94     /// More efficient, because this is how intensities are stored in FZ/ZM tag.
95     short intensity_value() @property const {
96         return _signal_intensity;
97     }
98 }
99 
100 /// Flow call associated with a read
101 struct ReadFlowCall {
102     private {
103         FlowCall _fc;
104         ushort _offset;
105         ushort _called_len;
106         ushort _flow_index;
107 
108         this(Base b, short signal_intensity, ushort offset,
109                      ushort called, ushort flow_index, FlowGramTag tag)
110         {
111             _fc = FlowCall(signal_intensity, b, tag);
112             _offset = offset;
113             _called_len = called;
114             _flow_index = flow_index;
115         }
116     }
117 
118     /// Called nucleotide
119     Base base() @property const {
120         return _fc._base;
121     }
122 
123     /// Set base to its complement
124     void complement() {
125         _fc._base = _fc._base.complement;
126     }
127 
128     /// Called homopolymer length
129     ushort length() @property const {
130         return _called_len;
131     }
132 
133     /// Zero-based position of the first nucleotide in the run,      
134     /// relative to start of the read. Takes strandness into account.
135     ushort offset() @property const {
136         return _offset;
137     }
138 
139     /// Signal intensity, normalized to homopolymer lengths
140     float intensity() @property const {
141         return _fc.intensity;
142     }
143 
144     /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tags,
145     /// and 256.0 for ZM tags.
146     /// More efficient, because this is how intensities are stored in FZ/ZM tag.
147     short intensity_value() @property const {
148         return _fc._signal_intensity;
149     }
150 
151     /// Flow index (0-based)
152     size_t flow_index() @property const {
153         return _flow_index;
154     }
155 }
156 
157 /// Get flow calls from signal intensities and flow order.
158 auto flowCalls(short[] intensities, string flow_order, FlowGramTag tag) {
159     
160     static FlowCall flowCall(T)(T call) {
161         return FlowCall(call[0], Base(call[1]), call[2]);
162     }
163 
164     return map!flowCall(zip(intensities, flow_order, repeat(tag)));
165 }
166 
167 struct ReadFlowCallRange(S) 
168     if (!is(S == class))
169 {
170     private {
171         string _flow_order = void;
172         short[] _intensities = void;
173         bool _rev = void;
174         S _sequence = void;
175 
176         int _zf = void;
177         Base _current_base = void;
178         ushort _current_length = void;
179         size_t _current_flow_index;
180         ushort _current_offset;
181 
182         ushort _overlap = void;
183         
184         FlowGramTag _tag = void;
185 
186         bool _empty = false;
187 
188         // consumes next homopolymer from the sequence,
189         // and updates _current_base, _current_flow_index, 
190         // _current_length appropriately
191         void _doSetup() {
192             if (_sequence.empty) {
193                 _empty = true;
194                 return;
195             }
196 
197             _current_length = 1; 
198 
199             // setup current base and current length
200             if (!_rev) {
201                 _current_base = _sequence.front;
202                 _sequence.popFront();
203                 while (!_sequence.empty && _sequence.front == _current_base) {
204                     _sequence.popFront();
205                     ++_current_length;
206                 }
207             } else {
208                 _current_base = _sequence.back; // complement later
209                 _sequence.popBack();            // because of comparison below
210                 while (!_sequence.empty && _sequence.back == _current_base) {
211                     _sequence.popBack();
212                     ++_current_length;
213                 }
214                 _current_base = _current_base.complement;
215             }
216 
217             // setup current flow index
218             for ( ; _current_flow_index < _flow_order.length; ++_current_flow_index) {
219                 if (_flow_order[_current_flow_index] == _current_base) {
220                     break;
221                 }
222             }
223         }
224     }
225 
226     this(S seq, short[] intensities, bool reverse_strand, 
227          string flow_order, ushort first_base_overlap, int zf, FlowGramTag tag) 
228     {
229         _sequence = seq;
230         _intensities = intensities;
231         _rev = reverse_strand;
232         _flow_order = flow_order;
233         _zf = zf;
234         _overlap = first_base_overlap;
235         _tag = tag;
236 
237         if (_sequence.empty) {
238             _empty = true;
239         } else {
240             _doSetup();
241         }
242     }
243 
244     bool empty() @property const {
245         return _empty;
246     }
247 
248     ReadFlowCall front() @property const {
249         enforce(_current_flow_index < _intensities.length,
250                 "Inconsistency between FZ/ZM tag and read bases");
251 
252         auto intensity = cast(ushort)(_intensities[_current_flow_index] - _overlap);
253         ReadFlowCall rfc = void;
254         rfc._fc = FlowCall(intensity, _current_base, _tag);
255         rfc._offset = _current_offset;
256         rfc._called_len = _current_length;
257         rfc._flow_index = cast(ushort)(_current_flow_index + _zf);
258         return rfc;
259     }
260 
261     void popFront() {
262         _current_offset += _current_length;
263 
264         ++_current_flow_index;
265         _overlap = 0; // after first base it is always zero
266 
267         _doSetup();
268     }
269 
270     ReadFlowCallRange!S save() @property {
271         // bitwise copy
272         // FIXME: is it safe?
273         ReadFlowCallRange!S r = this;
274         return r;
275     }
276 }
277 
278 private ReadFlowCallRange!S readFlowCallRange(S)(S seq, short[] intensities, bool rev,
279                                                  string flow_order, ushort overlap, int zf,
280                                                  FlowGramTag tag)
281 {
282     return ReadFlowCallRange!S(seq, intensities, rev, flow_order, overlap, zf, tag);
283 }
284 
285 
286 /// Get read flow calls. Takes ZF tag and strandness into account.
287 ///
288 /// Tag name is an optional argument because it is not standard and will likely
289 /// be changed in the future (there was a proposal on samtools mailing list
290 /// to introduce standard FB tag).
291 auto readFlowCalls(R)(R read, string flow_order, string key_sequence, string tag="ZF") {
292 
293     auto zf = cast(int)read[tag];
294     auto fz_value = read["FZ"];
295     auto zm_value = read["ZM"];
296 
297     enforce(!(fz_value.is_nothing && zm_value.is_nothing),
298             "Neither FZ nor ZM tag is presented in a mapped read");
299 
300     auto fg_tag = fz_value.is_nothing ? FlowGramTag.ZM : FlowGramTag.FZ;
301 
302     short[] flow_int = *cast(short[]*)(fg_tag == FlowGramTag.ZM ? &zm_value : &fz_value);
303 
304     flow_order = flow_order[zf .. $];
305     auto intensities = flow_int[zf .. $];
306 
307     // key sequence is required because its last base can overlap with first called base
308     ushort overlap = 0;
309 
310     Base5 base = read.is_reverse_strand ? read.sequence.back.complement : read.sequence.front;
311     foreach_reverse (c; key_sequence) {
312         if (c != base)
313             break;
314         overlap += cast(int)(multiplier(fg_tag));
315     }
316 
317     return readFlowCallRange(read.sequence, intensities, read.is_reverse_strand,
318                              flow_order, overlap, zf, fg_tag);
319 }