1 /* 2 This file is part of BioD. 3 Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 module bio.bam.iontorrent.flowcall; 25 26 import bio.bam.tagvalue; 27 import bio.bam.iontorrent.flowindex; 28 29 import bio.core.base; 30 import bio.core.utils.range; 31 32 import std.array; 33 import std.typecons; 34 import std.range; 35 import std.algorithm; 36 import std.exception; 37 38 /// Tag where flow signal intensities are stored 39 enum FlowGramTag : ubyte { 40 FZ, 41 ZM 42 } 43 44 /// Scale of intensity values 45 float multiplier(FlowGramTag tag) { 46 return tag == FlowGramTag.FZ ? 100.0 : 256.0; 47 } 48 49 /// Flow base call 50 struct FlowCall { 51 private { 52 short _signal_intensity; 53 54 static assert(Base.ValueSetSize <= 16 && FlowGramTag.max < 16, 55 "implementation of FlowCall should be changed?"); 56 57 ubyte _storage; // tag in upper 4 bits, base in lower 4 bits 58 59 Base _base() @property const { 60 return Base.fromInternalCode(_storage & 0xF); 61 } 62 63 void _base(Base b) @property { 64 _storage &= 0xF0; 65 _storage |= b.internal_code; 66 } 67 68 FlowGramTag _tag() @property const { 69 return cast(FlowGramTag)(_storage >> 4); 70 } 71 72 void _tag(FlowGramTag tag) @property { 73 _storage &= 0xF; 74 _storage |= (cast(ubyte)tag << 4); 75 } 76 77 this(short signal_intensity, Base b, FlowGramTag tag) { 78 _signal_intensity = signal_intensity; 79 _storage = cast(ubyte)(b.internal_code | (tag << 4)); 80 } 81 } 82 83 /// Nucleotide 84 Base base() @property const { 85 return _base; 86 } 87 88 /// Signal intensity, normalized to homopolymer lengths 89 float intensity() @property const { 90 return _signal_intensity / multiplier(_tag); 91 } 92 93 /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tag, 94 /// and 256.0 for ZM tag. 95 /// More efficient, because this is how intensities are stored in FZ/ZM tag. 96 short intensity_value() @property const { 97 return _signal_intensity; 98 } 99 } 100 101 /// Flow call associated with a read 102 struct ReadFlowCall { 103 private { 104 FlowCall _fc; 105 ushort _offset; 106 ushort _called_len; 107 ushort _flow_index; 108 109 this(Base b, short signal_intensity, ushort offset, 110 ushort called, ushort flow_index, FlowGramTag tag) 111 { 112 _fc = FlowCall(signal_intensity, b, tag); 113 _offset = offset; 114 _called_len = called; 115 _flow_index = flow_index; 116 } 117 } 118 119 /// Called nucleotide 120 Base base() @property const { 121 return _fc._base; 122 } 123 124 /// Set base to its complement 125 void complement() { 126 _fc._base = _fc._base.complement; 127 } 128 129 /// Called homopolymer length 130 ushort length() @property const { 131 return _called_len; 132 } 133 134 /// Zero-based position of the first nucleotide in the run, 135 /// relative to start of the read. Takes strandness into account. 136 ushort offset() @property const { 137 return _offset; 138 } 139 140 /// Signal intensity, normalized to homopolymer lengths 141 float intensity() @property const { 142 return _fc.intensity; 143 } 144 145 /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tags, 146 /// and 256.0 for ZM tags. 147 /// More efficient, because this is how intensities are stored in FZ/ZM tag. 148 short intensity_value() @property const { 149 return _fc._signal_intensity; 150 } 151 152 /// Flow index (0-based) 153 size_t flow_index() @property const { 154 return _flow_index; 155 } 156 } 157 158 /// Get flow calls from signal intensities and flow order. 159 auto flowCalls(short[] intensities, string flow_order, FlowGramTag tag) { 160 161 static FlowCall flowCall(T)(T call) { 162 return FlowCall(call[0], Base(call[1]), call[2]); 163 } 164 165 return map!flowCall(zip(intensities, flow_order, repeat(tag))); 166 } 167 168 struct ReadFlowCallRange(S) 169 if (!is(S == class)) 170 { 171 private { 172 string _flow_order = void; 173 short[] _intensities = void; 174 bool _rev = void; 175 S _sequence = void; 176 177 int _zf = void; 178 Base _current_base = void; 179 ushort _current_length = void; 180 size_t _current_flow_index; 181 ushort _current_offset; 182 183 ushort _overlap = void; 184 185 FlowGramTag _tag = void; 186 187 bool _empty = false; 188 189 // consumes next homopolymer from the sequence, 190 // and updates _current_base, _current_flow_index, 191 // _current_length appropriately 192 void _doSetup() { 193 if (_sequence.empty) { 194 _empty = true; 195 return; 196 } 197 198 _current_length = 1; 199 200 // setup current base and current length 201 if (!_rev) { 202 _current_base = _sequence.front; 203 _sequence.popFront(); 204 while (!_sequence.empty && _sequence.front == _current_base) { 205 _sequence.popFront(); 206 ++_current_length; 207 } 208 } else { 209 _current_base = _sequence.back; // complement later 210 _sequence.popBack(); // because of comparison below 211 while (!_sequence.empty && _sequence.back == _current_base) { 212 _sequence.popBack(); 213 ++_current_length; 214 } 215 _current_base = _current_base.complement; 216 } 217 218 // setup current flow index 219 for ( ; _current_flow_index < _flow_order.length; ++_current_flow_index) { 220 if (_flow_order[_current_flow_index] == _current_base) { 221 break; 222 } 223 } 224 } 225 } 226 227 this(S seq, short[] intensities, bool reverse_strand, 228 string flow_order, ushort first_base_overlap, int zf, FlowGramTag tag) 229 { 230 _sequence = seq; 231 _intensities = intensities; 232 _rev = reverse_strand; 233 _flow_order = flow_order; 234 _zf = zf; 235 _overlap = first_base_overlap; 236 _tag = tag; 237 238 if (_sequence.empty) { 239 _empty = true; 240 } else { 241 _doSetup(); 242 } 243 } 244 245 bool empty() @property const { 246 return _empty; 247 } 248 249 ReadFlowCall front() @property const { 250 enforce(_current_flow_index < _intensities.length, 251 "Inconsistency between FZ/ZM tag and read bases"); 252 253 auto intensity = cast(ushort)(_intensities[_current_flow_index] - _overlap); 254 ReadFlowCall rfc = void; 255 rfc._fc = FlowCall(intensity, _current_base, _tag); 256 rfc._offset = _current_offset; 257 rfc._called_len = _current_length; 258 rfc._flow_index = cast(ushort)(_current_flow_index + _zf); 259 return rfc; 260 } 261 262 void popFront() { 263 _current_offset += _current_length; 264 265 ++_current_flow_index; 266 _overlap = 0; // after first base it is always zero 267 268 _doSetup(); 269 } 270 271 ReadFlowCallRange!S save() @property { 272 // bitwise copy 273 // FIXME: is it safe? 274 ReadFlowCallRange!S r = this; 275 return r; 276 } 277 } 278 279 private ReadFlowCallRange!S readFlowCallRange(S)(S seq, short[] intensities, bool rev, 280 string flow_order, ushort overlap, int zf, 281 FlowGramTag tag) 282 { 283 return ReadFlowCallRange!S(seq, intensities, rev, flow_order, overlap, zf, tag); 284 } 285 286 287 /// Get read flow calls. Takes ZF tag and strandness into account. 288 /// 289 /// Tag name is an optional argument because it is not standard and will likely 290 /// be changed in the future (there was a proposal on samtools mailing list 291 /// to introduce standard FB tag). 292 auto readFlowCalls(R)(R read, string flow_order, string key_sequence, string tag="ZF") { 293 294 auto zf = cast(int)read[tag]; 295 auto fz_value = read["FZ"]; 296 auto zm_value = read["ZM"]; 297 298 enforce(!(fz_value.is_nothing && zm_value.is_nothing), 299 "Neither FZ nor ZM tag is presented in a mapped read"); 300 301 auto fg_tag = fz_value.is_nothing ? FlowGramTag.ZM : FlowGramTag.FZ; 302 303 short[] flow_int = *cast(short[]*)(fg_tag == FlowGramTag.ZM ? &zm_value : &fz_value); 304 305 flow_order = flow_order[zf .. $]; 306 auto intensities = flow_int[zf .. $]; 307 308 // key sequence is required because its last base can overlap with first called base 309 ushort overlap = 0; 310 311 Base5 base = read.is_reverse_strand ? read.sequence.back.complement : read.sequence.front; 312 foreach_reverse (c; key_sequence) { 313 if (c != base) 314 break; 315 overlap += cast(int)(multiplier(fg_tag)); 316 } 317 318 return readFlowCallRange(read.sequence, intensities, read.is_reverse_strand, 319 flow_order, overlap, zf, fg_tag); 320 }