1 /* 2 This file is part of BioD. 3 Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 module bio.std.hts.iontorrent.flowcall; 25 import bio.std.hts.bam.tagvalue; 26 import bio.std.hts.iontorrent.flowindex; 27 28 import bio.core.base; 29 import bio.core.utils.range; 30 31 import std.array; 32 import std.typecons; 33 import std.range; 34 import std.algorithm; 35 import std.exception; 36 37 /// Tag where flow signal intensities are stored 38 enum FlowGramTag : ubyte { 39 FZ, 40 ZM 41 } 42 43 /// Scale of intensity values 44 float multiplier(FlowGramTag tag) { 45 return tag == FlowGramTag.FZ ? 100.0 : 256.0; 46 } 47 48 /// Flow base call 49 struct FlowCall { 50 private { 51 short _signal_intensity; 52 53 static assert(Base.ValueSetSize <= 16 && FlowGramTag.max < 16, 54 "implementation of FlowCall should be changed?"); 55 56 ubyte _storage; // tag in upper 4 bits, base in lower 4 bits 57 58 Base _base() @property const { 59 return Base.fromInternalCode(_storage & 0xF); 60 } 61 62 void _base(Base b) @property { 63 _storage &= 0xF0; 64 _storage |= b.internal_code; 65 } 66 67 FlowGramTag _tag() @property const { 68 return cast(FlowGramTag)(_storage >> 4); 69 } 70 71 void _tag(FlowGramTag tag) @property { 72 _storage &= 0xF; 73 _storage |= (cast(ubyte)tag << 4); 74 } 75 76 this(short signal_intensity, Base b, FlowGramTag tag) { 77 _signal_intensity = signal_intensity; 78 _storage = cast(ubyte)(b.internal_code | (tag << 4)); 79 } 80 } 81 82 /// Nucleotide 83 Base base() @property const { 84 return _base; 85 } 86 87 /// Signal intensity, normalized to homopolymer lengths 88 float intensity() @property const { 89 return _signal_intensity / multiplier(_tag); 90 } 91 92 /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tag, 93 /// and 256.0 for ZM tag. 94 /// More efficient, because this is how intensities are stored in FZ/ZM tag. 95 short intensity_value() @property const { 96 return _signal_intensity; 97 } 98 } 99 100 /// Flow call associated with a read 101 struct ReadFlowCall { 102 private { 103 FlowCall _fc; 104 ushort _offset; 105 ushort _called_len; 106 ushort _flow_index; 107 108 this(Base b, short signal_intensity, ushort offset, 109 ushort called, ushort flow_index, FlowGramTag tag) 110 { 111 _fc = FlowCall(signal_intensity, b, tag); 112 _offset = offset; 113 _called_len = called; 114 _flow_index = flow_index; 115 } 116 } 117 118 /// Called nucleotide 119 Base base() @property const { 120 return _fc._base; 121 } 122 123 /// Set base to its complement 124 void complement() { 125 _fc._base = _fc._base.complement; 126 } 127 128 /// Called homopolymer length 129 ushort length() @property const { 130 return _called_len; 131 } 132 133 /// Zero-based position of the first nucleotide in the run, 134 /// relative to start of the read. Takes strandness into account. 135 ushort offset() @property const { 136 return _offset; 137 } 138 139 /// Signal intensity, normalized to homopolymer lengths 140 float intensity() @property const { 141 return _fc.intensity; 142 } 143 144 /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tags, 145 /// and 256.0 for ZM tags. 146 /// More efficient, because this is how intensities are stored in FZ/ZM tag. 147 short intensity_value() @property const { 148 return _fc._signal_intensity; 149 } 150 151 /// Flow index (0-based) 152 size_t flow_index() @property const { 153 return _flow_index; 154 } 155 } 156 157 /// Get flow calls from signal intensities and flow order. 158 auto flowCalls(short[] intensities, string flow_order, FlowGramTag tag) { 159 160 static FlowCall flowCall(T)(T call) { 161 return FlowCall(call[0], Base(call[1]), call[2]); 162 } 163 164 return map!flowCall(zip(intensities, flow_order, repeat(tag))); 165 } 166 167 struct ReadFlowCallRange(S) 168 if (!is(S == class)) 169 { 170 private { 171 string _flow_order = void; 172 short[] _intensities = void; 173 bool _rev = void; 174 S _sequence = void; 175 176 int _zf = void; 177 Base _current_base = void; 178 ushort _current_length = void; 179 size_t _current_flow_index; 180 ushort _current_offset; 181 182 ushort _overlap = void; 183 184 FlowGramTag _tag = void; 185 186 bool _empty = false; 187 188 // consumes next homopolymer from the sequence, 189 // and updates _current_base, _current_flow_index, 190 // _current_length appropriately 191 void _doSetup() { 192 if (_sequence.empty) { 193 _empty = true; 194 return; 195 } 196 197 _current_length = 1; 198 199 // setup current base and current length 200 if (!_rev) { 201 _current_base = _sequence.front; 202 _sequence.popFront(); 203 while (!_sequence.empty && _sequence.front == _current_base) { 204 _sequence.popFront(); 205 ++_current_length; 206 } 207 } else { 208 _current_base = _sequence.back; // complement later 209 _sequence.popBack(); // because of comparison below 210 while (!_sequence.empty && _sequence.back == _current_base) { 211 _sequence.popBack(); 212 ++_current_length; 213 } 214 _current_base = _current_base.complement; 215 } 216 217 // setup current flow index 218 for ( ; _current_flow_index < _flow_order.length; ++_current_flow_index) { 219 if (_flow_order[_current_flow_index] == _current_base) { 220 break; 221 } 222 } 223 } 224 } 225 226 this(S seq, short[] intensities, bool reverse_strand, 227 string flow_order, ushort first_base_overlap, int zf, FlowGramTag tag) 228 { 229 _sequence = seq; 230 _intensities = intensities; 231 _rev = reverse_strand; 232 _flow_order = flow_order; 233 _zf = zf; 234 _overlap = first_base_overlap; 235 _tag = tag; 236 237 if (_sequence.empty) { 238 _empty = true; 239 } else { 240 _doSetup(); 241 } 242 } 243 244 bool empty() @property const { 245 return _empty; 246 } 247 248 ReadFlowCall front() @property const { 249 enforce(_current_flow_index < _intensities.length, 250 "Inconsistency between FZ/ZM tag and read bases"); 251 252 auto intensity = cast(ushort)(_intensities[_current_flow_index] - _overlap); 253 ReadFlowCall rfc = void; 254 rfc._fc = FlowCall(intensity, _current_base, _tag); 255 rfc._offset = _current_offset; 256 rfc._called_len = _current_length; 257 rfc._flow_index = cast(ushort)(_current_flow_index + _zf); 258 return rfc; 259 } 260 261 void popFront() { 262 _current_offset += _current_length; 263 264 ++_current_flow_index; 265 _overlap = 0; // after first base it is always zero 266 267 _doSetup(); 268 } 269 270 ReadFlowCallRange!S save() @property { 271 // bitwise copy 272 // FIXME: is it safe? 273 ReadFlowCallRange!S r = this; 274 return r; 275 } 276 } 277 278 private ReadFlowCallRange!S readFlowCallRange(S)(S seq, short[] intensities, bool rev, 279 string flow_order, ushort overlap, int zf, 280 FlowGramTag tag) 281 { 282 return ReadFlowCallRange!S(seq, intensities, rev, flow_order, overlap, zf, tag); 283 } 284 285 286 /// Get read flow calls. Takes ZF tag and strandness into account. 287 /// 288 /// Tag name is an optional argument because it is not standard and will likely 289 /// be changed in the future (there was a proposal on samtools mailing list 290 /// to introduce standard FB tag). 291 auto readFlowCalls(R)(R read, string flow_order, string key_sequence, string tag="ZF") { 292 293 auto zf = cast(int)read[tag]; 294 auto fz_value = read["FZ"]; 295 auto zm_value = read["ZM"]; 296 297 enforce(!(fz_value.is_nothing && zm_value.is_nothing), 298 "Neither FZ nor ZM tag is presented in a mapped read"); 299 300 auto fg_tag = fz_value.is_nothing ? FlowGramTag.ZM : FlowGramTag.FZ; 301 302 short[] flow_int = *cast(short[]*)(fg_tag == FlowGramTag.ZM ? &zm_value : &fz_value); 303 304 flow_order = flow_order[zf .. $]; 305 auto intensities = flow_int[zf .. $]; 306 307 // key sequence is required because its last base can overlap with first called base 308 ushort overlap = 0; 309 310 Base5 base = read.is_reverse_strand ? read.sequence.back.complement : read.sequence.front; 311 foreach_reverse (c; key_sequence) { 312 if (c != base) 313 break; 314 overlap += cast(int)(multiplier(fg_tag)); 315 } 316 317 return readFlowCallRange(read.sequence, intensities, read.is_reverse_strand, 318 flow_order, overlap, zf, fg_tag); 319 }