1 module bio.std.hts.snpcallers.maq; 2 3 /* 4 * The code below is based on errmod.c from Samtools. 5 */ 6 7 import core.stdc.math; 8 import std.math : LN2, LN10, isNaN; 9 import std.traits; 10 import std.range; 11 import std.algorithm; 12 import std.random; 13 import std.typecons; 14 15 import bio.std.hts.bam.md.reconstruct; 16 import bio.std.hts.bam.pileup; 17 18 import bio.core.base; 19 import bio.core.genotype; 20 import bio.core.call; 21 import bio.core.tinymap; 22 23 struct BaseWithStrand { 24 immutable ValueSetSize = Base.ValueSetSize * 2; 25 private ubyte _code; 26 ubyte internal_code() @property const { 27 return _code; 28 } 29 30 static BaseWithStrand fromInternalCode(ubyte code) { 31 BaseWithStrand bws = void; 32 bws._code = code; 33 return bws; 34 } 35 36 this(Base b, bool is_reverse) { 37 _code = cast(ubyte)(b.internal_code * 2 + (is_reverse ? 1 : 0)); 38 } 39 40 Base base() @property const { 41 return Base.fromInternalCode(_code / 2); 42 } 43 44 bool is_reverse_strand() @property const { 45 return (_code & 1) == 1; 46 } 47 } 48 49 struct ReadBase { 50 BaseWithStrand base_with_strand; 51 alias base_with_strand this; 52 private ubyte _quality; 53 54 this(Base b, ubyte quality, bool is_reverse) { 55 base_with_strand = BaseWithStrand(b, is_reverse); 56 _quality = quality; 57 } 58 59 ubyte quality() @property const { 60 return _quality; 61 } 62 } 63 64 struct ErrorModelCoefficients { 65 private { 66 67 // _fk[n] = (1 - depcorr)^n * (1 - eta) + eta 68 double[] _fk; 69 70 // _beta[q << 16 | n << 8 | k ] = see MAQ paper for meaning of \beta 71 double[] _beta; 72 73 // _lhet[n << 8 | k] = log(1/2^n * choose(n, k)) 74 double[] _lhet; 75 76 immutable Base[4] nucleotides = [Base('A'), Base('C'), Base('G'), Base('T')]; 77 } 78 79 this(double depcorr, double eta) { 80 _fk.length = 256; 81 _beta.length = 256 * 256 * 64; 82 _lhet.length = 256 * 256; 83 84 foreach (n, ref v; _fk) { 85 v = core.stdc.math.pow(1.0 - depcorr, cast(double)n) * (1.0 - eta) + eta; 86 } 87 88 // lC[n][k] = log(choose(n, k)) 89 double[256][256] lC; 90 91 // lG[n] = logGamma(n + 1) 92 double[256] lG; 93 94 for (size_t n = 0; n <= 255; ++n) { 95 lG[n] = core.stdc.math.lgamma(cast(double)(n + 1)); 96 for (size_t k = 0; k <= n / 2; ++k) { 97 lC[n][n-k] = lC[n][k] = lG[n] - lG[k] - lG[n-k]; 98 99 // fill _lhet simultaneously 100 _lhet[n << 8 | (n-k)] = _lhet[n << 8 | k] = lC[n][k] - n * cast(double)LN2; 101 } 102 } 103 104 for (size_t q = 1; q < 64; ++q) { 105 real e = 10.0 ^^ (-(cast(real)q) / 10.0); 106 real le = core.stdc.math.logl(e); 107 real le1 = core.stdc.math.logl(1.0 - e); 108 109 for (int n = 1; n <= 255; ++n) { 110 real sum, sum1; 111 sum = sum1 = 0.0; 112 for (int k = n; k >= 0; --k) { 113 sum = sum1 + core.stdc.math.expl(lC[n][k] + k * le + (n-k) * le1); 114 _beta[q << 16 | n << 8 | k] = -10.0 / LN10 * core.stdc.math.logl(sum1 / sum); 115 sum1 = sum; 116 } 117 } 118 } 119 } 120 121 double fk(size_t n) const { 122 return _fk[n]; 123 } 124 125 double beta(uint quality, size_t n, size_t k) const { 126 return _beta[quality << 16 | n << 8 | k]; 127 } 128 129 double lhet(size_t n, size_t k) const { 130 return _lhet[n << 8 | k]; 131 } 132 133 alias TinyMap!(DiploidGenotype!Base5, float, useDefaultValue) Dict; 134 135 private immutable C = 10.0 / LN10; 136 137 Dict computeLikelihoods(R)(R read_bases, bool symmetric=false) const 138 if (is(ElementType!R == ReadBase) && hasLength!R) 139 { 140 // if there're more than 255 reads, subsample them 141 ReadBase[255] buf = void; 142 if (read_bases.length > buf.length) { 143 copy(randomSample(read_bases, buf.length), buf[]); 144 } else { 145 copy(read_bases, buf[]); 146 } 147 auto bases = buf[0 .. min(read_bases.length, $)]; 148 149 sort!"a.quality < b.quality"(bases); 150 151 auto w = TinyMap!(BaseWithStrand, uint, fillNoRemove)(0); 152 auto c = TinyMap!(Base, uint, fillNoRemove)(0); 153 auto fsum = TinyMap!(Base, double, fillNoRemove)(0.0); 154 auto bsum = TinyMap!(Base, double, fillNoRemove)(0.0); 155 156 foreach_reverse (ref read_base; bases) { 157 auto quality = read_base.quality; 158 if (quality < 4) quality = 4; 159 if (quality > 63) quality = 63; 160 161 auto bws = read_base.base_with_strand; 162 auto b = bws.base; 163 164 fsum[b] += fk(w[bws]); 165 bsum[b] += fk(w[bws]) * beta(quality, bases.length, c[b]); 166 c[b] += 1; 167 w[bws] += 1; 168 } 169 170 alias diploidGenotype dG; 171 172 auto q = Dict(float.min); 173 174 foreach (i, b1; nucleotides) { 175 float tmp1 = 0.0; 176 int tmp2; 177 float tmp3 = 0.0; 178 179 // homozygous 180 foreach (k, b2; nucleotides) { 181 if (k != i) { 182 tmp1 += bsum[b2]; 183 tmp2 += c[b2]; 184 tmp3 += fsum[b2]; 185 } 186 } 187 188 auto b1_5 = cast(Base5)b1; 189 if (tmp2 > 0) { 190 q[dG(b1_5)] = tmp1; 191 } else { 192 q[dG(b1_5)] = 0.0; 193 } 194 195 // heterozygous 196 for (size_t j = i + 1; j < nucleotides.length; ++j) { 197 auto b2 = nucleotides[j]; 198 int cij = c[b1] + c[b2]; 199 tmp1 = tmp3 = 0.0; 200 tmp2 = 0; 201 foreach (k, b3; nucleotides) { 202 if (k != i && k != j) { 203 tmp1 += bsum[b3]; 204 tmp2 += c[b3]; 205 tmp3 += fsum[b3]; 206 } 207 } 208 209 auto b2_5 = cast(Base5)b2; 210 if (tmp2 > 0) { 211 q[dG(b2_5, b1_5)] = tmp1 - C * lhet(cij, c[b2]); 212 } else { 213 q[dG(b2_5, b1_5)] = -C * lhet(cij, c[b2]); 214 } 215 216 if (symmetric) { 217 q[dG(b1_5, b2_5)] = q[dG(b2_5, b1_5)]; 218 } 219 } 220 221 foreach (k, b2; nucleotides) { 222 auto g = dG(b1_5, cast(Base5)b2); 223 if (g in q) { 224 if (q[g] < 0.0) q[g] = 0.0; 225 } 226 } 227 } 228 229 return q; 230 } 231 } 232 233 // Encapsulates information about genotype likelihoods at a site. 234 struct GenotypeLikelihoodInfo { 235 236 alias ErrorModelCoefficients.Dict ScoreDict; 237 238 alias DiploidGenotype!Base5 Gt; 239 240 this(ScoreDict dict) { 241 242 _dict = dict; 243 size_t k = 0; 244 245 // copy all data into a buffer, combining that with insertion sort 246 foreach (gt, score; _dict) { 247 if (k == 0) { 248 gt_buf[k++] = gt; 249 } else { 250 size_t j = k; 251 while (j > 0 && _dict[gt_buf[j-1]] > score) { 252 gt_buf[j] = gt_buf[j-1]; 253 --j; 254 } 255 gt_buf[j] = gt; 256 ++k; 257 } 258 } 259 260 assert(k >= 2); 261 262 _count = cast(ubyte)k; 263 } 264 265 size_t count() @property const { 266 return _count; 267 } 268 269 static struct GtInfo { 270 private { 271 Gt _gt; 272 float _prob; 273 } 274 275 Gt genotype() @property const { 276 return _gt; 277 } 278 279 float score() @property const { 280 return _prob; 281 } 282 } 283 284 GtInfo opIndex(size_t index) { 285 assert(index < count); 286 auto gt = gt_buf[index]; 287 return GtInfo(gt, _dict[gt]); 288 } 289 290 private Gt[25] gt_buf; 291 private ubyte _count; 292 private ScoreDict _dict; 293 } 294 295 class ErrorModel { 296 297 private { 298 float _depcorr; 299 float _eta; 300 ErrorModelCoefficients _coef; 301 } 302 303 this(float depcorr, float eta=0.03) { 304 _depcorr = depcorr; 305 _eta = eta; 306 _coef = ErrorModelCoefficients(_depcorr, _eta); 307 } 308 309 const(ErrorModelCoefficients) coefficients() @property const { 310 return _coef; 311 } 312 313 alias coefficients this; 314 } 315 316 /// Class for calling SNPs using MAQ model. 317 /// 318 /// Typical usage: 319 /// auto caller = new MaqSnpCaller(); 320 /// caller.minimum_call_quality = 20.0f; 321 /// caller.minimum_base_quality = 13; 322 /// foreach (snp; caller.findSNPs(reads)) { ... } 323 /// 324 final class MaqSnpCaller { 325 326 private float _depcorr = 0.17; 327 private float _eta = 0.03; 328 private float _minimum_call_quality = 6.0; 329 private ubyte _minimum_base_quality = 13; 330 private bool _need_to_recompute_errmod = true; 331 332 /// 333 float depcorr() @property const { 334 return _depcorr; 335 } 336 337 /// ditto 338 void depcorr(float f) @property { 339 _depcorr = f; 340 _need_to_recompute_errmod = true; 341 } 342 343 /// 344 float eta() @property const { 345 return _eta; 346 } 347 348 /// 349 void eta(float f) @property { 350 _eta = f; 351 _need_to_recompute_errmod = true; 352 } 353 354 /// Minimum call quality 355 float minimum_call_quality() @property const { 356 return _minimum_call_quality; 357 } 358 359 /// ditto 360 void minimum_call_quality(float f) @property { 361 _minimum_call_quality = f; 362 } 363 364 /// Discard reads with base quality less than this at a site 365 ubyte minimum_base_quality() @property const { 366 return _minimum_base_quality; 367 } 368 369 void minimum_base_quality(ubyte q) @property { 370 _minimum_base_quality = q; 371 } 372 373 ErrorModel errmod() @property { 374 if (_need_to_recompute_errmod) { 375 synchronized { 376 if (_need_to_recompute_errmod) { 377 _errmod = new ErrorModel(_depcorr, _eta); 378 _need_to_recompute_errmod = false; 379 } 380 } 381 } 382 return _errmod; 383 } 384 385 private ErrorModel _errmod; 386 387 /// Get genotype likelihoods 388 final GenotypeLikelihoodInfo genotypeLikelihoodInfo(C)(C column) { 389 390 version(MaqCaller8192) { 391 ReadBase[8192] buf = void; 392 } 393 394 size_t num_of_valid_bases = 0; 395 396 foreach (read; column.reads) { 397 398 version(MaqCaller8192) { 399 if (num_of_valid_bases == 8192) break; 400 } 401 402 if (read.current_base_quality < minimum_base_quality) 403 continue; 404 if (read.current_base == '-') 405 continue; 406 407 version(MaqCaller8192) { 408 buf[num_of_valid_bases] = ReadBase(Base(read.current_base), 409 min(read.current_base_quality, read.mapping_quality), 410 read.is_reverse_strand); 411 } 412 413 num_of_valid_bases++; 414 } 415 416 static struct ReadBaseRange(R) { 417 private R _reads = void; 418 private ubyte minimum_base_quality = void; 419 420 this(R reads, ubyte minbq) { 421 _reads = reads; minimum_base_quality = minbq; _findNextValid(); 422 } 423 424 ReadBase front() @property { 425 auto read = _reads.front; 426 return ReadBase(Base(read.current_base), 427 min(read.current_base_quality, read.mapping_quality), 428 read.is_reverse_strand); 429 } 430 bool empty() @property { return _reads.empty; } 431 void popFront() { _reads.popFront(); _findNextValid(); } 432 ReadBaseRange save() @property { return ReadBaseRange!R(_reads, minimum_base_quality); } 433 434 private void _findNextValid() { 435 while (!_reads.empty && 436 (_reads.front.current_base_quality < minimum_base_quality || 437 _reads.front.current_base == '-')) 438 { 439 _reads.popFront(); 440 } 441 } 442 } 443 444 if (num_of_valid_bases == 0) { 445 GenotypeLikelihoodInfo result; 446 return result; 447 } 448 449 version(MaqCaller8192) { 450 ReadBase[] rbs = buf[0 .. num_of_valid_bases]; 451 auto likelihood_dict = errmod.computeLikelihoods(rbs); 452 } else { 453 auto rbs = ReadBaseRange!(typeof(column.reads))(column.reads, minimum_base_quality); 454 auto likelihood_dict = errmod.computeLikelihoods(takeExactly(rbs, num_of_valid_bases)); 455 } 456 return GenotypeLikelihoodInfo(likelihood_dict); 457 } 458 459 /// Make call on a pileup column 460 final Nullable!DiploidCall5 makeCall(C)(C column, string reference="", string sample="") { 461 462 auto gts = genotypeLikelihoodInfo(column); 463 464 Nullable!DiploidCall5 result; 465 466 if (gts.count < 2) return result; 467 468 static if (__traits(compiles, column.reference_base)) { 469 auto refbase = Base5(column.reference_base); 470 } else { 471 auto refbase = Base5('N'); 472 } 473 474 if (sample == "") { 475 auto rg = column.reads.front["RG"]; 476 if (!rg.is_nothing) { 477 sample = cast(string)rg; 478 } 479 } 480 481 result = DiploidCall5(sample, reference, column.position, 482 refbase, gts[0].genotype, 483 gts[1].score - gts[0].score); 484 485 return result; 486 } 487 488 /// main method of this class 489 auto findSNPs(P)(P pileup_columns, string reference="", string sample="") { 490 static assert(__traits(compiles, {pileup_columns.front.reference_base;})); 491 492 static struct Result { 493 private MaqSnpCaller _caller; 494 private P _pileup; 495 private DiploidCall5 _front; 496 private bool _empty; 497 private string _reference; 498 private string _sample; 499 500 this(MaqSnpCaller caller, P pileup, string reference, string sample) { 501 _caller = caller; 502 _pileup = pileup; 503 _reference = reference; 504 _sample = sample; 505 _fetchNextSNP(); 506 } 507 508 DiploidCall5 front() @property { 509 return _front; 510 } 511 512 bool empty() @property { 513 return _empty; 514 } 515 516 void popFront() { 517 _pileup.popFront(); 518 _fetchNextSNP(); 519 } 520 521 private void _fetchNextSNP() { 522 while (true) { 523 if (_pileup.empty) { 524 _empty = true; 525 break; 526 } 527 528 auto call = _caller.makeCall(_pileup.front, _reference, _sample); 529 if (!call.isNull && call.is_variant && call.quality > _caller.minimum_call_quality) { 530 _front = call.get; 531 break; 532 } else { 533 _pileup.popFront(); 534 } 535 } 536 } 537 } 538 539 return Result(this, pileup_columns, reference, sample); 540 } 541 }