1 /* 2 This file is part of BioD. 3 Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 module bio.std.hts.bam.validation.alignment; 25 26 public import bio.std.hts.bam.read; 27 public import bio.std.hts.bam.tagvalue; 28 import bio.core.utils.algo; 29 30 import std.algorithm; 31 import std.ascii; 32 import std.conv; 33 import std.typetuple; 34 35 /** 36 Alignment validation error types. 37 38 InvalidCigar error is accompanied by some CigarError, 39 InvalidTags is accompanied by some TagError. 40 */ 41 enum AlignmentError { 42 EmptyReadName, /// 43 TooLongReadName, /// 44 ReadNameContainsInvalidCharacters, /// 45 PositionIsOutOfRange, /// 46 QualityDataContainsInvalidElements, /// 47 InvalidCigar, /// 48 InvalidTags, /// 49 DuplicateTagKeys /// 50 } 51 52 /// CIGAR string validation error types. 53 enum CigarError { 54 InternalHardClipping, /// 55 InternalSoftClipping, /// 56 InconsistentLength /// 57 } 58 59 /// Auxiliary data validation error types. 60 /// 61 /// Refers to an individual tag. 62 enum TagError { 63 EmptyString, /// 64 EmptyHexadecimalString, /// 65 NonPrintableString, /// 66 NonPrintableCharacter, /// 67 InvalidHexadecimalString, /// 68 ExpectedIntegerValue, /// 69 ExpectedStringValue, /// 70 InvalidValueType, /// 71 InvalidQualityString, /// 72 ExpectedStringWithSameLengthAsSequence /// 73 } 74 75 /// Designates pair of predefined key from SAM/BAM specification 76 /// and expected type of tags with that key. 77 struct TagType(string key, T) { 78 enum Key = key; 79 alias T Type; 80 } 81 82 /// Compile-time available information about predefined tags 83 alias TypeTuple!(TagType!("AM", int), 84 TagType!("AS", int), 85 TagType!("BC", string), 86 TagType!("BQ", string), 87 TagType!("CC", string), 88 TagType!("CM", int), 89 TagType!("CP", int), 90 TagType!("CQ", string), 91 TagType!("CS", string), 92 TagType!("E2", string), 93 TagType!("FI", int), 94 TagType!("FS", string), 95 TagType!("FZ", ushort[]), 96 TagType!("LB", string), 97 TagType!("H0", int), 98 TagType!("H1", int), 99 TagType!("H2", int), 100 TagType!("HI", int), 101 TagType!("IH", int), 102 TagType!("MD", string), 103 TagType!("MQ", int), 104 TagType!("NH", int), 105 TagType!("NM", int), 106 TagType!("OQ", string), 107 TagType!("OP", int), 108 TagType!("OC", string), 109 TagType!("PG", string), 110 TagType!("PQ", int), 111 TagType!("PU", string), 112 TagType!("Q2", string), 113 TagType!("R2", string), 114 TagType!("RG", string), 115 TagType!("SM", int), 116 TagType!("TC", int), 117 TagType!("U2", string), 118 TagType!("UQ", int)) 119 PredefinedTags; 120 121 122 private template GetKey(U) { 123 enum GetKey = U.Key; 124 } 125 126 private template PredefinedTagTypeHelper(string s) { 127 alias PredefinedTags[staticIndexOf!(s, staticMap!(GetKey, PredefinedTags))] PredefinedTagTypeHelper; 128 } 129 130 /// Get predefined tag type by its key, in compile-time. 131 template PredefinedTagType(string s) { 132 alias PredefinedTagTypeHelper!(s).Type PredefinedTagType; 133 } 134 135 /** 136 Abstract class encapsulating visitation of SAM header elements. 137 */ 138 abstract class AbstractAlignmentValidator { 139 /// Start validation process. 140 /// 141 /// Passing by reference is not only for doing less copying, 142 /// one might want to attempt to fix invalid fields 143 /// in onError() methods. 144 void validate(ref BamRead alignment) { 145 _visitAlignment(alignment); 146 } 147 148 /** Implement those methods to define your own behaviour. 149 150 During validation process, in case of an error the corresponding 151 method gets called, and is provided the object where the error occurred, 152 and type of the error. Objects are passed by reference so that they 153 can be changed (fixed / cleaned up / etc.) 154 155 If onError() returns true, that means to continue validation process 156 for this particular entity. Otherwise, all other validation checks are 157 skipped and next entity is processed. 158 */ 159 abstract bool onError(ref BamRead al, AlignmentError error); 160 abstract bool onError(ref BamRead al, CigarError error); /// ditto 161 abstract bool onError(string key, const ref Value value, TagError error); /// ditto 162 163 private: 164 165 // Method names are a bit misleading, 166 // their return value is NOT whether a field is invalid or not 167 // but rather whether onError() handlers decide to stop validation 168 // when the field is invalid. 169 170 bool invalidReadName(ref BamRead al) { 171 // Read name (a.k.a. QNAME) must =~ /^[!-?A-~]{1,255}$/ 172 // according to specification. 173 if (al.name.length == 0) { 174 if (!onError(al, AlignmentError.EmptyReadName)) return true; 175 } else if (al.name.length > 255) { 176 if (!onError(al, AlignmentError.TooLongReadName)) return true; 177 } else { 178 foreach (char c; al.name) 179 { 180 if ((c < '!') || (c > '~') || (c == '@')) { 181 if (!onError(al, AlignmentError.ReadNameContainsInvalidCharacters)) { 182 return true; 183 } else { 184 break; 185 } 186 } 187 } 188 } 189 return false; 190 } 191 192 bool invalidPosition(ref BamRead al) { 193 /// Check that position is in range [-1 .. 2^29 - 2] 194 if (al.position < -1 || al.position > ((1<<29) - 2)) { 195 if (!onError(al, AlignmentError.PositionIsOutOfRange)) { 196 return true; 197 } 198 } 199 return false; 200 } 201 202 bool invalidQualityData(ref BamRead al) { 203 /// Check quality data 204 if (!all!"a == 0xFF"(al.base_qualities) && 205 !all!"0 <= a && a <= 93"(al.base_qualities)) 206 { 207 if (!onError(al, AlignmentError.QualityDataContainsInvalidElements)) { 208 return true; 209 } 210 } 211 return false; 212 } 213 214 static bool internalHardClipping(ref BamRead al) { 215 return (al.cigar.length > 2 && 216 any!"a.type == 'H'"(al.cigar[1..$-1])); 217 } 218 219 static bool internalSoftClipping(ref BamRead al) { 220 if (al.cigar.length <= 2) return false; 221 222 auto cigar = al.cigar; 223 224 /// strip H operations from ends 225 if (cigar[0].type == 'H') { 226 cigar = cigar[1..$]; 227 } 228 if (cigar[$-1].type == 'H') { 229 cigar = cigar[0..$-1]; 230 } 231 232 /// check that S operations are at the ends only 233 return (cigar.length > 2 && 234 any!"a.type == 'S'"(cigar[1..$-1])); 235 } 236 237 // Sum of M/I/S/=/X operations must be equal to the sequence length 238 // if both sequence and CIGAR string are presented. 239 static bool inconsistentLength(ref BamRead al) { 240 return (al.sequence_length > 0 && 241 al.sequence_length != reduce!`a + b`(0, 242 map!`a.length`( 243 filter!`canFind("MIS=X", a.type)`( 244 al.cigar)))); 245 } 246 247 bool invalidCigar(ref BamRead al) { 248 249 if (al.cigar.length == 0) return false; 250 251 static string check(string s) { 252 import std.ascii : toUpper; 253 return (`if (`~s.dup~`(al)`~ 254 ` && !onError(al, CigarError.`~(cast(char)(s[0]-32))~s[1..$]~`)`~ 255 ` && (called_on_error || onError(al, AlignmentError.InvalidCigar)))`~ 256 `{`~ 257 ` return true;`~ 258 `}`).idup; 259 } 260 261 bool called_on_error = false; 262 263 mixin(check("internalHardClipping")); 264 mixin(check("internalSoftClipping")); 265 mixin(check("inconsistentLength")); 266 267 return false; 268 } 269 270 // Check tags, a lot of them are predefined in the specification 271 // and have to satisfy certain requirements. 272 bool invalidTags(ref BamRead al) { 273 274 bool all_tags_are_good = true; 275 276 void someTagIsBad() { 277 if (all_tags_are_good) { 278 if (!onError(al, AlignmentError.InvalidTags)) return; 279 } 280 all_tags_are_good = false; 281 } 282 283 /// Check that all tag keys are distinct. 284 285 bool all_distinct = true; 286 287 // Optimize for small number of tags 288 ushort[256] keys = void; 289 size_t i = 0; 290 291 // Check each tag in turn. 292 foreach (k, v; al) { 293 if (!isValid(k, v, al)) { 294 someTagIsBad(); 295 } 296 297 if (i < keys.length) { 298 keys[i] = *cast(ushort*)(k.ptr); 299 300 if (all_distinct) { 301 for (size_t j = 0; j < i; ++j) { 302 if (keys[i] == keys[j]) { 303 all_distinct = false; 304 break; 305 } 306 } 307 } 308 309 i += 1; 310 } else { 311 if (all_distinct) { 312 // must be exactly one 313 int found = 0; 314 foreach (k2, v2; al) { 315 if (*cast(ushort*)(k2.ptr) == *cast(ushort*)(k.ptr)) { 316 if (found == 1) { 317 all_distinct = false; 318 break; 319 } else { 320 ++found; 321 } 322 } 323 } 324 } 325 } 326 } 327 328 if (!all_distinct) { 329 if (!onError(al, AlignmentError.DuplicateTagKeys)) return true; 330 } 331 332 return false; 333 } 334 335 void _visitAlignment(ref BamRead al) { 336 if (invalidReadName(al)) return; 337 if (invalidPosition(al)) return; 338 if (invalidQualityData(al)) return; 339 if (invalidCigar(al)) return; 340 if (invalidTags(al)) return; 341 } 342 343 bool isValid(string key, const ref Value value, const ref BamRead al) { 344 345 bool result = true; 346 347 if (value.is_hexadecimal_string()) { 348 auto str = cast(string)value; 349 if (str.length == 0) { 350 if (!onError(key, value, TagError.EmptyHexadecimalString)) { 351 return false; 352 } 353 result = false; 354 } 355 /// check that it contains only 0..9a-fA-F characters 356 if (!all!(isHexDigit)(str)) { 357 if (!onError(key, value, TagError.InvalidHexadecimalString)) { 358 return false; 359 } 360 result = false; 361 } 362 } else if (value.is_character()) { 363 /// character must be [!-~] 364 auto c = cast(char)value; 365 if (!(c >= '!' && c <= '~')) { 366 if (!onError(key, value, TagError.NonPrintableCharacter)) { 367 return false; 368 } 369 result = false; 370 } 371 } else if (value.is_string()) { 372 auto str = cast(string)value; 373 if (str.length == 0) { 374 if (!onError(key, value, TagError.EmptyString)) { 375 return false; 376 } 377 result = false; 378 } 379 /// string must be [ !-~]+ 380 if (!all!"a >= ' ' && a <= '~'"(str)) { 381 if (!onError(key, value, TagError.NonPrintableString)) { 382 return false; 383 } 384 result = false; 385 } 386 } 387 388 /// check various tags from SAM/BAM specification 389 if (!additionalChecksIfTheTagIsPredefined(key, value, al)) { 390 result = false; 391 } 392 393 return result; 394 } 395 396 // There're some requirements for predefined tags to be checked 397 // such as type, length in some cases, or even some regular expression. 398 // See page 6 of SAM/BAM specification. 399 bool additionalChecksIfTheTagIsPredefined(string key, const ref Value value, 400 const ref BamRead al) 401 { 402 bool result = true; 403 404 // Creates a switch for all predefined tag keys. 405 string switchTagKey() { 406 char[] cs; 407 foreach (t; PredefinedTags) { 408 cs ~= `case "`~t.Key~`":`~ 409 ` if (!checkTagValue!"`~t.Key~`"(value, al)) {`~ 410 ` result = false;`~ 411 ` }`~ 412 ` break;`.dup; 413 } 414 return "switch (key) { " ~ cs.idup ~ " default : break; }"; 415 } 416 417 mixin(switchTagKey()); 418 419 return result; 420 } 421 422 // Supposed to be inlined in the above switch 423 bool checkTagValue(string s)(const ref Value value, const ref BamRead al) { 424 425 bool result = true; 426 427 /// 1. Check type. 428 429 static if (is(PredefinedTagType!s == int)) { 430 if (!value.is_integer) { 431 if (!onError(s, value, TagError.ExpectedIntegerValue)) { 432 return false; 433 } 434 result = false; 435 } 436 } else if (is(PredefinedTagType!s == string)) { 437 // Notice that there are no 'H'-typed predefined tags, 438 // and they are almost unused and therefore are not likely 439 // to appear in the future. 440 if (!value.is_string || value.bam_typeid == 'H') { 441 if (!onError(s, value, TagError.ExpectedStringValue)) { 442 return false; 443 } 444 result = false; 445 } 446 } else { 447 if (value.tag != GetTypeId!(PredefinedTagType!s)) { 448 if (!onError(s, value, TagError.InvalidValueType)) { 449 return false; 450 } 451 result = false; 452 } 453 } 454 455 /// 2. For tags which contain quality as a string, 456 /// check that all characters are valid 457 458 static if (staticIndexOf!(s, "CQ", "E2", "OQ", "Q2", "U2") != -1) { 459 auto str = cast(string)value; 460 if (str != "*" && !all!"a >= '!' && a <= '~'"(str)) { 461 if (!onError(s, value, TagError.InvalidQualityString)) { 462 return false; 463 } 464 result = false; 465 } 466 } 467 468 /// 3. In a couple of cases values are required to be 469 /// of the same length as the read sequence. 470 471 static if (staticIndexOf!(s, "BQ", "E2") != -1) { 472 if ((cast(string)value).length != al.sequence_length) { 473 if (!onError(s, value, TagError.ExpectedStringWithSameLengthAsSequence)) { 474 return false; 475 } 476 } 477 } 478 479 480 /// 4. MD tag ought to: a) match /^[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*$/ 481 /// b) match CIGAR string (TODO?) 482 483 static if (s == "MD") { 484 485 /// a) check regular expression 486 487 auto s = cast(string)value; 488 bool valid = true; 489 if (s.length == 0) valid = false; 490 if (!isDigit(s[0])) valid = false; 491 size_t i = 1; 492 while (i < s.length && isDigit(s[i])) 493 ++i; 494 while (i < s.length) { 495 if (isUpper(s[i])) { 496 ++i; // [A-Z] 497 } else if (s[i] == '^') { // ^[A-Z]+ 498 ++i; 499 if (i == s.length || !isUpper(s[i])) { 500 valid = false; 501 break; 502 } 503 while (i < s.length && isUpper(s[i])) 504 ++i; 505 } else { 506 valid = false; 507 break; 508 } 509 // now [0-9]+ 510 if (i == s.length || !isDigit(s[i])) { 511 valid = false; 512 break; 513 } 514 while (i < s.length && isDigit(s[i])) 515 ++i; 516 } 517 518 if (i < s.length) { 519 valid = false; 520 } 521 522 if (!valid) result = false; 523 } 524 525 return result; 526 } 527 } 528 529 final private class BooleanValidator : AbstractAlignmentValidator { 530 531 bool result; 532 533 override void validate(ref BamRead al) { 534 result = true; 535 super.validate(al); 536 } 537 538 override bool onError(ref BamRead al, AlignmentError e) { 539 return (result = false); 540 } 541 542 override bool onError(ref BamRead al, CigarError e) { 543 return (result = false); 544 } 545 546 override bool onError(string key, const ref Value val, TagError e) { 547 return (result = false); 548 } 549 550 } 551 552 private static BooleanValidator booleanValidator; 553 554 static this() { 555 booleanValidator = new BooleanValidator(); 556 } 557 558 /// Check if alignment is valid 559 bool isValid(BamRead alignment) { 560 booleanValidator.validate(alignment); 561 return booleanValidator.result; 562 }