1 /*
2 This file is part of BioD.
3 Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
4
5 Permission is hereby granted, free of charge, to any person obtaining a
6 copy of this software and associated documentation files (the "Software"),
7 to deal in the Software without restriction, including without limitation
8 the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 and/or sell copies of the Software, and to permit persons to whom the
10 Software is furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 DEALINGS IN THE SOFTWARE.
22
23 */
24 module bio.std.hts.bam.validation.alignment;
25
26 public import bio.std.hts.bam.read;
27 public import bio.std.hts.bam.tagvalue;
28 import bio.core.utils.algo;
29
30 import std.algorithm;
31 import std.ascii;
32 import std.conv;
33 import std.typetuple;
34
35 /**
36 Alignment validation error types.
37
38 InvalidCigar error is accompanied by some CigarError,
39 InvalidTags is accompanied by some TagError.
40 */
41 enum AlignmentError {
42 EmptyReadName, ///
43 TooLongReadName, ///
44 ReadNameContainsInvalidCharacters, ///
45 PositionIsOutOfRange, ///
46 QualityDataContainsInvalidElements, ///
47 InvalidCigar, ///
48 InvalidTags, ///
49 DuplicateTagKeys ///
50 }
51
52 /// CIGAR string validation error types.
53 enum CigarError {
54 InternalHardClipping, ///
55 InternalSoftClipping, ///
56 InconsistentLength ///
57 }
58
59 /// Auxiliary data validation error types.
60 ///
61 /// Refers to an individual tag.
62 enum TagError {
63 EmptyString, ///
64 EmptyHexadecimalString, ///
65 NonPrintableString, ///
66 NonPrintableCharacter, ///
67 InvalidHexadecimalString, ///
68 ExpectedIntegerValue, ///
69 ExpectedStringValue, ///
70 InvalidValueType, ///
71 InvalidQualityString, ///
72 ExpectedStringWithSameLengthAsSequence ///
73 }
74
75 /// Designates pair of predefined key from SAM/BAM specification
76 /// and expected type of tags with that key.
77 struct TagType(string key, T) {
78 enum Key = key;
79 alias T Type;
80 }
81
82 /// Compile-time available information about predefined tags
83 alias TypeTuple!(TagType!("AM", int),
84 TagType!("AS", int),
85 TagType!("BC", string),
86 TagType!("BQ", string),
87 TagType!("CC", string),
88 TagType!("CM", int),
89 TagType!("CP", int),
90 TagType!("CQ", string),
91 TagType!("CS", string),
92 TagType!("E2", string),
93 TagType!("FI", int),
94 TagType!("FS", string),
95 TagType!("FZ", ushort[]),
96 TagType!("LB", string),
97 TagType!("H0", int),
98 TagType!("H1", int),
99 TagType!("H2", int),
100 TagType!("HI", int),
101 TagType!("IH", int),
102 TagType!("MD", string),
103 TagType!("MQ", int),
104 TagType!("NH", int),
105 TagType!("NM", int),
106 TagType!("OQ", string),
107 TagType!("OP", int),
108 TagType!("OC", string),
109 TagType!("PG", string),
110 TagType!("PQ", int),
111 TagType!("PU", string),
112 TagType!("Q2", string),
113 TagType!("R2", string),
114 TagType!("RG", string),
115 TagType!("SM", int),
116 TagType!("TC", int),
117 TagType!("U2", string),
118 TagType!("UQ", int))
119 PredefinedTags;
120
121
122 private template GetKey(U) {
123 enum GetKey = U.Key;
124 }
125
126 private template PredefinedTagTypeHelper(string s) {
127 alias PredefinedTags[staticIndexOf!(s, staticMap!(GetKey, PredefinedTags))] PredefinedTagTypeHelper;
128 }
129
130 /// Get predefined tag type by its key, in compile-time.
131 template PredefinedTagType(string s) {
132 alias PredefinedTagTypeHelper!(s).Type PredefinedTagType;
133 }
134
135 /**
136 Abstract class encapsulating visitation of SAM header elements.
137 */
138 abstract class AbstractAlignmentValidator {
139 /// Start validation process.
140 ///
141 /// Passing by reference is not only for doing less copying,
142 /// one might want to attempt to fix invalid fields
143 /// in onError() methods.
144 void validate(ref BamRead alignment) {
145 _visitAlignment(alignment);
146 }
147
148 /** Implement those methods to define your own behaviour.
149
150 During validation process, in case of an error the corresponding
151 method gets called, and is provided the object where the error occurred,
152 and type of the error. Objects are passed by reference so that they
153 can be changed (fixed / cleaned up / etc.)
154
155 If onError() returns true, that means to continue validation process
156 for this particular entity. Otherwise, all other validation checks are
157 skipped and next entity is processed.
158 */
159 abstract bool onError(ref BamRead al, AlignmentError error);
160 abstract bool onError(ref BamRead al, CigarError error); /// ditto
161 abstract bool onError(string key, const ref Value value, TagError error); /// ditto
162
163 private:
164
165 // Method names are a bit misleading,
166 // their return value is NOT whether a field is invalid or not
167 // but rather whether onError() handlers decide to stop validation
168 // when the field is invalid.
169
170 bool invalidReadName(ref BamRead al) {
171 // Read name (a.k.a. QNAME) must =~ /^[!-?A-~]{1,255}$/
172 // according to specification.
173 if (al.name.length == 0) {
174 if (!onError(al, AlignmentError.EmptyReadName)) return true;
175 } else if (al.name.length > 255) {
176 if (!onError(al, AlignmentError.TooLongReadName)) return true;
177 } else {
178 foreach (char c; al.name)
179 {
180 if ((c < '!') || (c > '~') || (c == '@')) {
181 if (!onError(al, AlignmentError.ReadNameContainsInvalidCharacters)) {
182 return true;
183 } else {
184 break;
185 }
186 }
187 }
188 }
189 return false;
190 }
191
192 bool invalidPosition(ref BamRead al) {
193 /// Check that position is in range [-1 .. 2^29 - 2]
194 if (al.position < -1 || al.position > ((1<<29) - 2)) {
195 if (!onError(al, AlignmentError.PositionIsOutOfRange)) {
196 return true;
197 }
198 }
199 return false;
200 }
201
202 bool invalidQualityData(ref BamRead al) {
203 /// Check quality data
204 if (!all!"a == 0xFF"(al.base_qualities) &&
205 !all!"0 <= a && a <= 93"(al.base_qualities))
206 {
207 if (!onError(al, AlignmentError.QualityDataContainsInvalidElements)) {
208 return true;
209 }
210 }
211 return false;
212 }
213
214 static bool internalHardClipping(ref BamRead al) {
215 return (al.cigar.length > 2 &&
216 any!"a.type == 'H'"(al.cigar[1..$-1]));
217 }
218
219 static bool internalSoftClipping(ref BamRead al) {
220 if (al.cigar.length <= 2) return false;
221
222 auto cigar = al.cigar;
223
224 /// strip H operations from ends
225 if (cigar[0].type == 'H') {
226 cigar = cigar[1..$];
227 }
228 if (cigar[$-1].type == 'H') {
229 cigar = cigar[0..$-1];
230 }
231
232 /// check that S operations are at the ends only
233 return (cigar.length > 2 &&
234 any!"a.type == 'S'"(cigar[1..$-1]));
235 }
236
237 // Sum of M/I/S/=/X operations must be equal to the sequence length
238 // if both sequence and CIGAR string are presented.
239 static bool inconsistentLength(ref BamRead al) {
240 return (al.sequence_length > 0 &&
241 al.sequence_length != reduce!`a + b`(0,
242 map!`a.length`(
243 filter!`canFind("MIS=X", a.type)`(
244 al.cigar))));
245 }
246
247 bool invalidCigar(ref BamRead al) {
248
249 if (al.cigar.length == 0) return false;
250
251 static string check(string s) {
252 import std.ascii : toUpper;
253 return (`if (`~s.dup~`(al)`~
254 ` && !onError(al, CigarError.`~(cast(char)(s[0]-32))~s[1..$]~`)`~
255 ` && (called_on_error || onError(al, AlignmentError.InvalidCigar)))`~
256 `{`~
257 ` return true;`~
258 `}`).idup;
259 }
260
261 bool called_on_error = false;
262
263 mixin(check("internalHardClipping"));
264 mixin(check("internalSoftClipping"));
265 mixin(check("inconsistentLength"));
266
267 return false;
268 }
269
270 // Check tags, a lot of them are predefined in the specification
271 // and have to satisfy certain requirements.
272 bool invalidTags(ref BamRead al) {
273
274 bool all_tags_are_good = true;
275
276 void someTagIsBad() {
277 if (all_tags_are_good) {
278 if (!onError(al, AlignmentError.InvalidTags)) return;
279 }
280 all_tags_are_good = false;
281 }
282
283 /// Check that all tag keys are distinct.
284
285 bool all_distinct = true;
286
287 // Optimize for small number of tags
288 ushort[256] keys = void;
289 size_t i = 0;
290
291 // Check each tag in turn.
292 foreach (k, v; al) {
293 if (!isValid(k, v, al)) {
294 someTagIsBad();
295 }
296
297 if (i < keys.length) {
298 keys[i] = *cast(ushort*)(k.ptr);
299
300 if (all_distinct) {
301 for (size_t j = 0; j < i; ++j) {
302 if (keys[i] == keys[j]) {
303 all_distinct = false;
304 break;
305 }
306 }
307 }
308
309 i += 1;
310 } else {
311 if (all_distinct) {
312 // must be exactly one
313 int found = 0;
314 foreach (k2, v2; al) {
315 if (*cast(ushort*)(k2.ptr) == *cast(ushort*)(k.ptr)) {
316 if (found == 1) {
317 all_distinct = false;
318 break;
319 } else {
320 ++found;
321 }
322 }
323 }
324 }
325 }
326 }
327
328 if (!all_distinct) {
329 if (!onError(al, AlignmentError.DuplicateTagKeys)) return true;
330 }
331
332 return false;
333 }
334
335 void _visitAlignment(ref BamRead al) {
336 if (invalidReadName(al)) return;
337 if (invalidPosition(al)) return;
338 if (invalidQualityData(al)) return;
339 if (invalidCigar(al)) return;
340 if (invalidTags(al)) return;
341 }
342
343 bool isValid(string key, const ref Value value, const ref BamRead al) {
344
345 bool result = true;
346
347 if (value.is_hexadecimal_string()) {
348 auto str = cast(string)value;
349 if (str.length == 0) {
350 if (!onError(key, value, TagError.EmptyHexadecimalString)) {
351 return false;
352 }
353 result = false;
354 }
355 /// check that it contains only 0..9a-fA-F characters
356 if (!all!(isHexDigit)(str)) {
357 if (!onError(key, value, TagError.InvalidHexadecimalString)) {
358 return false;
359 }
360 result = false;
361 }
362 } else if (value.is_character()) {
363 /// character must be [!-~]
364 auto c = cast(char)value;
365 if (!(c >= '!' && c <= '~')) {
366 if (!onError(key, value, TagError.NonPrintableCharacter)) {
367 return false;
368 }
369 result = false;
370 }
371 } else if (value.is_string()) {
372 auto str = cast(string)value;
373 if (str.length == 0) {
374 if (!onError(key, value, TagError.EmptyString)) {
375 return false;
376 }
377 result = false;
378 }
379 /// string must be [ !-~]+
380 if (!all!"a >= ' ' && a <= '~'"(str)) {
381 if (!onError(key, value, TagError.NonPrintableString)) {
382 return false;
383 }
384 result = false;
385 }
386 }
387
388 /// check various tags from SAM/BAM specification
389 if (!additionalChecksIfTheTagIsPredefined(key, value, al)) {
390 result = false;
391 }
392
393 return result;
394 }
395
396 // There're some requirements for predefined tags to be checked
397 // such as type, length in some cases, or even some regular expression.
398 // See page 6 of SAM/BAM specification.
399 bool additionalChecksIfTheTagIsPredefined(string key, const ref Value value,
400 const ref BamRead al)
401 {
402 bool result = true;
403
404 // Creates a switch for all predefined tag keys.
405 string switchTagKey() {
406 char[] cs;
407 foreach (t; PredefinedTags) {
408 cs ~= `case "`~t.Key~`":`~
409 ` if (!checkTagValue!"`~t.Key~`"(value, al)) {`~
410 ` result = false;`~
411 ` }`~
412 ` break;`.dup;
413 }
414 return "switch (key) { " ~ cs.idup ~ " default : break; }";
415 }
416
417 mixin(switchTagKey());
418
419 return result;
420 }
421
422 // Supposed to be inlined in the above switch
423 bool checkTagValue(string s)(const ref Value value, const ref BamRead al) {
424
425 bool result = true;
426
427 /// 1. Check type.
428
429 static if (is(PredefinedTagType!s == int)) {
430 if (!value.is_integer) {
431 if (!onError(s, value, TagError.ExpectedIntegerValue)) {
432 return false;
433 }
434 result = false;
435 }
436 } else if (is(PredefinedTagType!s == string)) {
437 // Notice that there are no 'H'-typed predefined tags,
438 // and they are almost unused and therefore are not likely
439 // to appear in the future.
440 if (!value.is_string || value.bam_typeid == 'H') {
441 if (!onError(s, value, TagError.ExpectedStringValue)) {
442 return false;
443 }
444 result = false;
445 }
446 } else {
447 if (value.tag != GetTypeId!(PredefinedTagType!s)) {
448 if (!onError(s, value, TagError.InvalidValueType)) {
449 return false;
450 }
451 result = false;
452 }
453 }
454
455 /// 2. For tags which contain quality as a string,
456 /// check that all characters are valid
457
458 static if (staticIndexOf!(s, "CQ", "E2", "OQ", "Q2", "U2") != -1) {
459 auto str = cast(string)value;
460 if (str != "*" && !all!"a >= '!' && a <= '~'"(str)) {
461 if (!onError(s, value, TagError.InvalidQualityString)) {
462 return false;
463 }
464 result = false;
465 }
466 }
467
468 /// 3. In a couple of cases values are required to be
469 /// of the same length as the read sequence.
470
471 static if (staticIndexOf!(s, "BQ", "E2") != -1) {
472 if ((cast(string)value).length != al.sequence_length) {
473 if (!onError(s, value, TagError.ExpectedStringWithSameLengthAsSequence)) {
474 return false;
475 }
476 }
477 }
478
479
480 /// 4. MD tag ought to: a) match /^[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*$/
481 /// b) match CIGAR string (TODO?)
482
483 static if (s == "MD") {
484
485 /// a) check regular expression
486
487 auto s = cast(string)value;
488 bool valid = true;
489 if (s.length == 0) valid = false;
490 if (!isDigit(s[0])) valid = false;
491 size_t i = 1;
492 while (i < s.length && isDigit(s[i]))
493 ++i;
494 while (i < s.length) {
495 if (isUpper(s[i])) {
496 ++i; // [A-Z]
497 } else if (s[i] == '^') { // ^[A-Z]+
498 ++i;
499 if (i == s.length || !isUpper(s[i])) {
500 valid = false;
501 break;
502 }
503 while (i < s.length && isUpper(s[i]))
504 ++i;
505 } else {
506 valid = false;
507 break;
508 }
509 // now [0-9]+
510 if (i == s.length || !isDigit(s[i])) {
511 valid = false;
512 break;
513 }
514 while (i < s.length && isDigit(s[i]))
515 ++i;
516 }
517
518 if (i < s.length) {
519 valid = false;
520 }
521
522 if (!valid) result = false;
523 }
524
525 return result;
526 }
527 }
528
529 final private class BooleanValidator : AbstractAlignmentValidator {
530
531 bool result;
532
533 override void validate(ref BamRead al) {
534 result = true;
535 super.validate(al);
536 }
537
538 override bool onError(ref BamRead al, AlignmentError e) {
539 return (result = false);
540 }
541
542 override bool onError(ref BamRead al, CigarError e) {
543 return (result = false);
544 }
545
546 override bool onError(string key, const ref Value val, TagError e) {
547 return (result = false);
548 }
549
550 }
551
552 private static BooleanValidator booleanValidator;
553
554 static this() {
555 booleanValidator = new BooleanValidator();
556 }
557
558 /// Check if alignment is valid
559 bool isValid(BamRead alignment) {
560 booleanValidator.validate(alignment);
561 return booleanValidator.result;
562 }