1 /*
2     This file is part of BioD.
3     Copyright (C) 2012    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11     
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14     
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 module bio.std.hts.bam.validation.alignment;
25 
26 public import bio.std.hts.bam.read;
27 public import bio.std.hts.bam.tagvalue;
28 import bio.core.utils.algo;
29 
30 import std.algorithm;
31 import std.ascii;
32 import std.conv;
33 import std.typetuple;
34 
35 /**
36     Alignment validation error types.
37 
38     InvalidCigar error is accompanied by some CigarError,
39     InvalidTags is accompanied by some TagError.
40 */
41 enum AlignmentError {
42     EmptyReadName, ///
43     TooLongReadName, /// 
44     ReadNameContainsInvalidCharacters, ///
45     PositionIsOutOfRange, /// 
46     QualityDataContainsInvalidElements, ///
47     InvalidCigar, ///
48     InvalidTags, ///
49     DuplicateTagKeys ///
50 }
51 
52 /// CIGAR string validation error types.
53 enum CigarError {
54     InternalHardClipping, /// 
55     InternalSoftClipping, ///
56     InconsistentLength ///
57 }
58 
59 /// Auxiliary data validation error types.
60 ///
61 /// Refers to an individual tag.
62 enum TagError {
63     EmptyString, ///
64     EmptyHexadecimalString, ///
65     NonPrintableString, ///
66     NonPrintableCharacter, ///
67     InvalidHexadecimalString, ///
68     ExpectedIntegerValue, ///
69     ExpectedStringValue, ///
70     InvalidValueType, ///
71     InvalidQualityString, ///
72     ExpectedStringWithSameLengthAsSequence ///
73 }
74 
75 /// Designates pair of predefined key from SAM/BAM specification
76 /// and expected type of tags with that key.
77 struct TagType(string key, T) {
78     enum Key = key;
79     alias T Type;
80 }
81 
82 /// Compile-time available information about predefined tags
83 alias TypeTuple!(TagType!("AM", int),
84                  TagType!("AS", int),
85                  TagType!("BC", string),
86                  TagType!("BQ", string),
87                  TagType!("CC", string),
88                  TagType!("CM", int),
89                  TagType!("CP", int),
90                  TagType!("CQ", string),
91                  TagType!("CS", string),
92                  TagType!("E2", string),
93                  TagType!("FI", int),
94                  TagType!("FS", string),
95                  TagType!("FZ", ushort[]),
96                  TagType!("LB", string),
97                  TagType!("H0", int),
98                  TagType!("H1", int),
99                  TagType!("H2", int),
100                  TagType!("HI", int),
101                  TagType!("IH", int),
102                  TagType!("MD", string),
103                  TagType!("MQ", int),
104                  TagType!("NH", int),
105                  TagType!("NM", int),
106                  TagType!("OQ", string),
107                  TagType!("OP", int),
108                  TagType!("OC", string),
109                  TagType!("PG", string),
110                  TagType!("PQ", int),
111                  TagType!("PU", string),
112                  TagType!("Q2", string),
113                  TagType!("R2", string),
114                  TagType!("RG", string),
115                  TagType!("SM", int),
116                  TagType!("TC", int),
117                  TagType!("U2", string),
118                  TagType!("UQ", int))
119     PredefinedTags;
120 
121 
122 private template GetKey(U) {
123     enum GetKey = U.Key;
124 }
125 
126 private template PredefinedTagTypeHelper(string s) {
127     alias PredefinedTags[staticIndexOf!(s, staticMap!(GetKey, PredefinedTags))] PredefinedTagTypeHelper;
128 }
129 
130 /// Get predefined tag type by its key, in compile-time.
131 template PredefinedTagType(string s) {
132     alias PredefinedTagTypeHelper!(s).Type PredefinedTagType;
133 }
134 
135 /**
136   Abstract class encapsulating visitation of SAM header elements.
137 */
138 abstract class AbstractAlignmentValidator {
139     /// Start validation process.
140     ///
141     /// Passing by reference is not only for doing less copying, 
142     /// one might want to attempt to fix invalid fields 
143     /// in onError() methods.
144     void validate(ref BamRead alignment) {
145         _visitAlignment(alignment);
146     }
147 
148     /** Implement those methods to define your own behaviour.
149 
150         During validation process, in case of an error the corresponding
151         method gets called, and is provided the object where the error occurred,
152         and type of the error. Objects are passed by reference so that they
153         can be changed (fixed / cleaned up / etc.)
154 
155         If onError() returns true, that means to continue validation process
156         for this particular entity. Otherwise, all other validation checks are
157         skipped and next entity is processed.
158     */
159     abstract bool onError(ref BamRead al, AlignmentError error); 
160     abstract bool onError(ref BamRead al, CigarError error); /// ditto
161     abstract bool onError(string key, const ref Value value, TagError error); /// ditto
162 
163 private:
164 
165     // Method names are a bit misleading,
166     // their return value is NOT whether a field is invalid or not
167     // but rather whether onError() handlers decide to stop validation
168     // when the field is invalid.
169 
170     bool invalidReadName(ref BamRead al) {
171         // Read name (a.k.a. QNAME) must =~ /^[!-?A-~]{1,255}$/ 
172         // according to specification.
173         if (al.name.length == 0) {
174             if (!onError(al, AlignmentError.EmptyReadName)) return true;
175         } else if (al.name.length > 255) {
176             if (!onError(al, AlignmentError.TooLongReadName)) return true;
177         } else {
178             foreach (char c; al.name) 
179             {
180                 if ((c < '!') || (c > '~') || (c == '@')) {
181                     if (!onError(al, AlignmentError.ReadNameContainsInvalidCharacters)) {
182                         return true;
183                     } else {
184                         break;
185                     }
186                 }
187             }
188         }
189         return false;
190     }
191 
192     bool invalidPosition(ref BamRead al) {
193         /// Check that position is in range [-1 .. 2^29 - 2]
194         if (al.position < -1 || al.position > ((1<<29) - 2)) {
195             if (!onError(al, AlignmentError.PositionIsOutOfRange)) {
196                 return true;
197             }
198         }
199         return false;
200     }
201 
202     bool invalidQualityData(ref BamRead al) {
203         /// Check quality data
204         if (!all!"a == 0xFF"(al.base_qualities) &&
205             !all!"0 <= a && a <= 93"(al.base_qualities)) 
206         {
207             if (!onError(al, AlignmentError.QualityDataContainsInvalidElements)) {
208                 return true;
209             }
210         }
211         return false;
212     }
213 
214     static bool internalHardClipping(ref BamRead al) {
215         return (al.cigar.length > 2 && 
216                 any!"a.type == 'H'"(al.cigar[1..$-1]));
217     }
218 
219     static bool internalSoftClipping(ref BamRead al) {
220         if (al.cigar.length <= 2) return false;
221 
222         auto cigar = al.cigar;
223 
224         /// strip H operations from ends
225         if (cigar[0].type == 'H') {
226             cigar = cigar[1..$];
227         }
228         if (cigar[$-1].type == 'H') {
229             cigar = cigar[0..$-1];
230         }
231 
232         /// check that S operations are at the ends only
233         return (cigar.length > 2 &&
234                 any!"a.type == 'S'"(cigar[1..$-1]));
235     } 
236 
237     //  Sum of M/I/S/=/X operations must be equal to the sequence length
238     //  if both sequence and CIGAR string are presented.
239     static bool inconsistentLength(ref BamRead al) {
240         return (al.sequence_length > 0 &&
241                 al.sequence_length != reduce!`a + b`(0, 
242                                         map!`a.length`(
243                                           filter!`canFind("MIS=X", a.type)`(
244                                             al.cigar))));
245     }
246  
247     bool invalidCigar(ref BamRead al) {
248         
249         if (al.cigar.length == 0) return false;
250 
251         static string check(string s) {
252             import std.ascii : toUpper;
253             return (`if (`~s.dup~`(al)`~
254                    `    && !onError(al, CigarError.`~(cast(char)(s[0]-32))~s[1..$]~`)`~
255                    `    && (called_on_error || onError(al, AlignmentError.InvalidCigar)))`~
256                    `{`~
257                    `    return true;`~
258                    `}`).idup;
259         }
260 
261         bool called_on_error = false;
262 
263         mixin(check("internalHardClipping"));
264         mixin(check("internalSoftClipping"));
265         mixin(check("inconsistentLength"));
266 
267         return false;
268     }
269    
270     // Check tags, a lot of them are predefined in the specification
271     // and have to satisfy certain requirements.
272     bool invalidTags(ref BamRead al) {
273 
274         bool all_tags_are_good = true;
275         
276         void someTagIsBad() {
277             if (all_tags_are_good) {
278                 if (!onError(al, AlignmentError.InvalidTags)) return;
279             }
280             all_tags_are_good = false;
281         }
282 
283         /// Check that all tag keys are distinct.
284 
285         bool all_distinct = true;
286 
287         // Optimize for small number of tags
288         ushort[256] keys = void;
289         size_t i = 0;
290 
291         // Check each tag in turn.
292         foreach (k, v; al) {
293             if (!isValid(k, v, al)) {
294                 someTagIsBad();
295             }
296            
297             if (i < keys.length) {
298                 keys[i] = *cast(ushort*)(k.ptr);
299 
300                 if (all_distinct) {
301                     for (size_t j = 0; j < i; ++j) {
302                         if (keys[i] == keys[j]) {
303                             all_distinct = false;
304                             break;
305                         }
306                     }
307                 }
308 
309                 i += 1;
310             } else {
311                 if (all_distinct) {
312                     // must be exactly one
313                     int found = 0;
314                     foreach (k2, v2; al) {
315                         if (*cast(ushort*)(k2.ptr) == *cast(ushort*)(k.ptr)) {
316                             if (found == 1) {
317                                 all_distinct = false;
318                                 break;
319                             } else {
320                                 ++found;
321                             }
322                         }
323                     }
324                 }
325             }
326         }
327        
328         if (!all_distinct) {
329             if (!onError(al, AlignmentError.DuplicateTagKeys)) return true;
330         }
331 
332         return false;
333     }
334 
335     void _visitAlignment(ref BamRead al) {
336         if (invalidReadName(al)) return;
337         if (invalidPosition(al)) return;
338         if (invalidQualityData(al)) return;
339         if (invalidCigar(al)) return;
340         if (invalidTags(al)) return;
341     }
342 
343     bool isValid(string key, const ref Value value, const ref BamRead al) {
344 
345         bool result = true;
346 
347         if (value.is_hexadecimal_string()) {
348             auto str = cast(string)value;
349             if (str.length == 0) {
350                 if (!onError(key, value, TagError.EmptyHexadecimalString)) {
351                     return false;
352                 }
353                 result = false;
354             }
355             /// check that it contains only 0..9a-fA-F characters
356             if (!all!(isHexDigit)(str)) {
357                 if (!onError(key, value, TagError.InvalidHexadecimalString)) {
358                     return false;
359                 }
360                 result = false;
361             }
362         } else if (value.is_character()) {
363             /// character must be [!-~]
364             auto c = cast(char)value;
365             if (!(c >= '!' && c <= '~')) {
366                 if (!onError(key, value, TagError.NonPrintableCharacter)) {
367                     return false;
368                 }
369                 result = false;
370             }
371         } else if (value.is_string()) {
372             auto str = cast(string)value; 
373             if (str.length == 0) {
374                 if (!onError(key, value, TagError.EmptyString)) {
375                     return false;
376                 }
377                 result = false;
378             }
379             /// string must be [ !-~]+
380             if (!all!"a >= ' ' && a <= '~'"(str)) {
381                 if (!onError(key, value, TagError.NonPrintableString)) {
382                     return false;
383                 }
384                 result = false;
385             }
386         }
387 
388         /// check various tags from SAM/BAM specification
389         if (!additionalChecksIfTheTagIsPredefined(key, value, al)) {
390             result = false;
391         }
392 
393         return result;
394     }
395 
396     // There're some requirements for predefined tags to be checked
397     // such as type, length in some cases, or even some regular expression.
398     // See page 6 of SAM/BAM specification.
399     bool additionalChecksIfTheTagIsPredefined(string key, const ref Value value,
400                                               const ref BamRead al) 
401     {
402         bool result = true;
403 
404         // Creates a switch for all predefined tag keys.
405         string switchTagKey() {
406             char[] cs;
407             foreach (t; PredefinedTags) {
408                 cs ~= `case "`~t.Key~`":`~
409                       `  if (!checkTagValue!"`~t.Key~`"(value, al)) {`~
410                       `    result = false;`~
411                       `  }`~
412                       `  break;`.dup;
413             }
414             return "switch (key) { " ~ cs.idup ~ " default : break; }";
415         }
416 
417         mixin(switchTagKey());
418 
419         return result;
420     }
421 
422     // Supposed to be inlined in the above switch
423     bool checkTagValue(string s)(const ref Value value, const ref BamRead al) {
424 
425         bool result = true;
426         
427         /// 1. Check type.
428 
429         static if (is(PredefinedTagType!s == int)) {
430             if (!value.is_integer) {
431                 if (!onError(s, value, TagError.ExpectedIntegerValue)) {
432                     return false;
433                 }
434                 result = false;
435             }
436         } else if (is(PredefinedTagType!s == string)) {
437             // Notice that there are no 'H'-typed predefined tags,
438             // and they are almost unused and therefore are not likely
439             // to appear in the future. 
440             if (!value.is_string || value.bam_typeid == 'H') {
441                 if (!onError(s, value, TagError.ExpectedStringValue)) {
442                     return false;
443                 }
444                 result = false;
445             }
446         } else {
447             if (value.tag != GetTypeId!(PredefinedTagType!s)) {
448                 if (!onError(s, value, TagError.InvalidValueType)) {
449                     return false;
450                 }
451                 result = false;
452             }
453         }
454         
455         /// 2. For tags which contain quality as a string,
456         ///    check that all characters are valid 
457         
458         static if (staticIndexOf!(s, "CQ", "E2", "OQ", "Q2", "U2") != -1) {
459             auto str = cast(string)value;
460             if (str != "*" && !all!"a >= '!' && a <= '~'"(str)) {
461                 if (!onError(s, value, TagError.InvalidQualityString)) {
462                     return false;
463                 }
464                 result = false;
465             }
466         }
467 
468         /// 3. In a couple of cases values are required to be 
469         ///    of the same length as the read sequence.
470 
471         static if (staticIndexOf!(s, "BQ", "E2") != -1) {
472             if ((cast(string)value).length != al.sequence_length) {
473                 if (!onError(s, value, TagError.ExpectedStringWithSameLengthAsSequence)) {
474                     return false;
475                 }
476             }
477         }
478 
479 
480         /// 4. MD tag ought to: a) match /^[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*$/
481         ///                     b) match CIGAR string (TODO?)
482         
483         static if (s == "MD") {
484 
485             /// a) check regular expression
486 
487             auto s = cast(string)value;
488             bool valid = true;
489             if (s.length == 0) valid = false;
490             if (!isDigit(s[0])) valid = false;
491             size_t i = 1;
492             while (i < s.length && isDigit(s[i])) 
493                 ++i;
494             while (i < s.length) {
495                 if (isUpper(s[i])) {
496                     ++i; // [A-Z]
497                 } else if (s[i] == '^') { // ^[A-Z]+
498                     ++i;
499                     if (i == s.length || !isUpper(s[i])) {
500                         valid = false;
501                         break;
502                     }
503                     while (i < s.length && isUpper(s[i]))
504                         ++i;
505                 } else {
506                     valid = false;
507                     break;
508                 }
509                 // now [0-9]+
510                 if (i == s.length || !isDigit(s[i])) {
511                     valid = false;
512                     break;
513                 }
514                 while (i < s.length && isDigit(s[i]))
515                     ++i;
516             }
517 
518             if (i < s.length) {
519                 valid = false;
520             }
521 
522             if (!valid) result = false;
523         }
524 
525         return result;
526     }
527 }
528 
529 final private class BooleanValidator : AbstractAlignmentValidator {
530 
531     bool result;
532 
533     override void validate(ref BamRead al) {
534         result = true;
535         super.validate(al);
536     }
537 
538     override bool onError(ref BamRead al, AlignmentError e) {
539         return (result = false);
540     }
541 
542     override bool onError(ref BamRead al, CigarError e) {
543         return (result = false);
544     }
545 
546     override bool onError(string key, const ref Value val, TagError e) {
547         return (result = false);
548     }
549 
550 }
551 
552 private static BooleanValidator booleanValidator;
553 
554 static this() {
555     booleanValidator = new BooleanValidator();
556 }
557 
558 /// Check if alignment is valid
559 bool isValid(BamRead alignment) { 
560     booleanValidator.validate(alignment);
561     return booleanValidator.result;
562 }