1 /*
2     This file is part of BioD.
3     Copyright (C) 2012    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11     
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14     
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 /**
25   Module for SAM header validation.
26   
27   In order to implement your own validation behaviour,
28   subclass AbstractSamHeaderValidator and define your own 
29   onError() methods.
30 */
31 module bio.std.hts.bam.validation.samheader;
32 
33 public import bio.std.hts.sam.header;
34 import bio.core.utils.algo;
35 
36 import std.algorithm;
37 import std.functional;
38 import std.ascii;
39 
/// Header-level validation errors.
///
/// Whenever an Invalid??Line value is reported for the header,
/// the offending line has already been reported through the
/// matching ??LineError callback.
enum SamHeaderError {
    /// an @SQ line failed its per-line checks
    InvalidSqLine,
    /// a @PG line failed its per-line checks
    InvalidPgLine,
    /// an @RG line failed its per-line checks
    InvalidRgLine,
    /// @HD/VN is not of the form /^[0-9]+\.[0-9]+$/
    InvalidFormatVersion,
}
50 
/// Problems that can be reported for a single @SQ line.
enum SqLineError {
    /// the sequence name is empty
    MissingSequenceName,
    /// the sequence name does not match /^[!-)+-<>-~][!-~]*$/
    InvalidSequenceName,
    /// the sequence length is zero or not below 1<<29
    SequenceLengthOutOfRange,
}
57 
/// Problems that can be reported for a single @RG line.
enum RgLineError {
    /// the platform is not one of the names known to the validator
    UnknownPlatform,
    /// the read group identifier is empty
    MissingIdentifier,
}
63 
/// Problems that can be reported for a single @PG line.
enum PgLineError {
    /// the previous-program reference matches no @PG identifier in the header
    NoMatchForPreviousProgram,
    /// the program identifier is empty
    MissingIdentifier,
}
69 
/**
  Abstract class encapsulating visitation of SAM header elements.

  validate() walks the @SQ, @RG and @PG lines of a header, then the
  format version, then the @PG/PP cross references, and reports every
  problem through the overloaded onError() callbacks that subclasses
  must implement.
*/
abstract class AbstractSamHeaderValidator {

    /// Start validation process.
    ///
    /// Passing by reference is not only for doing less copying,
    /// one might want to attempt to fix invalid fields
    /// in onError() methods.
    void validate(ref SamHeader header) {
        _visitHeader(header);
    }

    /** Implement those methods to define your own behaviour.

        During validation process, in case of an error the corresponding
        method gets called, and is provided the object where the error occurred,
        and type of the error. Objects are passed by reference so that they
        can be changed (fixed / cleaned up / etc.)

        'False' return value means to stop further validation checks for the
        current entity and skip to the next one:

        $(UL
          $(LI for a line-level callback, the remaining checks of that line
               are skipped; the line is still counted as invalid, so the
               header-level Invalid??Line error is reported afterwards;)
          $(LI for the header-level callback, validation of the header stops.)
        )
    */
    abstract bool onError(ref SamHeader header, SamHeaderError error);
    abstract bool onError(ref SqLine line, SqLineError error); /// ditto
    abstract bool onError(ref PgLine line, PgLineError error); /// ditto
    abstract bool onError(ref RgLine line, RgLineError error); /// ditto

private:

    /// Checks a single @SQ line; returns true iff no error was reported.
    bool isValid(ref SqLine sq) {

        // All members of SqLine get initialized.
        // Initial value for name is an empty string,
        // and for sequence_length is 0

        bool ok = true;

        if (sq.name.length == 0) {
            if (!onError(sq, SqLineError.MissingSequenceName)) return false;
            ok = false;
        } else {
            // sequence_name must be /^[!-)+-<>-~][!-~]*$/ :
            // printable ASCII, not starting with '*' or '='
            immutable head = sq.name[0];
            immutable headOk = (head >= '!' && head <= ')') ||
                               (head >= '+' && head <= '<') ||
                               (head >= '>' && head <= '~');
            immutable tailOk = all!"a >= '!' && a <= '~'"(sq.name[1..$]);

            // a malformed name is reported once, no matter how many
            // of its characters are wrong
            if (!(headOk && tailOk)) {
                if (!onError(sq, SqLineError.InvalidSequenceName)) return false;
                ok = false;
            }
        }

        // @SQ/LN must be in range 1 .. (1<<29)-1
        // (sequence_length is uint)
        if (sq.length == 0 || sq.length >= (1<<29)) {
            // last check for this line, so the callback result is irrelevant
            onError(sq, SqLineError.SequenceLengthOutOfRange);
            ok = false;
        }

        return ok;
    }

    /// Checks a single @RG line; returns true iff no error was reported.
    bool isValid(ref RgLine rg) {
        // platform names accepted by this validator
        static immutable string[] knownPlatforms = ["ILLUMINA",
                                                    "SOLID",
                                                    "LS454",
                                                    "HELICOS",
                                                    "PACBIO"];
        bool ok = true;

        if (!canFind(knownPlatforms[], rg.platform)) {
            if (!onError(rg, RgLineError.UnknownPlatform)) return false;
            ok = false;
        }

        if (rg.identifier.length == 0) {
            // last check for this line, so the callback result is irrelevant
            onError(rg, RgLineError.MissingIdentifier);
            ok = false;
        }

        return ok;
    }

    /// Checks a single @PG line; returns true iff no error was reported.
    bool isValid(ref PgLine pg) {

        // checking PP tag occurs in _visitHeader()
        // because it involves other @PG lines

        if (pg.identifier.length == 0) {
            onError(pg, PgLineError.MissingIdentifier);
            return false;
        }

        return true;
    }

    /// Runs all checks on the header. Returns early as soon as a
    /// header-level onError() call returns false.
    void _visitHeader(ref SamHeader header) {

        // NOTE(review): lines are iterated by value below; if the line types
        // are value types, changes made to a line inside onError() affect
        // only the loop variable - confirm against HeaderLineDictionary.

        foreach (sq; header.sequences) {
            if (!isValid(sq) && !onError(header, SamHeaderError.InvalidSqLine)) return;
        }

        foreach (rg; header.read_groups) {
            if (!isValid(rg) && !onError(header, SamHeaderError.InvalidRgLine)) return;
        }

        foreach (pg; header.programs) {
            if (!isValid(pg) && !onError(header, SamHeaderError.InvalidPgLine)) return;
        }

        if (!checkFormatVersion(header.format_version) &&
            !onError(header, SamHeaderError.InvalidFormatVersion))
        {
            return;
        }

        // uniqueness of @SQ/SN, @RG/ID, and @PG/ID
        // is guaranteed by design of HeaderLineDictionary template class

        // check that each @PG/PP matches some @PG/ID
        foreach (pg; header.programs) {
            if (pg.previous_program.length == 0) {
                continue; // no PP tag, nothing to match
            }

            if (!canFind(map!"a.identifier"(header.programs.values),
                         pg.previous_program))
            {
                // This is the only check made per line here, so a 'false'
                // result just means "go on with the next @PG line";
                // the remaining lines must still be examined.
                onError(pg, PgLineError.NoMatchForPreviousProgram);
            }
        }
    } // visitHeader

} // AbstractSamHeaderValidator
206 
207 private {
208 
/// Check that @HD/VN is /^[0-9]+\.[0-9]+$/
///
/// Params:
///     ver = text of the version field (may be empty)
///
/// Returns: true iff ver consists of one or more ASCII digits,
///          a single dot, and one or more ASCII digits, with
///          nothing before, between or after them.
bool checkFormatVersion(string ver) nothrow {

    size_t pos = 0;

    // major part: at least one digit
    while (pos < ver.length && isDigit(ver[pos])) {
        ++pos;
    }
    if (pos == 0) {
        return false; // empty, or does not start with a digit
    }

    // exactly one dot must follow the major part
    if (pos == ver.length || ver[pos] != '.') {
        return false; // no dot at all, or some other character
    }
    ++pos;

    // minor part: at least one digit, running up to the end of the string
    immutable minorStart = pos;
    while (pos < ver.length && isDigit(ver[pos])) {
        ++pos;
    }

    // pos stops short of the end on any stray character (second dot,
    // letter, ...), so such input is rejected rather than scanned forever
    return pos > minorStart && pos == ver.length;
}
241 
unittest {
    // versions of the form <digits>.<digits> are accepted
    foreach (accepted; ["1.53", "3.141592653589793", "100500.42"]) {
        assert(checkFormatVersion(accepted));
    }

    // a non-digit start, more than one dot, or a trailing dot is rejected
    foreach (rejected; ["a.23", "1.2.3", "5.", "2.71828.3.5"]) {
        assert(!checkFormatVersion(rejected));
    }
}
251 
/// Validator that only records whether any error was reported.
///
/// Every callback returns false, so line checks and header
/// validation are cut short at the first reported problem.
final private class BooleanValidator : AbstractSamHeaderValidator {

    /// Outcome of the latest validate() call:
    /// true if no onError() callback was invoked.
    bool result;

    /// Resets the outcome and runs the inherited validation.
    override void validate(ref SamHeader header) {
        result = true;
        super.validate(header);
    }

    override bool onError(ref SamHeader header, SamHeaderError e) {
        return fail();
    }

    override bool onError(ref SqLine line, SqLineError e) {
        return fail();
    }

    override bool onError(ref RgLine header, RgLineError e) {
        return fail();
    }

    override bool onError(ref PgLine header, PgLineError e) {
        return fail();
    }

    /// Marks the header as invalid; always yields false.
    private bool fail() {
        result = false;
        return false;
    }
}

/// Validator instance used by the module-level isValid();
/// it is created in the module constructor.
static BooleanValidator booleanValidator;
279 
280 } // private
281 
/// Module constructor: creates the validator used by isValid().
static this() {
    booleanValidator = new BooleanValidator;
}
285 
/// Check if header is valid
///
/// Runs the module's BooleanValidator over the header.
///
/// Returns: true if validation reported no error, false otherwise.
bool isValid(SamHeader header) {
    auto checker = booleanValidator;
    checker.validate(header);
    return checker.result;
}
291 
unittest {
    // minimal header with a version and one well-formed @SQ line
    auto valid_header = new SamHeader("@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr1\tLN:1575");
    assert(isValid(valid_header));

    // every header below must be rejected
    foreach (broken_text; [
        // @SQ with an empty SN value
        "@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:\tLN:1575",
        // @SQ without an SN tag
        "@HD\tVN:1.3\tSO:coordinate\n@SQ\tLN:1575",
        // @SQ without an LN tag
        "@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr1",
        // @SQ with LN above the accepted range
        "@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr1\tLN:876543210",
        // @SQ whose SN contains a space
        "@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr \tLN:1575",
        // @HD without a VN tag
        "@HD\tSO:coordinate",
        // @HD with a version containing two dots
        "@HD\tVN:6.7.8",
        // @RG with a platform the validator does not know
        "@RG\tID:678\tPL:TROLOLO",
        // @RG without an ID tag
        "@RG\tPL:ILLUMINA",
        // @PG without an ID tag
        "@PG\tPN:bwa\tVN:0.5.9-r16",
        // @PG whose PP refers to a program that is not in the header
        "@PG\tID:bwa_aln_fastq\tPN:bwa\tPP:bwa_index",
    ]) {
        auto broken_header = new SamHeader(broken_text);
        assert(!isValid(broken_header));
    }

    // large header with many @SQ, @RG, @PG and @CO lines; must be accepted
    auto another_valid_header = new SamHeader(q"[@HD	VN:1.0	SO:coordinate
@SQ	SN:1	LN:249250621	M5:1b22b98cdeb4a9304cb5d48026a85128	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:2	LN:243199373	M5:a0d9851da00400dec1098a9255ac712e	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:3	LN:198022430	M5:fdfd811849cc2fadebc929bb925902e5	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:4	LN:191154276	M5:23dccd106897542ad87d2765d28a19a1	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:5	LN:180915260	M5:0740173db9ffd264d728f32784845cd7	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:6	LN:171115067	M5:1d3a93a248d92a729ee764823acbbc6b	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:7	LN:159138663	M5:618366e953d6aaad97dbe4777c29375e	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:8	LN:146364022	M5:96f514a9929e410c6651697bded59aec	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:9	LN:141213431	M5:3e273117f15e0a400f01055d9f393768	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:10	LN:135534747	M5:988c28e000e84c26d552359af1ea2e1d	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:11	LN:135006516	M5:98c59049a2df285c76ffb1c6db8f8b96	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:12	LN:133851895	M5:51851ac0e1a115847ad36449b0015864	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:13	LN:115169878	M5:283f8d7892baa81b510a015719ca7b0b	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:14	LN:107349540	M5:98f3cae32b2a2e9524bc19813927542e	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:15	LN:102531392	M5:e5645a794a8238215b2cd77acb95a078	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:16	LN:90354753	M5:fc9b1a7b42b97a864f56b348b06095e6	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:17	LN:81195210	M5:351f64d4f4f9ddd45b35336ad97aa6de	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:18	LN:78077248	M5:b15d4b2d29dde9d3e4f93d1d0f2cbc9c	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:19	LN:59128983	M5:1aacd71f30db8e561810913e0b72636d	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:20	LN:63025520	M5:0dec9660ec1efaaf33281c0d5ea2560f	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:21	LN:48129895	M5:2979a6085bfe28e3ad6f552f361ed74d	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:22	LN:51304566	M5:a718acaa6135fdca8357d5bfe94211dd	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:X	LN:155270560	M5:7e0e2e580297b7764e31dbc80c2540dd	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:Y	LN:59373566	M5:1fa3474750af0948bdf97d5a0ee52e51	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:MT	LN:16569	M5:c68f52674c9fb33aef52dcf399755519	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000207.1	LN:4262	M5:f3814841f1939d3ca19072d9e89f3fd7	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000226.1	LN:15008	M5:1c1b2cd1fccbc0a99b6a447fa24d1504	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000229.1	LN:19913	M5:d0f40ec87de311d8e715b52e4c7062e1	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000231.1	LN:27386	M5:ba8882ce3a1efa2080e5d29b956568a4	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000210.1	LN:27682	M5:851106a74238044126131ce2a8e5847c	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000239.1	LN:33824	M5:99795f15702caec4fa1c4e15f8a29c07	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000235.1	LN:34474	M5:118a25ca210cfbcdfb6c2ebb249f9680	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000201.1	LN:36148	M5:dfb7e7ec60ffdcb85cb359ea28454ee9	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000247.1	LN:36422	M5:7de00226bb7df1c57276ca6baabafd15	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000245.1	LN:36651	M5:89bc61960f37d94abf0df2d481ada0ec	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000197.1	LN:37175	M5:6f5efdd36643a9b8c8ccad6f2f1edc7b	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000203.1	LN:37498	M5:96358c325fe0e70bee73436e8bb14dbd	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000246.1	LN:38154	M5:e4afcd31912af9d9c2546acf1cb23af2	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000249.1	LN:38502	M5:1d78abec37c15fe29a275eb08d5af236	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000196.1	LN:38914	M5:d92206d1bb4c3b4019c43c0875c06dc0	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000248.1	LN:39786	M5:5a8e43bec9be36c7b49c84d585107776	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000244.1	LN:39929	M5:0996b4475f353ca98bacb756ac479140	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000238.1	LN:39939	M5:131b1efc3270cc838686b54e7c34b17b	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000202.1	LN:40103	M5:06cbf126247d89664a4faebad130fe9c	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000234.1	LN:40531	M5:93f998536b61a56fd0ff47322a911d4b	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000232.1	LN:40652	M5:3e06b6741061ad93a8587531307057d8	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000206.1	LN:41001	M5:43f69e423533e948bfae5ce1d45bd3f1	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000240.1	LN:41933	M5:445a86173da9f237d7bcf41c6cb8cc62	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000236.1	LN:41934	M5:fdcd739913efa1fdc64b6c0cd7016779	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000241.1	LN:42152	M5:ef4258cdc5a45c206cea8fc3e1d858cf	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000243.1	LN:43341	M5:cc34279a7e353136741c9fce79bc4396	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000242.1	LN:43523	M5:2f8694fc47576bc81b5fe9e7de0ba49e	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000230.1	LN:43691	M5:b4eb71ee878d3706246b7c1dbef69299	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000237.1	LN:45867	M5:e0c82e7751df73f4f6d0ed30cdc853c0	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000233.1	LN:45941	M5:7fed60298a8d62ff808b74b6ce820001	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000204.1	LN:81310	M5:efc49c871536fa8d79cb0a06fa739722	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000198.1	LN:90085	M5:868e7784040da90d900d2d1b667a1383	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000208.1	LN:92689	M5:aa81be49bf3fe63a79bdc6a6f279abf6	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000191.1	LN:106433	M5:d75b436f50a8214ee9c2a51d30b2c2cc	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000227.1	LN:128374	M5:a4aead23f8053f2655e468bcc6ecdceb	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000228.1	LN:129120	M5:c5a17c97e2c1a0b6a9cc5a6b064b714f	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000214.1	LN:137718	M5:46c2032c37f2ed899eb41c0473319a69	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000221.1	LN:155397	M5:3238fb74ea87ae857f9c7508d315babb	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000209.1	LN:159169	M5:f40598e2a5a6b26e84a3775e0d1e2c81	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000218.1	LN:161147	M5:1d708b54644c26c7e01c2dad5426d38c	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000220.1	LN:161802	M5:fc35de963c57bf7648429e6454f1c9db	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000213.1	LN:164239	M5:9d424fdcc98866650b58f004080a992a	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000211.1	LN:166566	M5:7daaa45c66b288847b9b32b964e623d3	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000199.1	LN:169874	M5:569af3b73522fab4b40995ae4944e78e	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000217.1	LN:172149	M5:6d243e18dea1945fb7f2517615b8f52e	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000216.1	LN:172294	M5:642a232d91c486ac339263820aef7fe0	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000215.1	LN:172545	M5:5eb3b418480ae67a997957c909375a73	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000205.1	LN:174588	M5:d22441398d99caf673e9afb9a1908ec5	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000219.1	LN:179198	M5:f977edd13bac459cb2ed4a5457dba1b3	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000224.1	LN:179693	M5:d5b2fc04f6b41b212a4198a07f450e20	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000223.1	LN:180455	M5:399dfa03bf32022ab52a846f7ca35b30	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000195.1	LN:182896	M5:5d9ec007868d517e73543b005ba48535	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000212.1	LN:186858	M5:563531689f3dbd691331fd6c5730a88b	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000222.1	LN:186861	M5:6fe9abac455169f50470f5a6b01d0f59	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000200.1	LN:187035	M5:75e4c8d17cd4addf3917d1703cacaf25	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000193.1	LN:189789	M5:dbb6e8ece0b5de29da56601613007c2a	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000194.1	LN:191469	M5:6ac8f815bf8e845bb3031b73f812c012	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000225.1	LN:211173	M5:63945c3e6962f28ffd469719a747e73c	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:GL000192.1	LN:547496	M5:325ba9e808f669dfeee210fdd7b470ac	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:NC_007605	LN:171823	M5:6743bd63b3ff2b5b8985d8933c53290a	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@SQ	SN:hs37d5	LN:35477943	M5:5b6a4b3a81a2d3c134b7d14bf6ad39f1	UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz        AS:NCBI37       SP:Human
@RG	ID:ERR016155	LB:HUMgdtRAGDIAAPE	SM:HG00125	PI:488	CN:BGI	PL:ILLUMINA	DS:SRP001294
@RG	ID:ERR016156	LB:HUMgdtRAGDIAAPE	SM:HG00125	PI:489	CN:BGI	PL:ILLUMINA	DS:SRP001294
@RG	ID:ERR016157	LB:HUMgdtRAGDIAAPE	SM:HG00125	PI:488	CN:BGI	PL:ILLUMINA	DS:SRP001294
@PG	ID:bwa_index	PN:bwa	VN:0.5.9-r16	CL:bwa index -a bwtsw $reference_fasta
@PG	ID:bwa_aln_fastq	PN:bwa	PP:bwa_index	VN:0.5.9-r16	CL:bwa aln -q 15 -f $sai_file $reference_fasta $fastq_file
@PG	ID:bwa_sam	PN:bwa	PP:bwa_aln_fastq	VN:0.5.9-r16	CL:bwa sampe -a 1464 -r $rg_line -f $sam_file $reference_fasta $sai_file(s) $fastq_file(s)
@PG	ID:bwa_sam.1	PN:bwa	PP:bwa_aln_fastq	VN:0.5.9-r16	CL:bwa sampe -a 1467 -r $rg_line -f $sam_file $reference_fasta $sai_file(s) $fastq_file(s)
@PG	ID:sam_to_fixed_bam	PN:samtools	PP:bwa_sam	VN:0.1.17 (r973:277)	CL:samtools view -bSu $sam_file | samtools sort -n -o - samtools_nsort_tmp | samtools fixmate /dev/stdin /dev/stdout | samtools sort -o - samtools_csort_tmp | samtools fillmd -u - $reference_fasta > $fixed_bam_file
@PG	ID:sam_to_fixed_bam.1	PN:samtools	PP:bwa_sam.1	VN:0.1.17 (r973:277)	CL:samtools view -bSu $sam_file | samtools sort -n -o - samtools_nsort_tmp | samtools fixmate /dev/stdin /dev/stdout | samtools sort -o - samtools_csort_tmp | samtools fillmd -u - $reference_fasta > $fixed_bam_file
@PG	ID:gatk_target_interval_creator	PN:GenomeAnalysisTK	PP:sam_to_fixed_bam	VN:1.2-29-g0acaf2d	CL:java $jvm_args -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R $reference_fasta -o $intervals_file -known $known_indels_file(s) 
@PG	ID:gatk_target_interval_creator.1	PN:GenomeAnalysisTK	PP:sam_to_fixed_bam.1	VN:1.2-29-g0acaf2d	CL:java $jvm_args -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R $reference_fasta -o $intervals_file -known $known_indels_file(s) 
@PG	ID:bam_realignment_around_known_indels	PN:GenomeAnalysisTK	PP:gatk_target_interval_creator	VN:1.2-29-g0acaf2d	CL:java $jvm_args -jar GenomeAnalysisTK.jar -T IndelRealigner -R $reference_fasta -I $bam_file -o $realigned_bam_file -targetIntervals $intervals_file -known $known_indels_file(s) -LOD 0.4 -model KNOWNS_ONLY -compress 0 --disable_bam_indexing
@PG	ID:bam_realignment_around_known_indels.1	PN:GenomeAnalysisTK	PP:gatk_target_interval_creator.1	VN:1.2-29-g0acaf2d	CL:java $jvm_args -jar GenomeAnalysisTK.jar -T IndelRealigner -R $reference_fasta -I $bam_file -o $realigned_bam_file -targetIntervals $intervals_file -known $known_indels_file(s) -LOD 0.4 -model KNOWNS_ONLY -compress 0 --disable_bam_indexing
@PG	ID:bam_count_covariates	PN:GenomeAnalysisTK	PP:bam_realignment_around_known_indels	VN:1.2-29-g0acaf2d	CL:java $jvm_args -jar GenomeAnalysisTK.jar -T CountCovariates -R $reference_fasta -I $bam_file -recalFile $bam_file.recal_data.csv -knownSites $known_sites_file(s) -l INFO -L '1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;X;Y;MT' -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate
@PG	ID:bam_count_covariates.1	PN:GenomeAnalysisTK	PP:bam_realignment_around_known_indels.1	VN:1.2-29-g0acaf2d	CL:java $jvm_args -jar GenomeAnalysisTK.jar -T CountCovariates -R $reference_fasta -I $bam_file -recalFile $bam_file.recal_data.csv -knownSites $known_sites_file(s) -l INFO -L '1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;X;Y;MT' -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate
@PG	ID:bam_recalibrate_quality_scores	PN:GenomeAnalysisTK	PP:bam_count_covariates	VN:1.2-29-g0acaf2d	CL:java $jvm_args -jar GenomeAnalysisTK.jar -T TableRecalibration -R $reference_fasta -recalFile $bam_file.recal_data.csv -I $bam_file -o $recalibrated_bam_file -l INFO -compress 0 --disable_bam_indexing
@PG	ID:bam_recalibrate_quality_scores.1	PN:GenomeAnalysisTK	PP:bam_count_covariates.1	VN:1.2-29-g0acaf2d	CL:java $jvm_args -jar GenomeAnalysisTK.jar -T TableRecalibration -R $reference_fasta -recalFile $bam_file.recal_data.csv -I $bam_file -o $recalibrated_bam_file -l INFO -compress 0 --disable_bam_indexing
@PG	ID:bam_calculate_bq	PN:samtools	PP:bam_recalibrate_quality_scores	VN:0.1.17 (r973:277)	CL:samtools calmd -Erb $bam_file $reference_fasta > $bq_bam_file
@PG	ID:bam_calculate_bq.1	PN:samtools	PP:bam_recalibrate_quality_scores.1	VN:0.1.17 (r973:277)	CL:samtools calmd -Erb $bam_file $reference_fasta > $bq_bam_file
@PG	ID:bam_merge	PN:picard	PP:bam_calculate_bq	VN:1.53	CL:java $jvm_args -jar MergeSamFiles.jar INPUT=$bam_file(s) OUTPUT=$merged_bam VALIDATION_STRINGENCY=SILENT
@PG	ID:bam_merge.1	PN:picard	PP:bam_calculate_bq.1	VN:1.53	CL:java $jvm_args -jar MergeSamFiles.jar INPUT=$bam_file(s) OUTPUT=$merged_bam VALIDATION_STRINGENCY=SILENT
@PG	ID:bam_mark_duplicates	PN:picard	PP:bam_merge	VN:1.53	CL:java $jvm_args -jar MarkDuplicates.jar INPUT=$bam_file OUTPUT=$markdup_bam_file ASSUME_SORTED=TRUE METRICS_FILE=/dev/null VALIDATION_STRINGENCY=SILENT
@PG	ID:bam_mark_duplicates.1	PN:picard	PP:bam_merge.1	VN:1.53	CL:java $jvm_args -jar MarkDuplicates.jar INPUT=$bam_file OUTPUT=$markdup_bam_file ASSUME_SORTED=TRUE METRICS_FILE=/dev/null VALIDATION_STRINGENCY=SILENT
@PG	ID:bam_merge.2	PN:picard	PP:bam_mark_duplicates	VN:1.53	CL:java $jvm_args -jar MergeSamFiles.jar INPUT=$bam_file(s) OUTPUT=$merged_bam VALIDATION_STRINGENCY=SILENT
@PG	ID:bam_merge.1.2	PN:picard	PP:bam_mark_duplicates.1	VN:1.53	CL:java $jvm_args -jar MergeSamFiles.jar INPUT=$bam_file(s) OUTPUT=$merged_bam VALIDATION_STRINGENCY=SILENT
@CO	$known_indels_file(s) = ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_mapping_resources/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.indels.sites.vcf.gz
@CO	$known_indels_file(s) .= ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_mapping_resources/ALL.wgs.low_coverage_vqsr.20101123.indels.sites.vcf.gz
@CO	$known_sites_file(s) = ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_mapping_resources/ALL.wgs.dbsnp.build135.snps.sites.vcf.gz
]");
    assert(isValid(another_valid_header));
}