1 /*
2    This file is part of BioD.
3    Copyright (C) 2016   George Githinji <biorelated@gmail.com>
4    Copyright (C) 2018   Emilio Palumbo <emiliopalumbo@gmail.com>
5    Permission is hereby granted, free of charge, to any person obtaining a
6    copy of this software and associated documentation files (the "Software"),
7    to deal in the Software without restriction, including without limitation
8    the rights to use, copy, modify, merge, publish, distribute, sublicense,
9    and/or sell copies of the Software, and to permit persons to whom the
10    Software is furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21    DEALINGS IN THE SOFTWARE.
22  */
23 
24 module bio.std.file.fasta;
25 
26 import std.array;
27 import std..string;
28 import std.algorithm;
29 import std.stdio;
30 import std.range;
31 import std.conv;
32 import std.math;
33 import std.typecons;
34 import std.exception;
35 import std.path;
36 import std.file;
37 import bio.std.file.fai;
38 
39 /* 
40   A text-based single-letter format for representing 
41   nucleotide (nt) and amino-acid (aa) sequences.
42 
43   The ">" symbol/character marks the start of a fasta entry.
44   Each fasta entry comprise of an alphanumeric definition line followed by a 
45   newline character and a single or multiline sequence of IUPAC codes used to
46   represent nucleotide or amino-acid sequences.
47 
48   An example of a nucleotide fasta file 
49   
50   >Entry1_ID header field1|field2|...
51   TTGACGGGTTTTTGTCCTGATT
52  
53   >Entry2_ID header field1|field2|...
54   ATTTTGGGTTACTGTTGGTTTTTGGGC
55 
56   TODO: 
57  1. Allow reading gzipped fasta files.
58  
59 */
60 
61 struct FastaRecord {
62     string header;
63     string sequence;
64     ulong lineLen;
65     string lineTerm = "\n";
66     
67     // split the header to array of fields 
68     @property string[] headerFields(){
69         return split(header, "|").map!strip.array;
70     }
71 
72     // return the length of the sequence
73     @property ulong len() {
74         return sequence.length;
75     }
76 
77     string toString() {
78         string seq;
79         ulong i = lineLen;
80         for (; i<sequence.length; i+=lineLen) {
81             seq~=sequence[i-lineLen..i]~"\n";
82         }
83         seq~=sequence[i-lineLen..$];
84         return format(">%s\n%s", header, seq);
85     }
86    
87     unittest {
88         auto recString = q"(
89             >chr2
90             acgtgagtgcacgtgagtgcacgtgagtgc
91             acgtgagtgcacgtgagtgc
92         )".outdent().strip();
93         auto header = "chr2";
94         auto seq = "acgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgc";
95         auto lineLen = 30;
96         auto rec = FastaRecord(header, seq, lineLen);
97         assert (rec.toString() == recString);
98         assert (rec.len == seq.length);
99 
100     }
101 }
102 
103 struct Region {
104     string reference;
105     ulong beg, end;
106     @property len() {
107         return end - beg;
108     }
109     
110     string toString() {
111         if ( end == 0 ) {
112             if ( beg == 0 )
113                 return reference;
114             else
115                 return format("%s:%s", reference, beg+1);
116         }
117         return format("%s:%s-%s", reference, beg+1, end);
118     }
119     
120     this(string q) {
121         auto res = q.split(":");
122         reference = res[0];
123         if ( res.length > 2) {
124             throw new Exception(format("Wrong format for region: %s", q));
125         } else if (res.length > 1) {
126             auto boundaries = res[1].replace(",","").split("-").map!(to!ulong);
127             beg = boundaries[0]-1;
128             if ( boundaries.length == 2 ) {
129                 end = boundaries[1];
130             }
131         }
132     }
133 
134     unittest {
135         assert ( Region("chr2:1-10").toString() == "chr2:1-10" );
136         assert ( Region("chr2:1-10").len == 10 );
137         assert ( Region("chr2").toString() == "chr2" );
138         assert ( Region("chr2:10").toString() == "chr2:10" );
139         assertThrown!Exception( Region("chr2:1:10") );
140 
141         auto region1 = Region("chr1:1,000-2000");
142         assert(region1.reference == "chr1");
143         assert(region1.beg == 999);
144         assert(region1.end == 2000);
145 
146         auto region2 = Region("chr2");
147         assert(region2.reference == "chr2");
148         assert(region2.beg == 0);
149         assert(region2.end == 0);
150 
151         auto region3 = Region("chr3:1,000,000");
152         assert(region3.reference == "chr3");
153         assert(region3.beg == 999_999);
154         assert(region3.end == 0);
155     }
156 }
157 
158 auto fastaRecords(string filename) {
159 
160     File f = File(filename);
161     FastaRecord[] records; 
162     string lineTerm = f.byLine(KeepTerminator.yes).take(1).front.endsWith("\r\n") ? "\r\n" : "\n";
163     f.seek(0);
164     ulong offset;
165     foreach(line; f.byLine(KeepTerminator.no, lineTerm)) {
166         offset+= line.length + lineTerm.length;
167         if ( line.startsWith(">") ) {
168             records~=FastaRecord();
169             records[$-1].lineTerm = lineTerm;
170             records[$-1].header ~= line[1..$];
171         } else {
172             if ( records[$-1].lineLen == 0 ) {
173                 records[$-1].lineLen = line.length;
174             }
175             records[$-1].sequence ~= line;
176         }
177     }
178 
179     return records;
180 }
181 
182 unittest {
183     auto testFa = tempDir.buildPath("test.fa");
184     scope(exit) testFa.remove;
185     File(testFa, "w").writeln(q"(
186         >chr1
187         acgtgagtgc
188         >chr2
189         acgtgagtgcacgtgagtgcacgtgagtgc
190         acgtgagtgcacgtgagtgc
191         >chr3 hrsv | Kilifi | partial sequence
192         CATGTTATTACAAGTAGTGATATTTGCCCTAATAATAATATTGTAGTGAAATCCAATTTCACAACAATGC
193     )".outdent().strip());
194     auto records = fastaRecords(testFa);
195     assert ( records.length == 3 );
196     assert ( records[0].header == "chr1" );
197     assert ( records[0].sequence == "acgtgagtgc" );
198     assert ( records[0].lineLen == 10 );
199     assert ( records[0].toString == q"(
200         >chr1
201         acgtgagtgc
202     )".outdent().strip());
203     assert ( records[1].header == "chr2" );
204     assert ( records[1].sequence == "acgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgc" );
205     assert ( records[1].lineLen == 30 );
206     assert ( records[1].toString == q"(
207         >chr2
208         acgtgagtgcacgtgagtgcacgtgagtgc
209         acgtgagtgcacgtgagtgc
210     )".outdent().strip());
211 
212     assert(records[2].header == "chr3 hrsv | Kilifi | partial sequence" );
213     assert(records[2].headerFields.length == 3);
214     assert(records[2].headerFields == ["chr3 hrsv","Kilifi","partial sequence"]);
215 }
216 
217 auto fastaRegions(string filename, string[] queries) {
218     File f = File(filename);
219     FaiRecord[string] index = makeIndex(readFai(filename~=".fai"));
220     Region[] regions = to!(Region[])(queries);
221     return fetchFastaRegions(f, index, regions);
222 }
223 
224 auto fetchFastaRegions(File fasta, FaiRecord[string] index, Region[] regions) {
225 
226     FastaRecord[] records;
227 
228     foreach (region; regions) {
229         string seq;
230         ulong len;
231         if ( region.reference in index ) {
232             auto reference = index[region.reference];
233             fasta.seek(reference.offset+region.beg+region.beg/reference.lineLen);
234             size_t bufLen;
235             if ( region.end == 0 ) 
236                 bufLen = reference.seqLen + reference.seqLen/reference.lineLen;
237             else
238                 bufLen = region.len + region.len/reference.lineLen;
239             seq = fasta.rawRead(new char[bufLen]).replace(reference.lineTerm,"").idup;
240             len = reference.lineLen;
241         }
242         records ~= FastaRecord(region.to!string, seq, len);
243     }
244 
245     return records;    
246 }
247 
248 unittest {
249     auto testFa = tempDir.buildPath("test.fa");
250     scope(exit) testFa.remove;
251     File(testFa, "w").writeln(q"(
252         >chr1
253         acgtgagtgc
254         >chr2
255         acgtgagtgcacgtgagtgcacgtgagtgc
256         acgtgagtgcacgtgagtgc
257     )".outdent().strip());
258     auto faiString = "
259         chr1\t10\t6\t10\t11
260         chr2\t50\t23\t30\t31
261     ".outdent().strip();
262     auto testIndex = tempDir.buildPath("test.fa.fai");
263     scope(exit) testIndex.remove;
264     File(testIndex, "w").writeln(faiString);
265     
266     auto regions = fastaRegions(testFa, ["chr1:4-6", "chr2:36-45"]);
267     assert ( regions.length == 2 );
268     assert ( regions[0].header == "chr1:4-6" );
269     assert ( regions[0].len == 3 );
270     assert ( regions[0].sequence == "tga" );
271     assert ( regions[0].lineLen == 10 );
272     assert ( regions[1].header == "chr2:36-45" );
273     assert ( regions[1].len == 10 );
274     assert ( regions[1].sequence == "agtgcacgtg" );
275     assert ( regions[1].lineLen == 30 );
276     
277     regions = fastaRegions(testFa, ["chr1"]);
278     assert ( regions.length == 1 );
279     assert ( regions[0].header == "chr1" );
280     assert ( regions[0].len == 10 );
281     assert ( regions[0].sequence == "acgtgagtgc" );
282     assert ( regions[0].lineLen == 10 );
283     
284     regions = fastaRegions(testFa, ["chr2"]);
285     assert ( regions.length == 1 );
286     assert ( regions[0].header == "chr2" );
287     assert ( regions[0].len == 50 );
288     assert ( regions[0].sequence == "acgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgc" );
289     assert ( regions[0].lineLen == 30 );
290 }