1 /* 2 This file is part of BioD. 3 Copyright (C) 2016 George Githinji <biorelated@gmail.com> 4 Copyright (C) 2018 Emilio Palumbo <emiliopalumbo@gmail.com> 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 */ 23 24 module bio.std.file.fasta; 25 26 import std.array; 27 import std..string; 28 import std.algorithm; 29 import std.stdio; 30 import std.range; 31 import std.conv; 32 import std.math; 33 import std.typecons; 34 import std.exception; 35 import std.path; 36 import std.file; 37 import bio.std.file.fai; 38 39 /* 40 A text-based single-letter format for representing 41 nucleotide (nt) and amino-acid (aa) sequences. 42 43 The ">" symbol/character marks the start of a fasta entry. 44 Each fasta entry comprise of an alphanumeric definition line followed by a 45 newline character and a single or multiline sequence of IUPAC codes used to 46 represent nucleotide or amino-acid sequences. 47 48 An example of a nucleotide fasta file 49 50 >Entry1_ID header field1|field2|... 51 TTGACGGGTTTTTGTCCTGATT 52 53 >Entry2_ID header field1|field2|... 54 ATTTTGGGTTACTGTTGGTTTTTGGGC 55 56 TODO: 57 1. Allow reading gzipped fasta files. 58 59 */ 60 61 struct FastaRecord { 62 string header; 63 string sequence; 64 ulong lineLen; 65 string lineTerm = "\n"; 66 67 // split the header to array of fields 68 @property string[] headerFields(){ 69 return split(header, "|").map!strip.array; 70 } 71 72 // return the length of the sequence 73 @property ulong len() { 74 return sequence.length; 75 } 76 77 string toString() { 78 string seq; 79 ulong i = lineLen; 80 for (; i<sequence.length; i+=lineLen) { 81 seq~=sequence[i-lineLen..i]~"\n"; 82 } 83 seq~=sequence[i-lineLen..$]; 84 return format(">%s\n%s", header, seq); 85 } 86 87 unittest { 88 auto recString = q"( 89 >chr2 90 acgtgagtgcacgtgagtgcacgtgagtgc 91 acgtgagtgcacgtgagtgc 92 )".outdent().strip(); 93 auto header = "chr2"; 94 auto seq = "acgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgc"; 95 auto lineLen = 30; 96 auto rec = FastaRecord(header, seq, lineLen); 97 assert (rec.toString() == recString); 98 assert (rec.len == seq.length); 99 100 } 101 } 102 103 struct Region { 104 string reference; 105 ulong beg, end; 106 @property len() { 107 return end - beg; 108 } 109 110 string toString() { 111 if ( end == 0 ) { 112 if ( beg == 0 ) 113 return reference; 114 else 115 return format("%s:%s", reference, beg+1); 116 } 117 return format("%s:%s-%s", reference, beg+1, end); 118 } 119 120 this(string q) { 121 auto res = q.split(":"); 122 reference = res[0]; 123 if ( res.length > 2) { 124 throw new Exception(format("Wrong format for region: %s", q)); 125 } else if (res.length > 1) { 126 auto boundaries = res[1].replace(",","").split("-").map!(to!ulong); 127 beg = boundaries[0]-1; 128 if ( boundaries.length == 2 ) { 129 end = boundaries[1]; 130 } 131 } 132 } 133 134 unittest { 135 assert ( Region("chr2:1-10").toString() == "chr2:1-10" ); 136 assert ( Region("chr2:1-10").len == 10 ); 137 assert ( Region("chr2").toString() == "chr2" ); 138 assert ( Region("chr2:10").toString() == "chr2:10" ); 139 assertThrown!Exception( Region("chr2:1:10") ); 140 141 auto region1 = Region("chr1:1,000-2000"); 142 assert(region1.reference == "chr1"); 143 assert(region1.beg == 999); 144 assert(region1.end == 2000); 145 146 auto region2 = Region("chr2"); 147 assert(region2.reference == "chr2"); 148 assert(region2.beg == 0); 149 assert(region2.end == 0); 150 151 auto region3 = Region("chr3:1,000,000"); 152 assert(region3.reference == "chr3"); 153 assert(region3.beg == 999_999); 154 assert(region3.end == 0); 155 } 156 } 157 158 auto fastaRecords(string filename) { 159 160 File f = File(filename); 161 FastaRecord[] records; 162 string lineTerm = f.byLine(KeepTerminator.yes).take(1).front.endsWith("\r\n") ? "\r\n" : "\n"; 163 f.seek(0); 164 ulong offset; 165 foreach(line; f.byLine(KeepTerminator.no, lineTerm)) { 166 offset+= line.length + lineTerm.length; 167 if ( line.startsWith(">") ) { 168 records~=FastaRecord(); 169 records[$-1].lineTerm = lineTerm; 170 records[$-1].header ~= line[1..$]; 171 } else { 172 if ( records[$-1].lineLen == 0 ) { 173 records[$-1].lineLen = line.length; 174 } 175 records[$-1].sequence ~= line; 176 } 177 } 178 179 return records; 180 } 181 182 unittest { 183 auto testFa = tempDir.buildPath("test.fa"); 184 scope(exit) testFa.remove; 185 File(testFa, "w").writeln(q"( 186 >chr1 187 acgtgagtgc 188 >chr2 189 acgtgagtgcacgtgagtgcacgtgagtgc 190 acgtgagtgcacgtgagtgc 191 >chr3 hrsv | Kilifi | partial sequence 192 CATGTTATTACAAGTAGTGATATTTGCCCTAATAATAATATTGTAGTGAAATCCAATTTCACAACAATGC 193 )".outdent().strip()); 194 auto records = fastaRecords(testFa); 195 assert ( records.length == 3 ); 196 assert ( records[0].header == "chr1" ); 197 assert ( records[0].sequence == "acgtgagtgc" ); 198 assert ( records[0].lineLen == 10 ); 199 assert ( records[0].toString == q"( 200 >chr1 201 acgtgagtgc 202 )".outdent().strip()); 203 assert ( records[1].header == "chr2" ); 204 assert ( records[1].sequence == "acgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgc" ); 205 assert ( records[1].lineLen == 30 ); 206 assert ( records[1].toString == q"( 207 >chr2 208 acgtgagtgcacgtgagtgcacgtgagtgc 209 acgtgagtgcacgtgagtgc 210 )".outdent().strip()); 211 212 assert(records[2].header == "chr3 hrsv | Kilifi | partial sequence" ); 213 assert(records[2].headerFields.length == 3); 214 assert(records[2].headerFields == ["chr3 hrsv","Kilifi","partial sequence"]); 215 } 216 217 auto fastaRegions(string filename, string[] queries) { 218 File f = File(filename); 219 FaiRecord[string] index = makeIndex(readFai(filename~=".fai")); 220 Region[] regions = to!(Region[])(queries); 221 return fetchFastaRegions(f, index, regions); 222 } 223 224 auto fetchFastaRegions(File fasta, FaiRecord[string] index, Region[] regions) { 225 226 FastaRecord[] records; 227 228 foreach (region; regions) { 229 string seq; 230 ulong len; 231 if ( region.reference in index ) { 232 auto reference = index[region.reference]; 233 fasta.seek(reference.offset+region.beg+region.beg/reference.lineLen); 234 size_t bufLen; 235 if ( region.end == 0 ) 236 bufLen = reference.seqLen + reference.seqLen/reference.lineLen; 237 else 238 bufLen = region.len + region.len/reference.lineLen; 239 seq = fasta.rawRead(new char[bufLen]).replace(reference.lineTerm,"").idup; 240 len = reference.lineLen; 241 } 242 records ~= FastaRecord(region.to!string, seq, len); 243 } 244 245 return records; 246 } 247 248 unittest { 249 auto testFa = tempDir.buildPath("test.fa"); 250 scope(exit) testFa.remove; 251 File(testFa, "w").writeln(q"( 252 >chr1 253 acgtgagtgc 254 >chr2 255 acgtgagtgcacgtgagtgcacgtgagtgc 256 acgtgagtgcacgtgagtgc 257 )".outdent().strip()); 258 auto faiString = " 259 chr1\t10\t6\t10\t11 260 chr2\t50\t23\t30\t31 261 ".outdent().strip(); 262 auto testIndex = tempDir.buildPath("test.fa.fai"); 263 scope(exit) testIndex.remove; 264 File(testIndex, "w").writeln(faiString); 265 266 auto regions = fastaRegions(testFa, ["chr1:4-6", "chr2:36-45"]); 267 assert ( regions.length == 2 ); 268 assert ( regions[0].header == "chr1:4-6" ); 269 assert ( regions[0].len == 3 ); 270 assert ( regions[0].sequence == "tga" ); 271 assert ( regions[0].lineLen == 10 ); 272 assert ( regions[1].header == "chr2:36-45" ); 273 assert ( regions[1].len == 10 ); 274 assert ( regions[1].sequence == "agtgcacgtg" ); 275 assert ( regions[1].lineLen == 30 ); 276 277 regions = fastaRegions(testFa, ["chr1"]); 278 assert ( regions.length == 1 ); 279 assert ( regions[0].header == "chr1" ); 280 assert ( regions[0].len == 10 ); 281 assert ( regions[0].sequence == "acgtgagtgc" ); 282 assert ( regions[0].lineLen == 10 ); 283 284 regions = fastaRegions(testFa, ["chr2"]); 285 assert ( regions.length == 1 ); 286 assert ( regions[0].header == "chr2" ); 287 assert ( regions[0].len == 50 ); 288 assert ( regions[0].sequence == "acgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgc" ); 289 assert ( regions[0].lineLen == 30 ); 290 }