1 /* 2 This file is part of BioD. 3 Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 module bio.bam.baifile; 25 26 public import bio.core.bgzf.chunk; 27 public import bio.bam.bai.bin; 28 import bio.core.bgzf.virtualoffset; 29 import bio.bam.constants; 30 31 import undead.stream; 32 import std.system; 33 import std.exception; 34 import std.algorithm; 35 import std.conv; 36 import std.range; 37 import std.file; 38 import std.path; 39 40 /// Represents index for a single reference 41 struct Index { 42 /// Information about bins 43 Bin[] bins; 44 45 /// Virtual file offsets of first alignments overlapping 16384-byte windows 46 /// on the reference sequence. This linear index is used to reduce amount 47 /// of file seeks for region queries, since with its help one can reduce the 48 /// number of chunks to be investigated based on their end position. 49 /// 50 /// 51 /// Suppose you have a region [beg, end) and want to do a query. 52 /// 53 /// Here's the reference: 54 /// [....................!..............!.................................] 55 /// beg end 56 /// 57 /// Here's the same reference with 16384-byte long windows: 58 /// [%...........%.......!....%.........!..%...........%...........%......] 59 /// beg end 60 /// [ 1st window][ 2nd window][... 61 /// 62 /// With linear index, we can take the second window, find out what is 63 /// the minimum virtual offset among alignments overlapping this window, 64 /// and skip all chunks which end position is less or equal to this offset: 65 /// 66 /// [........@...........!..............!.................................] 67 /// . ..min. offset beg end 68 /// [ ). . <- this chunk is skipped 69 /// [ ) <- this one is not 70 /// 71 VirtualOffset[] ioffsets; 72 73 /// Get (approximate) virtual offset of the first alignment overlapping $(D position) 74 /// 75 /// Returned virtual offset is less or equal to real offset. 76 VirtualOffset getMinimumOffset(int position) { 77 int pos = max(0, position); 78 int _i = min(pos / BAI_LINEAR_INDEX_WINDOW_SIZE, cast(int)ioffsets.length - 1); 79 auto min_offset = (_i == -1) ? VirtualOffset(0) : ioffsets[_i]; 80 return min_offset; 81 } 82 } 83 84 struct BaiFile { 85 Index[] indices; 86 87 /// Initialize from stream which contains BAI data 88 this(ref Stream stream) { 89 _stream = stream; 90 parse(); 91 } 92 93 /// Open BAI file given either filename of BAM file or that of BAI file. 94 this(string filename) { 95 Stream fstream; 96 97 if (!endsWith(filename, ".bai")) { 98 /// Unfortunately, std.path.addExt is going to be deprecated 99 100 auto first_filename = filename ~ ".bai"; 101 auto second_filename = to!string(retro(find(retro(filename), '.'))) ~ "bai"; 102 103 if (std.file.exists(first_filename)) { 104 fstream = new BufferedFile(absolutePath(first_filename)); 105 } else { 106 if (std.file.exists(second_filename)) { 107 fstream = new BufferedFile(absolutePath(second_filename)); 108 } else { 109 throw new Exception("searched for " ~ first_filename ~ " or " ~ 110 second_filename ~ ", found neither"); 111 } 112 } 113 } else { 114 fstream = new BufferedFile(filename); 115 } 116 117 Stream estream = new EndianStream(fstream, Endian.littleEndian); 118 this(estream); 119 } 120 121 private: 122 Stream _stream; 123 124 /// according to section 4.2 of SAM/BAM specification 125 void parse() { 126 auto magic = _stream.readString(4); 127 enforce(magic == "BAI\1", "Invalid file format: expected BAI\\1"); 128 129 int n_ref; 130 _stream.read(n_ref); 131 indices = uninitializedArray!(Index[])(n_ref); 132 133 foreach (i; 0 .. n_ref) { 134 int n_bin = void; 135 _stream.read(n_bin); 136 indices[i].bins = uninitializedArray!(Bin[])(n_bin); 137 138 foreach (j; 0 .. n_bin) { 139 uint id = void; 140 _stream.read(id); 141 auto bin = Bin(id); 142 143 int n_chunk = void; 144 _stream.read(n_chunk); 145 bin.chunks = uninitializedArray!(Chunk[])(n_chunk); 146 147 foreach (k; 0 .. n_chunk) { 148 ulong tmp = void; 149 _stream.read(tmp); 150 bin.chunks[k].beg = VirtualOffset(tmp); 151 _stream.read(tmp); 152 bin.chunks[k].end = VirtualOffset(tmp); 153 } 154 155 indices[i].bins[j] = bin; 156 } 157 158 int n_intv = void; 159 _stream.read(n_intv); 160 indices[i].ioffsets = uninitializedArray!(VirtualOffset[])(n_intv); 161 162 foreach (j; 0 .. n_intv) { 163 ulong tmp = void; 164 _stream.read(tmp); 165 indices[i].ioffsets[j] = VirtualOffset(tmp); 166 } 167 } 168 } 169 }