1 /* 2 This file is part of BioD. 3 Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> 4 5 Permission is hereby granted, free of charge, to any person obtaining a 6 copy of this software and associated documentation files (the "Software"), 7 to deal in the Software without restriction, including without limitation 8 the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 and/or sell copies of the Software, and to permit persons to whom the 10 Software is furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 DEALINGS IN THE SOFTWARE. 22 23 */ 24 module bio.std.hts.bam.baifile; 25 26 public import bio.core.bgzf.chunk; 27 public import bio.std.hts.bam.bai.bin; 28 import bio.core.bgzf.virtualoffset; 29 import bio.std.hts.bam.constants; 30 31 //TODO Remove stream dependency! 32 import contrib.undead.stream; 33 import std.system; 34 import std.exception; 35 import std.algorithm; 36 import std.conv; 37 import std.range; 38 import std.file; 39 import std.path; 40 41 /// Represents index for a single reference 42 struct Index { 43 /// Information about bins 44 Bin[] bins; 45 46 /// Virtual file offsets of first alignments overlapping 16384-byte windows 47 /// on the reference sequence. This linear index is used to reduce amount 48 /// of file seeks for region queries, since with its help one can reduce the 49 /// number of chunks to be investigated based on their end position. 50 /// 51 /// 52 /// Suppose you have a region [beg, end) and want to do a query. 53 /// 54 /// Here's the reference: 55 /// [....................!..............!.................................] 56 /// beg end 57 /// 58 /// Here's the same reference with 16384-byte long windows: 59 /// [%...........%.......!....%.........!..%...........%...........%......] 60 /// beg end 61 /// [ 1st window][ 2nd window][... 62 /// 63 /// With linear index, we can take the second window, find out what is 64 /// the minimum virtual offset among alignments overlapping this window, 65 /// and skip all chunks which end position is less or equal to this offset: 66 /// 67 /// [........@...........!..............!.................................] 68 /// . ..min. offset beg end 69 /// [ ). . <- this chunk is skipped 70 /// [ ) <- this one is not 71 /// 72 VirtualOffset[] ioffsets; 73 74 /// Get (approximate) virtual offset of the first alignment overlapping $(D position) 75 /// 76 /// Returned virtual offset is less or equal to real offset. 77 VirtualOffset getMinimumOffset(int position) { 78 int pos = max(0, position); 79 int _i = min(pos / BAI_LINEAR_INDEX_WINDOW_SIZE, cast(int)ioffsets.length - 1); 80 auto min_offset = (_i == -1) ? VirtualOffset(0) : ioffsets[_i]; 81 return min_offset; 82 } 83 } 84 85 struct BaiFile { 86 Index[] indices; 87 88 /// Initialize from stream which contains BAI data 89 this(ref Stream stream) { 90 _stream = stream; 91 parse(); 92 } 93 94 /// Open BAI file given either filename of BAM file or that of BAI file. 95 this(string filename) { 96 Stream fstream; 97 98 if (!endsWith(filename, ".bai")) { 99 /// Unfortunately, std.path.addExt is going to be deprecated 100 101 auto first_filename = filename ~ ".bai"; 102 auto second_filename = to!string(retro(find(retro(filename), '.'))) ~ "bai"; 103 104 if (std.file.exists(first_filename)) { 105 fstream = new BufferedFile(absolutePath(first_filename)); 106 } else { 107 if (std.file.exists(second_filename)) { 108 fstream = new BufferedFile(absolutePath(second_filename)); 109 } else { 110 throw new Exception("searched for " ~ first_filename ~ " or " ~ 111 second_filename ~ ", found neither"); 112 } 113 } 114 } else { 115 fstream = new BufferedFile(filename); 116 } 117 118 Stream estream = new EndianStream(fstream, Endian.littleEndian); 119 this(estream); 120 } 121 122 private: 123 Stream _stream; 124 125 /// according to section 4.2 of SAM/BAM specification 126 void parse() { 127 auto magic = _stream.readString(4); 128 enforce(magic == "BAI\1", "Invalid file format: expected BAI\\1"); 129 130 int n_ref; 131 _stream.read(n_ref); 132 indices = uninitializedArray!(Index[])(n_ref); 133 134 foreach (i; 0 .. n_ref) { 135 int n_bin = void; 136 _stream.read(n_bin); 137 indices[i].bins = uninitializedArray!(Bin[])(n_bin); 138 139 foreach (j; 0 .. n_bin) { 140 uint id = void; 141 _stream.read(id); 142 auto bin = Bin(id); 143 144 int n_chunk = void; 145 _stream.read(n_chunk); 146 bin.chunks = uninitializedArray!(Chunk[])(n_chunk); 147 148 foreach (k; 0 .. n_chunk) { 149 ulong tmp = void; 150 _stream.read(tmp); 151 bin.chunks[k].beg = VirtualOffset(tmp); 152 _stream.read(tmp); 153 bin.chunks[k].end = VirtualOffset(tmp); 154 } 155 156 indices[i].bins[j] = bin; 157 } 158 159 int n_intv = void; 160 _stream.read(n_intv); 161 indices[i].ioffsets = uninitializedArray!(VirtualOffset[])(n_intv); 162 163 foreach (j; 0 .. n_intv) { 164 ulong tmp = void; 165 _stream.read(tmp); 166 indices[i].ioffsets[j] = VirtualOffset(tmp); 167 } 168 } 169 } 170 }