1 /*
2     This file is part of BioD.
3     Copyright (C) 2012    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11     
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14     
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 module bio.bam.baifile;
25 
26 public import bio.core.bgzf.chunk;
27 public import bio.bam.bai.bin;
28 import bio.core.bgzf.virtualoffset;
29 import bio.bam.constants;
30 
31 import undead.stream;
32 import std.system;
33 import std.exception;
34 import std.algorithm;
35 import std.conv;
36 import std.range;
37 import std.file;
38 import std.path;
39 
40 /// Represents index for a single reference
41 struct Index {
42     /// Information about bins
43     Bin[] bins;
44 
45     /// Virtual file offsets of first alignments overlapping 16384-byte windows
46     /// on the reference sequence. This linear index is used to reduce amount
47     /// of file seeks for region queries, since with its help one can reduce the
48     /// number of chunks to be investigated based on their end position.
49     ///
50     ///
51     /// Suppose you have a region [beg, end) and want to do a query.
52     ///
53     /// Here's the reference:
54     /// [....................!..............!.................................]
55     ///                     beg            end
56     ///
57     /// Here's the same reference with 16384-byte long windows:
58     /// [%...........%.......!....%.........!..%...........%...........%......]
59     ///                     beg            end
60     /// [ 1st window][ 2nd window][...
61     ///
62     /// With linear index, we can take the second window, find out what is 
63     /// the minimum virtual offset among alignments overlapping this window,
64     /// and skip all chunks which end position is less or equal to this offset:
65     ///
66     /// [........@...........!..............!.................................]
67     ///   .  ..min. offset   beg           end
68     ///   [  ).        .                              <- this chunk is skipped
69     ///       [        )                              <- this one is not
70     ///
71     VirtualOffset[] ioffsets; 
72 
73     /// Get (approximate) virtual offset of the first alignment overlapping $(D position)
74     /// 
75     /// Returned virtual offset is less or equal to real offset.
76     VirtualOffset getMinimumOffset(int position) {
77         int pos = max(0, position);
78         int _i = min(pos / BAI_LINEAR_INDEX_WINDOW_SIZE, cast(int)ioffsets.length - 1);
79         auto min_offset = (_i == -1) ? VirtualOffset(0) : ioffsets[_i];
80         return min_offset;
81     }
82 }
83 
84 struct BaiFile {
85     Index[] indices;
86 
87     /// Initialize from stream which contains BAI data
88     this(ref Stream stream) {
89         _stream = stream;
90         parse();
91     }
92 
93     /// Open BAI file given either filename of BAM file or that of BAI file.
94     this(string filename) {
95         Stream fstream;
96 
97         if (!endsWith(filename, ".bai")) {
98             /// Unfortunately, std.path.addExt is going to be deprecated
99 
100             auto first_filename = filename ~ ".bai";
101             auto second_filename = to!string(retro(find(retro(filename), '.'))) ~ "bai";
102 
103             if (std.file.exists(first_filename)) {
104                 fstream = new BufferedFile(absolutePath(first_filename));
105             } else {
106                 if (std.file.exists(second_filename)) {
107                     fstream = new BufferedFile(absolutePath(second_filename));
108                 } else {
109                     throw new Exception("searched for " ~ first_filename ~ " or " ~
110                                         second_filename ~ ", found neither");
111                 }
112             }
113         } else {
114             fstream = new BufferedFile(filename);
115         }
116 
117         Stream estream = new EndianStream(fstream, Endian.littleEndian);
118         this(estream);
119     }
120 
121 private:
122     Stream _stream;
123 
124     /// according to section 4.2 of SAM/BAM specification
125     void parse() {
126         auto magic = _stream.readString(4);
127         enforce(magic == "BAI\1", "Invalid file format: expected BAI\\1");
128 
129         int n_ref;
130         _stream.read(n_ref);
131         indices = uninitializedArray!(Index[])(n_ref);
132 
133         foreach (i; 0 .. n_ref) {
134             int n_bin = void;
135             _stream.read(n_bin);
136             indices[i].bins = uninitializedArray!(Bin[])(n_bin);
137             
138             foreach (j; 0 .. n_bin) {
139                 uint id = void;
140                 _stream.read(id);
141                 auto bin = Bin(id);
142 
143                 int n_chunk = void;
144                 _stream.read(n_chunk);
145                 bin.chunks = uninitializedArray!(Chunk[])(n_chunk);
146                 
147                 foreach (k; 0 .. n_chunk) {
148                     ulong tmp = void;
149                     _stream.read(tmp);
150                     bin.chunks[k].beg = VirtualOffset(tmp);
151                     _stream.read(tmp);
152                     bin.chunks[k].end = VirtualOffset(tmp);
153                 }
154                 
155                 indices[i].bins[j] = bin;
156             }
157 
158             int n_intv = void;
159             _stream.read(n_intv);
160             indices[i].ioffsets = uninitializedArray!(VirtualOffset[])(n_intv);
161 
162             foreach (j; 0 .. n_intv) {
163                 ulong tmp = void;
164                 _stream.read(tmp);
165                 indices[i].ioffsets[j] = VirtualOffset(tmp);
166             }
167         }
168     }
169 }