1 /*
2     This file is part of BioD.
3     Copyright (C) 2012-2017    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11     
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14     
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 module bio.std.hts.bam.bai.indexing;
25 
26 import bio.std.hts.bam.reader;
27 import bio.std.hts.bam.readrange;
28 import bio.std.hts.bam.constants;
29 
30 import bio.std.hts.bam.bai.bin;
31 import bio.core.bgzf.chunk;
32 
33 import contrib.undead.stream;
34 import std.array;
35 import std.algorithm;
36 import std.system;
37 import std.exception;
38 import core.stdc..string;
39 
40 // Suppose we have an alignment which covers bases on a reference,
41 // starting from one position and ending at another position.
42 // In order to build linear index, we need to find to which windows
43 // the two positions correspond. 
44 //
45 //
46 // (K = 16384)
47 //
48 // [0, K)[K, 2K)[2K, 3K)...         <- windows
49 //    [.......)                     <- alignment
50 // 
51 private size_t toLinearIndexOffset(int position) {
52     return position < 0 ? 0 : position / BAI_LINEAR_INDEX_WINDOW_SIZE;
53 }
54 
55 ///
56 struct IndexBuilder {
57     private {
58         // array of linear offsets for the current reference entry
59         VirtualOffset[] _linear_index;
60         // (maximum index in _linear_index where data was written) + 1
61         size_t _linear_index_write_length;
62 
63         static struct PreviousRead {
64             int ref_id = -1;
65             int position;
66             int end_position;
67             int basesCovered() { return end_position - position; }
68             Bin bin;
69             bool is_unmapped;
70 
71             private char[256] _name_buf; // by spec., name length is <= 255 chars
72             string name;
73 
74             VirtualOffset start_virtual_offset;
75             VirtualOffset end_virtual_offset;
76         }
77 
78         PreviousRead _prev_read;
79 
80         Stream _stream;
81         int _n_refs;
82 
83         ulong _no_coord = 0;
84         // metadata for each reference
85         ulong _beg_vo = -1UL;
86         ulong _end_vo = 0;
87         ulong _unmapped = 0;
88         ulong _mapped = 0;
89 
90         bool _first_read = true;
91 
92         // map: bin ID -> array of chunks
93         Chunk[][uint] _chunks;
94 
95         VirtualOffset _current_chunk_beg; // start of current chunk
96 
97         // no metadata for empty references
98         void writeEmptyReference() { 
99             _stream.write(cast(int)0); // n_bins
100             _stream.write(cast(int)0); // n_intv
101         }
102 
103         void updateLastReadInfo(ref BamReadBlock read) {
104             with (_prev_read) {
105                 ref_id = read.ref_id;
106                 position = read.position;
107                 end_position = position + read.basesCovered();
108                 bin = read.bin;
109                 is_unmapped = read.is_unmapped;
110                 _name_buf[0 .. read.name.length] = read.name[];
111                 name = cast(string)_name_buf[0 .. read.name.length];
112                 start_virtual_offset = read.start_virtual_offset;
113                 end_virtual_offset = read.end_virtual_offset;
114             }
115         }
116 
117         void updateMetadata(ref BamReadBlock read) {
118             if (read.ref_id == -1) {
119                 ++_no_coord;
120             } else {
121                 if (read.is_unmapped) {
122                     ++_unmapped;
123                 } else {
124                     ++_mapped;
125                 }
126 
127                 if (_beg_vo == -1UL)
128                     _beg_vo = cast(ulong)read.start_virtual_offset;
129                 _end_vo = cast(ulong)read.end_virtual_offset;
130             }
131         }
132 
133         void updateLinearIndex() {
134             assert(_prev_read.ref_id >= 0);
135 
136             size_t beg, end;
137 
138             if (_prev_read.is_unmapped) {
139                 end = beg = toLinearIndexOffset(_prev_read.position);
140             } else {
141                 beg = toLinearIndexOffset(_prev_read.position);
142                 end = toLinearIndexOffset(_prev_read.position + _prev_read.basesCovered() - 1);
143             }
144 
145             debug {
146                 import std.stdio;
147                 if (end >= _linear_index.length) {
148                     writeln("beg: ", beg);
149                     writeln("end: ", end);
150                     writeln("pos: ", _prev_read.position);
151                     writeln("bases: ", _prev_read.basesCovered());
152                 }
153             }
154 
155             foreach (i; beg .. end + 1)
156                 if (_linear_index[i] == 0UL)
157                     _linear_index[i] = _prev_read.start_virtual_offset;
158 
159             if (end + 1 > _linear_index_write_length)
160                 _linear_index_write_length = end + 1;
161         }
162 
163         void dumpCurrentLinearIndex() {
164             _stream.write(cast(int)_linear_index_write_length);
165 
166             //                                                                 
167             // There might be untouched places in linear index                 
168             // with virtual offset equal to zero.                              
169             // However, it's not a good idea to leave those zeros,             
170             // since we can start lookup from the last non-zero virtual offset 
171             // encountered before the untouched window.                        
172             //                                                                 
173             VirtualOffset last_voffset = 0;
174 
175             foreach (voffset; _linear_index[0 .. _linear_index_write_length]) {
176                 if (voffset == 0)
177                     voffset = last_voffset;
178                 else
179                     last_voffset = voffset;
180                 _stream.write(cast(ulong)voffset);
181             }
182         }
183 
184         void dumpCurrentReference() {
185             // +1 because we output dummy bin, too
186             _stream.write(cast(int)(_chunks.length + 1));
187 
188             foreach (bin_id, bin_chunks; _chunks) {
189                 if (bin_chunks.length > 0) {
190                     _stream.write(cast(uint)bin_id);
191                     _stream.write(cast(int)bin_chunks.length);
192                     foreach (chunk; bin_chunks) {
193                         _stream.write(cast(ulong)chunk.beg);
194                         _stream.write(cast(ulong)chunk.end);
195                     }
196                 }
197             }
198             _stream.write(cast(uint)37450);
199             _stream.write(cast(int)2);
200             _stream.write(cast(ulong)_beg_vo);
201             _stream.write(cast(ulong)_end_vo);
202             _stream.write(cast(ulong)_mapped);
203             _stream.write(cast(ulong)_unmapped);
204 
205             dumpCurrentLinearIndex();
206 
207             // reset data
208             memset(_linear_index.ptr, 0, _linear_index.length * ulong.sizeof);
209             _linear_index_write_length = 0;
210             _chunks = null;
211             _current_chunk_beg = _prev_read.end_virtual_offset;
212 
213             _beg_vo = _end_vo = cast(ulong)_current_chunk_beg;
214             _unmapped = 0;
215             _mapped = 0;
216         }
217 
218         // adds chunk to the current bin (which is determined from _prev_read)
219         void updateChunks() {
220             auto current_chunk_end = _prev_read.end_virtual_offset;
221 
222             auto bin_id = _prev_read.bin.id;
223 
224             if (bin_id !in _chunks)
225                 _chunks[bin_id] = [];
226             auto cs = _chunks[bin_id];
227 
228             bool canMergeWithPreviousChunk() {
229                 assert(cs.length > 0);
230                 auto last_chunk = cs[$ - 1];
231 
232                 if (last_chunk.end.coffset == _current_chunk_beg.coffset)
233                     return true;
234 
235                 return false;
236             }
237 
238             if (cs.length == 0 || !canMergeWithPreviousChunk()) {
239                 auto new_chunk = Chunk(_current_chunk_beg, current_chunk_end);
240                 _chunks[_prev_read.bin.id] ~= new_chunk;
241             } else {
242                 _chunks[_prev_read.bin.id][$ - 1].end = current_chunk_end;
243             }
244 
245             _current_chunk_beg = current_chunk_end;
246         }
247 
248         void checkThatBinIsCorrect(ref BamReadBlock read) {
249             if (!check_bins)
250                 return;
251             auto expected = reg2bin(read.position, 
252                                     read.position + read.basesCovered());
253             enforce(read.bin.id == expected,
254                     "Bin in read with name '" ~ read.name ~ 
255                     "' is set incorrectly (" ~ to!string(read.bin.id) ~ 
256                     " instead of expected " ~ to!string(expected) ~ ")");
257         }
258 
259         void checkThatInputIsSorted(ref BamReadBlock read) {
260             if (_first_read) return;
261             if (read.ref_id == -1) return; // unmapped
262             if (_prev_read.ref_id < read.ref_id) return;
263 
264             enforce(read.ref_id == _prev_read.ref_id && 
265                     read.position >= _prev_read.position,
266                     "BAM file is not coordinate-sorted: " ~
267                     "read '" ~ read.name ~ "' (" ~ read.ref_id.to!string ~ ":" ~ read.position.to!string ~ ")" ~
268                     " must be after read '" ~ _prev_read.name ~ "' (" ~ _prev_read.ref_id.to!string ~ ":" ~ _prev_read.position.to!string ~ ")" ~
269                     "' (at virtual offsets " ~ 
270                     to!string(_prev_read.start_virtual_offset) ~ ", " ~ read.start_virtual_offset.to!string ~ ")");
271         }
272     }
273 
274     ///
275     this(Stream output_stream, int number_of_references) {
276         _stream = new EndianStream(output_stream, Endian.littleEndian);
277         _n_refs = number_of_references;
278 
279         size_t size = BAI_MAX_BIN_ID - BAI_MAX_NONLEAF_BIN_ID + 1;
280         _linear_index = new VirtualOffset[](size);
281 
282         _stream.writeString(BAI_MAGIC);            // write BAI magic string
283         _stream.write(cast(int)_n_refs);           // and number of references
284     }
285 
286     /// Check that bins are correct.
287     bool check_bins;
288 
289     /// Add a read. The reads must be put in coordinate-sorted order.
290     void put(BamReadBlock read) {
291         checkThatInputIsSorted(read);
292         scope(exit) updateMetadata(read);
293 
294         if (read.ref_id < 0)
295             return;
296 
297         // start position is unavailable, skip
298         if (read.position < 0)
299             return;
300 
301         if (_first_read) {
302             updateLastReadInfo(read);
303             _first_read = false;
304             _current_chunk_beg = read.start_virtual_offset;
305 
306             if (read.ref_id > 0)
307                 foreach (i; 0 .. read.ref_id)
308                     writeEmptyReference();
309 
310             return;
311         }
312 
313         checkThatBinIsCorrect(read);
314 
315         // new reference, so write data for previous one(s)
316         if (read.ref_id > _prev_read.ref_id) {
317             updateLinearIndex();
318             updateChunks();
319             dumpCurrentReference();
320             
321             foreach (i; _prev_read.ref_id + 1 .. read.ref_id)
322                 writeEmptyReference();
323         }
324 
325         if (read.ref_id == _prev_read.ref_id) {
326             updateLinearIndex();
327 
328             if (read.bin.id != _prev_read.bin.id)
329                 updateChunks();
330         }
331 
332         updateLastReadInfo(read);
333     }
334 
335     /// Closes the stream
336     void finish() {
337         if (!_first_read) { // at least one was processed
338             assert(_prev_read.ref_id >= 0);
339             updateLinearIndex();
340             updateChunks();
341             dumpCurrentReference();
342         }
343 
344         // _prev_read.ref_id == -1 if all are unmapped
345         foreach (i; _prev_read.ref_id + 1 .. _n_refs)
346             writeEmptyReference();
347 
348         _stream.write(cast(ulong)_no_coord);
349         _stream.close();
350     }
351 }
352 
353 /// Writes BAM index to the $(D stream)
354 ///
355 /// Accepts optional $(D progressBarFunc)
356 void createIndex(BamReader bam, Stream stream, bool check_bins=false,
357                  void delegate(lazy float p) progressBarFunc=null)
358 {
359     auto n_refs = cast(int)bam.reference_sequences.length;
360     auto index_builder = IndexBuilder(stream, n_refs);
361     index_builder.check_bins = check_bins;
362     auto reads = bam.readsWithProgress!withOffsets(progressBarFunc);
363     foreach (read; reads)
364         index_builder.put(read);
365     index_builder.finish();
366 }