1 /*
2     This file is part of BioD.
3     Copyright (C) 2012-2014    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11     
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14     
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 /// $(P Each BAM file contains reads aligned to different reference sequences.)
25 /// $(P These sequences have unique identifiers in BAM file, starting from 0.
26 /// Unmapped reads are associated with id = -1.)
27 /// $(P If BAI file is available, fast region queries are available, that is,
28 /// getting all reads that overlap given region. This is achieved via $(D opSlice) method.)
29 ///
30 /// Example:
31 /// -----------------------------
32 /// import bio.bam.reader, std.stdio;
33 /// ...
34 /// auto bam = new BamReader("file.bam");
35 /// auto refseq = bam["chr17"];
36 /// writeln(refseq.name, " - length ", refseq.length);
37 /// foreach (read; refseq[1234 .. 5678])
38 ///     if (read.cigar.length > 1)
39 ///         writeln(read.name, " ", read.cigarString());
40 /// -----------------------------
41 module bio.bam.reference;
42 
43 public import bio.bam.referenceinfo;
44 
45 import bio.bam.readrange;
46 import bio.bam.region;
47 import bio.bam.randomaccessmanager;
48 import bio.core.bgzf.virtualoffset;
49 
50 import undead.stream;
51 import std.exception;
52 import std.array;
53 
54 ///
55 struct ReferenceSequence {
56     private int _ref_id;
57    
58     /// Name
59     string name() @property const {
60         return _info.name;
61     }
62 
63     /// Length in base pairs
64     int length() @property const {
65         return _info.length;
66     }
67 
68     /// Reference ID
69     int id() @property const {
70         return _ref_id;
71     }
72 
73     /// Get alignments overlapping [start, end) region.
74     /// $(BR)
75     /// Coordinates are 0-based.
76     auto opSlice(uint start, uint end) {
77         enforce(start < end, "start must be less than end");
78         enforce(_manager !is null, "random access is not available");
79         enforce(_ref_id >= 0, "invalid reference id");
80         return _manager.getReads(BamRegion(cast(uint)_ref_id, start, end));
81     }
82 
83     /// Get all alignments for this reference
84     auto opSlice() {
85         return opSlice(0, length);
86     }
87 
88     private alias typeof(opSlice().front) Read;
89     private Read _first_read() @property {
90         return opSlice().front.dup;
91     }
92 
93     /// Virtual offset at which reads, aligned to this reference, start in BAM file.
94     /// If there are no reads aligned to this reference, returns virtual
95     /// offset of the EOF block if it's presented, or the end of file.
96     bio.core.bgzf.virtualoffset.VirtualOffset startVirtualOffset() {
97         auto reads = opSlice();
98         if (reads.empty) {
99             return _manager.eofVirtualOffset();
100         }
101         return reads.front.start_virtual_offset;
102     }
103 
104     /// Virtual offset before which reads, aligned to this reference, stop.
105     /// If there are no reads aligned to this reference, returns virtual
106     /// offset of the EOF block if it's presented, or the end of file.
107     bio.core.bgzf.virtualoffset.VirtualOffset endVirtualOffset() {
108 
109         if (opSlice().empty) {
110             return _manager.eofVirtualOffset();
111         }
112 
113         auto ioffsets = _manager.getBai().indices[_ref_id].ioffsets[];
114         assert(ioffsets.length > 0);
115 
116         // Try to get startVirtualOffset of the next reference presented in the file.
117         for (uint r = _ref_id + 1; r < _manager.getBai().indices.length; ++r) {
118             auto reads = _manager.getReads(BamRegion(r, 0, uint.max));
119             if (reads.empty) {
120                 continue;
121             } else {
122                 return reads.front.start_virtual_offset;
123             }
124         }
125 
126         // However, this approach fails if there are unmapped reads coming after
127         // this reference. We cannot just return _manager.eofVirtualOffset.
128 
129         auto last_offset = ioffsets[$ - 1];
130         auto stream = _manager.createStreamStartingFrom(last_offset);
131         auto last_few_reads = bamReadRange!withOffsets(stream, null);
132 
133         VirtualOffset result;
134         assert(!last_few_reads.empty);
135         foreach (read; last_few_reads) {
136             if (read.ref_id == -1) break;
137             result = read.end_virtual_offset;
138         }
139 
140         return result;
141     }
142  
143     /// First position on the reference overlapped by reads (0-based)
144     /// $(BR)
145     /// Returns -1 if set of reads is empty.
146     int firstPosition() {
147         auto reads = opSlice();
148         if (reads.empty) {
149             return -1;
150         }
151         return reads.front.position;
152     }
153    
154     /// Last position on the reference overlapped by reads (0-based)
155     /// $(BR)
156     /// Returns -1 if set of reads is empty.
157     int lastPosition() {
158         // The key idea is
159         //  1) use last offset from linear index
160         //  2) loop through all remaining reads starting from there
161 
162         auto ioffsets = _manager.getBai().indices[_ref_id].ioffsets[];
163 
164         long index = ioffsets.length - 1;
165 
166         debug {
167             int reads_processed = 0;
168         }
169 
170         while (index >= 0) {
171             auto offset = ioffsets[cast(size_t)index];
172 
173             auto stream = _manager.createStreamStartingFrom(offset);
174             auto reads = bamReadRange(stream, null);
175 
176             int last_position = int.min;
177 
178             foreach (read; reads) {
179 
180                  debug {
181                      reads_processed += 1;
182                  }
183 
184                  if (read.ref_id != _ref_id) {
185                      break;
186                  }
187                 
188                  if (read.position == -1) {
189                      continue;
190                  }
191 
192                  auto end_pos = read.position + read.basesCovered();
193                  if (end_pos > last_position)
194                      last_position = end_pos;
195             }
196 
197             if (last_position != int.min) {
198                 debug {
199                     import std.stdio;
200                     stderr.writeln("[debug] ReferenceSequence.lastPosition() processed ",
201                                    reads_processed, " reads");
202                 }
203                 return last_position - 1;
204             }
205 
206             --index;
207         }
208 
209         return firstPosition();
210     }
211 
212     this(RandomAccessManager manager, int ref_id, ReferenceSequenceInfo info) {
213         _manager = manager;
214         _ref_id = ref_id;
215         _info = info;
216     }
217 
218 private:
219     RandomAccessManager _manager;
220     ReferenceSequenceInfo _info;
221 }