1 /*
2     This file is part of BioD.
3     Copyright (C) 2012-2014    Artem Tarasov <lomereiter@gmail.com>
4 
5     Permission is hereby granted, free of charge, to any person obtaining a
6     copy of this software and associated documentation files (the "Software"),
7     to deal in the Software without restriction, including without limitation
8     the rights to use, copy, modify, merge, publish, distribute, sublicense,
9     and/or sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following conditions:
11     
12     The above copyright notice and this permission notice shall be included in
13     all copies or substantial portions of the Software.
14     
15     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21     DEALINGS IN THE SOFTWARE.
22 
23 */
24 /// BAM records may carry arbitrary information in tags.
25 /// $(BR)
/// The $(D Value) type provides a convenient way to work with this information.
27 ///
28 /// Example:
29 /// --------------------------------
30 /// import bio.std.hts.bam.reader, bio.std.hts.bam.tagvalue;
31 /// ...
32 /// auto bam = new BamReader("file.bam");
33 /// Value v = bam.reads.front["MD"];
34 /// assert(v.is_string);
35 /// v = 5;
36 /// assert(v.is_signed);     // because 5 is of type int which is signed
37 /// assert(v == "5");        // converted to string and then compared
38 /// v = "abc";
39 /// assert(v.is_string);
40 /// v = [1, 2, 3];           // integer and float arrays are supported
41 /// assert(v.is_numeric_array);
42 /// v = [1.5f, 2.3f, 17.0f]; // double[] arrays must be converted to float[]
43 /// assert(v.is_numeric_array);
44 /// v = 5.6;
45 /// assert(v.is_float);
46 /// v = -17;
47 /// assert(v.is_signed);
48 /// ----------------------------------
49 module bio.std.hts.bam.tagvalue;
50 
51 public import std.conv;
52 import std.typetuple;
53 import std.exception;
54 import std.format;
55 import std.array;
56 import bio.core.utils.format;
57 
58 import bio.std.hts.thirdparty.msgpack;
59 
/// Compile-time pairing of a type character with the D type that holds
/// values of that kind.
struct CharToType(char c, T) {
    /// The type character (symbol).
    enum ch = c;

    /// The D type which corresponds to the symbol
    /// according to the SAM/BAM specification.
    alias ValueType = T;
}
69 
/**
  Thrown when a tag type character is not recognized.
 */
class UnknownTagTypeException : Exception {
    /// Params:
    ///     msg = text describing the unrecognized type
    this(string msg) {
        super(msg);
    }
}
76 
77 
/// Type characters of single (non-array, non-string) values and their D types.
alias PrimitiveTagValueTypes = TypeTuple!(
    CharToType!('A', char),
    CharToType!('c', byte),
    CharToType!('C', ubyte),
    CharToType!('s', short),
    CharToType!('S', ushort),
    CharToType!('i', int),
    CharToType!('I', uint),
    CharToType!('f', float)
);

/// Type characters of string values; both are held in a D $(D string).
alias StringTagValueTypes = TypeTuple!(
    CharToType!('Z', string),
    CharToType!('H', string)
);

/// Type characters allowed for the elements of an array value.
alias ArrayElementTagValueTypes = TypeTuple!(
    CharToType!('c', byte),
    CharToType!('C', ubyte),
    CharToType!('s', short),
    CharToType!('S', ushort),
    CharToType!('i', int),
    CharToType!('I', uint),
    CharToType!('f', float)
);
97 
/*
  Maps a primitive type identifier to the size of its D type.
  Useful in TagStorage implementations, for skipping elements.

  Params:
    c =         primitive type identifier

  Returns: size of corresponding type in bytes

  Throws: UnknownTagTypeException if c is not listed in PrimitiveTagValueTypes
*/
uint charToSizeof(char c) {
    // Produces, at compile time, the source of a switch over `c` with one
    // `case` per entry of PrimitiveTagValueTypes.
    string buildSizeSwitch() {
        string branches;
        foreach (entry; PrimitiveTagValueTypes) {
            branches ~= "case '" ~ entry.ch ~ "':" ~
                        "  return " ~ to!string(entry.ValueType.sizeof) ~ ";";
        }
        return "switch (c) { " ~ branches ~
               "  default: " ~
               "    throw new UnknownTagTypeException(to!string(c));" ~
               "}";
    }
    mixin(buildSizeSwitch());
}
120 
/*
  Binds a type to its ubyte identifier.

  (A ubyte is sufficient for now, but that might change in the future.)
*/
struct TypeId(T, ubyte id) {
    /// The type being identified.
    alias Type = T;

    /// The identifier assigned to $(D Type).
    enum Id = id;
}
130 
/*
  Structure of a type identifier, from the least significant bit upwards:

    bit 0     : 0 = primitive value, 1 = array or string
    bit 1     : set only for the null ("nothing") value
    bit 2     : primitives:     0 = numeric, 1 = char
                arrays/strings: 0 = numeric array, 1 = string
    bit 3     : numeric values and arrays: 0 = integer, 1 = float
                strings:                   0 = 'Z', 1 = 'H'
    bit 4     : integers and integer arrays: 0 = unsigned, 1 = signed
    bits 5..7 : size in bytes of the value or of the array element
                (1 for char and strings, 0 for null)

  For example, uint is 0b100_0_0_0_0_0:
  4 bytes, unsigned, integer, numeric, something (not null), primitive.

  For numeric types and numeric arrays, (TypeId >> 5) == elementType.sizeof
*/
alias TypeIdMap = TypeTuple!(
    TypeId!(char,         0b001_00_1_00),

    // unsigned integers
    TypeId!(ubyte,        0b001_0_0000),
    TypeId!(ushort,       0b010_0_0000),
    TypeId!(uint,         0b100_0_0000),

    // signed integers
    TypeId!(byte,         0b001_1_0000),
    TypeId!(short,        0b010_1_0000),
    TypeId!(int,          0b100_1_0000),

    TypeId!(float,        0b100_01_000),

    // arrays of unsigned integers
    TypeId!(ubyte[],      0b001_000_01),
    TypeId!(ushort[],     0b010_000_01),
    TypeId!(uint[],       0b100_000_01),

    // arrays of signed integers
    TypeId!(byte[],       0b001_100_01),
    TypeId!(short[],      0b010_100_01),
    TypeId!(int[],        0b100_100_01),

    TypeId!(float[],      0b100_01_001),

    // string appears twice: first the 'Z' identifier, then the 'H' one
    TypeId!(string,       0b001_00_101),
    TypeId!(string,       0b001_01_101),

    TypeId!(typeof(null), 0b0000_0010)
);

// Identifier of hexadecimal ('H') strings; equals the second string entry above.
private immutable hexStringTag = 0b001_01_101;
185 
// Extracts the Type member of a TypeId instantiation.
private template GetType(U) {
    alias GetType = U.Type;
}

/// Get tag for type T.
///
/// Useful for comparison with tag field of Value struct.
/// If T is listed more than once in TypeIdMap, the first entry is the one used.
///
/// Example:
/// -----------------------------------
/// Value v = "zzz";
/// assert(v.tag == GetTypeId!string);
/// -----------------------------------
template GetTypeId(T) {
    // types of TypeIdMap, in the same order as the map itself
    alias KnownTypes = staticMap!(GetType, TypeIdMap);

    // index of T among them
    enum position = staticIndexOf!(T, KnownTypes);

    ///
    enum GetTypeId = TypeIdMap[position].Id;
}
203 
/// Generates the source of a union $(D U) with one member per supported tag
/// value type, followed by the declaration of a field $(D u) of that union.
///
/// Primitive and string members are named after their type character;
/// array members are named 'B' followed by the element type character.
string generateUnion() {
    string members;

    foreach (entry; PrimitiveTagValueTypes) {
        members ~= entry.ValueType.stringof ~ " " ~ entry.ch ~ ";";
    }

    foreach (entry; StringTagValueTypes) {
        members ~= entry.ValueType.stringof ~ " " ~ entry.ch ~ ";";
    }

    foreach (entry; ArrayElementTagValueTypes) {
        members ~= entry.ValueType.stringof ~ "[] B" ~ entry.ch ~ ";";
    }

    return "union U {" ~ members ~ "}; U u;";
}
218 
/// Dynamic array type with elements of type T.
template ArrayOf(T) {
    alias ArrayOf = T[];
}
222 
/// Generates the source of the opAssign overloads of Value: one per primitive
/// type, one for string (stored as 'Z'), and one per numeric array type.
///
/// Each overload stores the value in the matching union member and updates
/// both _tag and bam_typeid.
string injectOpAssign() {
    // Source of a single overload.
    //   paramType = type of the assigned value
    //   member    = name of the union member receiving it
    //   tagText   = decimal text of the type identifier
    //   bamChar   = character stored in bam_typeid
    string overload(string paramType, string member, string tagText, char bamChar) {
        return "final void opAssign(" ~ paramType ~ " value) {" ~
               "  this.u." ~ member ~ " = value;" ~
               "  this._tag = " ~ tagText ~ ";" ~
               "  this.bam_typeid = '" ~ bamChar ~ "';" ~
               "}";
    }

    string code;

    foreach (entry; PrimitiveTagValueTypes) {
        code ~= overload(entry.ValueType.stringof,
                         "" ~ entry.ch,
                         to!string(GetTypeId!(entry.ValueType)),
                         entry.ch);
    }

    code ~= overload("string", "Z", to!string(GetTypeId!string), 'Z');

    foreach (entry; ArrayElementTagValueTypes) {
        code ~= overload(entry.ValueType.stringof ~ "[]",
                         "B" ~ entry.ch,
                         to!string(GetTypeId!(ArrayOf!(entry.ValueType))),
                         entry.ch);
    }

    return code;
}
250 
/// Generates the source of Value.opCast(T): a chain of `static if` branches
/// selecting, for the requested type T, a runtime switch over _tag that
/// converts the stored value with to!T.
///
/// Numeric and char targets are converted from primitive values only, array
/// targets from array values only; a string target accepts strings (returned
/// as is), primitives and arrays. Anything else makes the generated code throw
/// ConvException.
string injectOpCast() {
    // `default` branch common to both kinds of switch
    string failureBranch(string target) {
        return `    default: throw new ConvException("Cannot convert Value to ` ~
               target ~ `");`;
    }

    // switch over the identifiers of all primitive types
    string primitiveSwitch(string target) {
        string code = `switch (_tag) {`;
        foreach (entry; PrimitiveTagValueTypes) {
            code ~= `case GetTypeId!` ~ entry.ValueType.stringof ~ `: ` ~
                    `    return to!T(u.` ~ entry.ch ~ `);`;
        }
        return code ~ failureBranch(target) ~ `}`;
    }

    // switch over the identifiers of all numeric array types
    string arraySwitch(string target) {
        string code = `switch (_tag) {`;
        foreach (entry; ArrayElementTagValueTypes) {
            code ~= `case GetTypeId!(` ~ entry.ValueType.stringof ~ `[]): ` ~
                    `    return to!T(u.B` ~ entry.ch ~ `);`;
        }
        return code ~ failureBranch(target) ~ `}`;
    }

    string branches;

    foreach (Scalar; TypeTuple!(byte, ubyte, short, ushort, int, uint,
                                char, float, double, real, long, ulong))
    {
        branches ~= `static if (is(T == ` ~ Scalar.stringof ~ `)) {` ~
                    primitiveSwitch(Scalar.stringof) ~
                    `} else `;
    }

    foreach (entry; ArrayElementTagValueTypes) {
        string arrayType = entry.ValueType.stringof ~ `[]`;
        branches ~= `static if (is(T == ` ~ arrayType ~ `)) {` ~
                    arraySwitch(arrayType) ~
                    `} else `;
    }

    branches ~= `static if (is(T == string)) {` ~
                `  if (is_string) {` ~
                `    return bam_typeid == 'Z' ? u.Z : u.H;` ~
                `  } else if (is_integer || is_float || is_character) {` ~
                `    ` ~ primitiveSwitch("string") ~
                `  } else {` ~
                       arraySwitch("string") ~
                `  }` ~
                `}`;

    return "final T opCast(T)() const {" ~ branches ~ "}";
}
310 
/**
  Struct for representing tag values.

  Tagged union, allows to store
  8/16/32-bit integers, floats, chars, strings,
  and arrays of integers/floats.
*/
struct Value {

    /*
      Notice that having union first allows to do simple casts,
      without using opCast(). That's a bit hackish but
      allows for better speed.

      The serialization methods below rely on this: they reinterpret
      the address of `u` as a pointer to the stored type.
     */
    private mixin(generateUnion());

    /**
      If this is an array, one of [cCsSiIf].
      Otherwise, one of [AcCsSiIfZH]

      See SAM/BAM specification for details.
    */
    public char bam_typeid;

    /*
                                    WARNING:

    Currently, type identifier for (u)int requires 8 bits.
    Fortunately, SAM/BAM specification doesn't use bigger integer types.
    However, in case of need to extend the hierarchy, the type
    should be changed from ubyte to something bigger.
    */
    ubyte _tag;

    /// Designates the type of currently stored value.
    ///
    /// Supposed to be used externally for checking type with GetTypeId.
    ubyte tag() @property const {
        return _tag;
    }

    // opAssign overloads for primitives, string and numeric arrays
    mixin(injectOpAssign());
    // opCast(T) performing conversions of the stored value
    mixin(injectOpCast());

    /// Copies type information and contents of another value.
    final void opAssign(Value v) {
        bam_typeid = v.bam_typeid;
        _tag = v._tag;
        u = v.u;
    }

    /// Marks the value as holding $(D null).
    /// Only the tag is changed; bam_typeid and the stored data are left as they were.
    final void opAssign(typeof(null) n) {
        _tag = GetTypeId!(typeof(null));
    }

    /// Converts the stored value to T and compares the result with val.
    ///
    /// Returns: false if the conversion throws ConvException.
    // NOTE(review): for T == Value, to!T(this) presumably returns this value
    // unchanged, so `a == b` on two Values may recurse without end -- confirm.
    final bool opEquals(T)(const T val) {
        try {
            return to!T(this) == val;
        } catch (ConvException e) {
            return false;
        }
    }

    /// String form of the stored value, obtained through opCast!string.
    string toString() const {
        return opCast!string();
    }

    /// Constructs the value by assigning to it.
    this(T)(T value) {
        opAssign(value);
    }

    /// sets 'H' tag instead of default 'Z'. Is not expected to be used much.
    ///
    /// Throws: Exception if the value is not a string.
    void setHexadecimalFlag() {

        enforce(this.is_string,
                "setHexadecimalFlag() can only be applied to a string value");

        bam_typeid = 'H';
        _tag = hexStringTag;

        // u.Z and u.H are members of the same union and therefore refer
        // to the same storage, so the string itself needs no copying.
    }

    /// Holds $(D null). Represents non-existing tag. Such values are used to remove tags.
    bool is_nothing() @property const { return _tag == GetTypeId!(typeof(null)); }

    /// char
    bool is_character() @property const { return _tag == GetTypeId!char; }

    /// float
    bool is_float() @property const { return _tag == GetTypeId!float; }

    /// ubyte[]/byte[]/ushort[]/short[]/uint[]/int[]/float[]
    bool is_numeric_array() @property const { return (_tag & 0b111) == 0b001; }

    /// ubyte[]/byte[]/ushort[]/short[]/uint[]/int[]
    bool is_array_of_integers() @property const { return (_tag & 0b1111) == 0b0001; }

    /// float[]
    bool is_array_of_floats() @property const { return (_tag & 0b1111) == 0b1001; }

    /// ubyte/byte/ushort/short/uint/int
    bool is_integer() @property const { return (_tag & 0b1111) == 0; }

    /// ubyte/ushort/uint
    bool is_unsigned() @property const { return (_tag & 0b11111) == 0; }

    /// byte/short/int
    bool is_signed() @property const { return (_tag & 0b11111) == 0b10000; }

    /// 'Z' or 'H' tag
    bool is_string() @property const { return (_tag & 0b111) == 0b101; }

    /// 'H' tag
    bool is_hexadecimal_string() @property const { return (_tag & 0b1101) == 0b1101; }

    /// Serializes value in MessagePack format
    ///
    /// A char is packed as ubyte, both kinds of strings as char[],
    /// and a null value as null. Nothing is packed for any other tag.
    public void toMsgpack(Packer)(ref Packer packer) const {
        switch (_tag) {
            case GetTypeId!byte: packer.pack(*cast(byte*)(&u)); break;
            case GetTypeId!ubyte: packer.pack(*cast(ubyte*)(&u)); break;
            case GetTypeId!short: packer.pack(*cast(short*)(&u)); break;
            case GetTypeId!ushort: packer.pack(*cast(ushort*)(&u)); break;
            case GetTypeId!int: packer.pack(*cast(int*)(&u)); break;
            case GetTypeId!uint: packer.pack(*cast(uint*)(&u)); break;

            case GetTypeId!float: packer.pack(*cast(float*)(&u)); break;
            case GetTypeId!string: packer.pack(*cast(char[]*)(&u)); break;
            case hexStringTag: packer.pack(*cast(char[]*)(&u)); break;
            case GetTypeId!char: packer.pack(*cast(ubyte*)(&u)); break;

            case GetTypeId!(byte[]): packer.pack(*cast(byte[]*)(&u)); break;
            case GetTypeId!(ubyte[]): packer.pack(*cast(ubyte[]*)(&u)); break;
            case GetTypeId!(short[]): packer.pack(*cast(short[]*)(&u)); break;
            case GetTypeId!(ushort[]): packer.pack(*cast(ushort[]*)(&u)); break;
            case GetTypeId!(int[]): packer.pack(*cast(int[]*)(&u)); break;
            case GetTypeId!(uint[]): packer.pack(*cast(uint[]*)(&u)); break;
            case GetTypeId!(float[]): packer.pack(*cast(float[]*)(&u)); break;

            case GetTypeId!(typeof(null)): packer.pack(null); break;
            default: break;
        }
    }

    /// SAM representation
    ///
    /// Returns: the text that the sink-based overload writes, as a new string.
    string toSam()() const {
        auto w = appender!(char[])();
        toSam((const(char)[] s) { w.put(s); });
        return cast(string)w.data;
    }

    /// ditto
    ///
    /// Integers of any width are written with the "i:" prefix; arrays with
    /// "B:", the element type character and comma-separated elements; the
    /// remaining types with "f:", "Z:", "H:" or "A:".
    /// Nothing is written for a null value or an unknown tag.
    void toSam(Sink)(auto ref Sink sink) const
        if (isSomeSink!Sink)
    {
        if (is_integer) {
            sink.write("i:");
            switch (_tag) {
                case GetTypeId!byte: sink.write(*cast(byte*)(&u)); break;
                case GetTypeId!ubyte: sink.write(*cast(ubyte*)(&u)); break;
                case GetTypeId!short: sink.write(*cast(short*)(&u)); break;
                case GetTypeId!ushort: sink.write(*cast(ushort*)(&u)); break;
                case GetTypeId!int: sink.write(*cast(int*)(&u)); break;
                case GetTypeId!uint: sink.write(*cast(uint*)(&u)); break;
                default: break;
            }
        } else if (is_numeric_array) {
            sink.write("B:");
            sink.write(bam_typeid);
            sink.write(',');
            switch (_tag) {
                case GetTypeId!(byte[]): sink.writeArray(*cast(byte[]*)(&u), ','); break;
                case GetTypeId!(ubyte[]): sink.writeArray(*cast(ubyte[]*)(&u), ','); break;
                case GetTypeId!(short[]): sink.writeArray(*cast(short[]*)(&u), ','); break;
                case GetTypeId!(ushort[]): sink.writeArray(*cast(ushort[]*)(&u), ','); break;
                case GetTypeId!(int[]): sink.writeArray(*cast(int[]*)(&u), ','); break;
                case GetTypeId!(uint[]): sink.writeArray(*cast(uint[]*)(&u), ','); break;
                case GetTypeId!(float[]): sink.writeArray(*cast(float[]*)(&u), ','); break;
                default: break;
            }
        } else {
            switch (_tag) {
                case GetTypeId!float: sink.write("f:"); sink.write(*cast(float*)(&u)); break;
                case GetTypeId!string: sink.write("Z:"); sink.write(*cast(const(char)[]*)(&u)); break;
                case hexStringTag: sink.write("H:"); sink.write(*cast(const(char)[]*)(&u)); break;
                case GetTypeId!char: sink.write("A:"); sink.write(*cast(char*)(&u)); break;
                default: break;
            }
        }
    }

    /// JSON representation
    ///
    /// Returns: the text that the sink-based overload writes, as a new string.
    string toJson()() const {
        auto w = appender!(char[])();
        toJson((const(char)[] s) { w.put(s); });
        return cast(string)w.data;
    }

    /// ditto
    ///
    /// The stored value is passed to the sink's writeJson with its own type.
    /// Nothing is written for a null value or an unknown tag.
    void toJson(Sink)(auto ref Sink sink) const
        if (isSomeSink!Sink)
    {
        switch (_tag) {
            case GetTypeId!byte: sink.writeJson(*cast(byte*)(&u)); break;
            case GetTypeId!ubyte: sink.writeJson(*cast(ubyte*)(&u)); break;
            case GetTypeId!short: sink.writeJson(*cast(short*)(&u)); break;
            case GetTypeId!ushort: sink.writeJson(*cast(ushort*)(&u)); break;
            case GetTypeId!int: sink.writeJson(*cast(int*)(&u)); break;
            case GetTypeId!uint: sink.writeJson(*cast(uint*)(&u)); break;
            case GetTypeId!(byte[]): sink.writeJson(*cast(byte[]*)(&u)); break;
            case GetTypeId!(ubyte[]): sink.writeJson(*cast(ubyte[]*)(&u)); break;
            case GetTypeId!(short[]): sink.writeJson(*cast(short[]*)(&u)); break;
            case GetTypeId!(ushort[]): sink.writeJson(*cast(ushort[]*)(&u)); break;
            case GetTypeId!(int[]): sink.writeJson(*cast(int[]*)(&u)); break;
            case GetTypeId!(uint[]): sink.writeJson(*cast(uint[]*)(&u)); break;
            case GetTypeId!(float[]): sink.writeJson(*cast(float[]*)(&u)); break;
            case GetTypeId!float: sink.writeJson(*cast(float*)(&u)); break;
            case GetTypeId!string: sink.writeJson(*cast(string*)(&u)); break;
            case hexStringTag: sink.writeJson(*cast(string*)(&u)); break;
            case GetTypeId!char: sink.writeJson(*cast(char*)(&u)); break;
            default: break;
        }
    }
}
540 
/**
  Decodes a single tag value from a buffer of encoded tag data.

  Strings and arrays are not copied: the returned Value refers to the
  contents of $(D bytes). Multi-byte values are reinterpreted in place,
  i.e. read in the byte order of the machine.

  All reads go through array indexing or slicing rather than raw pointer
  arithmetic, so that array bounds checking (when enabled) catches
  truncated data instead of reading past the end of the buffer.

  Params:
    type   = type character of the value: 'Z' or 'H' for a zero-terminated
             string, 'B' for an array (element type character and 32-bit
             element count precede the elements), otherwise one of the
             characters listed in PrimitiveTagValueTypes
    bytes  = buffer containing the encoded value
    offset = position of the value in $(D bytes); advanced past the value
             (including the terminating zero of a string)

  Returns: the decoded value

  Throws: UnknownTagTypeException if the type or the array element type
          is not recognized
*/
Value readValueFromArray(char type, const(ubyte)[] bytes, ref size_t offset) {
    // Source of a switch over `elem_type` with one case per array element type.
    string readValueArrayTypeHelper() {
        string cases;
        foreach (c2t; ArrayElementTagValueTypes) {
            string elem = c2t.ValueType.stringof;
            cases ~=
            "case '" ~ c2t.ch ~ "':" ~
            "  auto begin = offset;" ~
            "  auto end = offset + length * " ~ elem ~ ".sizeof;" ~
            "  auto data = bytes[begin .. end];" ~
            "  offset = end;" ~
            "  return Value(cast(" ~ elem ~ "[])data);";
        }
        return "switch (elem_type) {" ~ cases ~
               "  default: throw new UnknownTagTypeException(to!string(elem_type));" ~
               "}";
    }

    // Source of a switch over `type` with one case per primitive type.
    string readValuePrimitiveTypeHelper() {
        string cases;
        foreach (c2t; PrimitiveTagValueTypes) {
            string prim = c2t.ValueType.stringof;
            cases ~= "case '" ~ c2t.ch ~ "':" ~
                     "  auto raw = bytes[offset .. offset + " ~ prim ~ ".sizeof];" ~
                     "  " ~ prim ~ " value = *(cast(const(" ~ prim ~ ")*)raw.ptr);" ~
                     "  offset += value.sizeof;" ~
                     "  return Value(value);";
        }
        return "switch (type) {" ~ cases ~
               "  default: throw new UnknownTagTypeException(to!string(type));" ~
               "}";
    }

    if (type == 'Z' || type == 'H') {
        auto begin = offset;
        // advance past the terminating zero
        while (bytes[offset++] != 0) {}
        // return string with stripped '\0'
        auto v = Value(cast(string)bytes[begin .. offset - 1]);
        if (type == 'H') {
            v.setHexadecimalFlag();
        }
        return v;
    } else if (type == 'B') {
        char elem_type = cast(char)bytes[offset++];
        // element count, stored as a 32-bit unsigned integer
        auto rawLength = bytes[offset .. offset + uint.sizeof];
        uint length = *(cast(const(uint)*)rawLength.ptr);
        offset += uint.sizeof;
        mixin(readValueArrayTypeHelper());
    } else {
        mixin(readValuePrimitiveTypeHelper());
    }
}