/*
    This file is part of BioD.
    Copyright (C) 2012-2014    Artem Tarasov <lomereiter@gmail.com>

    Permission is hereby granted, free of charge, to any person obtaining a
    copy of this software and associated documentation files (the "Software"),
    to deal in the Software without restriction, including without limitation
    the rights to use, copy, modify, merge, publish, distribute, sublicense,
    and/or sell copies of the Software, and to permit persons to whom the
    Software is furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*/
/// BAM records may carry arbitrary information in tags.
/// $(BR)
/// $(D Value) type provides a convenient way to work with this information.
///
/// Example:
/// --------------------------------
/// import bio.std.hts.bam.reader, bio.std.hts.bam.tagvalue;
/// ...
/// auto bam = new BamReader("file.bam");
/// Value v = bam.reads.front["MD"];
/// assert(v.is_string);
/// v = 5;
/// assert(v.is_signed);     // because 5 is of type int which is signed
/// assert(v == "5");        // converted to string and then compared
/// v = "abc";
/// assert(v.is_string);
/// v = [1, 2, 3];           // integer and float arrays are supported
/// assert(v.is_numeric_array);
/// v = [1.5f, 2.3f, 17.0f]; // double[] arrays must be converted to float[]
/// assert(v.is_numeric_array);
/// v = 5.6;
/// assert(v.is_float);
/// v = -17;
/// assert(v.is_signed);
/// ----------------------------------
module bio.std.hts.bam.tagvalue;

public import std.conv;
import std.typetuple;
import std.exception;
import std.format;
import std.array;
import bio.core.utils.format;

import bio.std.hts.thirdparty.msgpack;

/**
  Compile-time pairing of a type character with the D type it stands for.
*/
struct CharToType(char c, T) {
    /** symbol */
    enum ch = c;

    /** type which corresponds to the symbol
        according to SAM/BAM specification
    */
    alias ValueType = T;
}

/**
  Thrown in case of unrecognized tag type
*/
class UnknownTagTypeException : Exception {
    this(string msg) {
        super(msg);
    }
}

/// Type characters of scalar tag values together with their D types.
alias PrimitiveTagValueTypes = TypeTuple!(
    CharToType!('A', char),
    CharToType!('c', byte),
    CharToType!('C', ubyte),
    CharToType!('s', short),
    CharToType!('S', ushort),
    CharToType!('i', int),
    CharToType!('I', uint),
    CharToType!('f', float));

/// Type characters of string tag values; both map to a D string.
alias StringTagValueTypes = TypeTuple!(
    CharToType!('Z', string),
    CharToType!('H', string));

/// Type characters allowed as element type of an array tag value.
alias ArrayElementTagValueTypes = TypeTuple!(
    CharToType!('c', byte),
    CharToType!('C', ubyte),
    CharToType!('s', short),
    CharToType!('S', ushort),
    CharToType!('i', int),
    CharToType!('I', uint),
    CharToType!('f', float));

/*
  Useful in TagStorage implementations, for skipping elements

  Params:
      c = primitive type identifier

  Returns: size of corresponding type in bytes

  Throws: UnknownTagTypeException if c is not one of the characters
          listed in PrimitiveTagValueTypes
*/
uint charToSizeof(char c) {
    switch (c) {
        // The loop over the type tuple is unrolled at compile time,
        // producing one case label per primitive type.
        foreach (Entry; PrimitiveTagValueTypes) {
            case Entry.ch:
                return cast(uint) Entry.ValueType.sizeof;
        }
        default:
            throw new UnknownTagTypeException(to!string(c));
    }
}

/*
  Pair of type and its ubyte identifier.

  (Currently, ubyte is enough, but that might change in the future.)
*/
struct TypeId(T, ubyte id) {
    enum Id = id;
    alias Type = T;
}

/*
  Structure of type identifier:

                              0                                   1

                          primitive                          array/string
              something          null/nothing          numeric         string
         numeric      char            0                   0            Z     H
    integer     float    0        [see left               0            0
 unsigned signed   0     0         branch]                0            0
 [ size in bytes] [size in bytes] 0                 [element size]     1     1

     (TypeId >> 5) == elementType.sizeof

  Read from the least significant bit upwards, the identifiers below use:

     bit 0     - 0 for primitive values, 1 for arrays and strings
     bit 1     - set only in the identifier of typeof(null)
     bit 2     - char (primitive branch) / string (array branch)
     bit 3     - float or float[]; for strings, distinguishes 'H' from 'Z'
     bit 4     - signed integer (or array of signed integers)
     bits 5..7 - size in bytes of the value (of one element for arrays)
*/
alias TypeIdMap = TypeTuple!(
    TypeId!(char,     0b001_00_1_00),

    TypeId!(ubyte,    0b001_0_0000),
    TypeId!(ushort,   0b010_0_0000),
    TypeId!(uint,     0b100_0__0__0__0__0),
    /* Let's take        4  u  i  n  s  p
       uint as an           n  n  u  o  r
       example           b  s  t  m  m  i
                         y  i  e  e  e  m
                         t  g  g  r  t  i
                         e  n  e  i  h  t
                         s  e  r  c  i  i
                            d        n  v
                                     g  e
    */

    TypeId!(byte,     0b001_1_0000),
    TypeId!(short,    0b010_1_0000),
    TypeId!(int,      0b100_1_0000),

    TypeId!(float,    0b100_01_000),

    TypeId!(ubyte[],  0b001_000_01),
    TypeId!(ushort[], 0b010_000_01),
    TypeId!(uint[],   0b100_000_01),

    TypeId!(byte[],   0b001_100_01),
    TypeId!(short[],  0b010_100_01),
    TypeId!(int[],    0b100_100_01),

    TypeId!(float[],  0b100_01_001),

    TypeId!(string,   0b001_00_101),
    TypeId!(string,   0b001_01_101),
    TypeId!(typeof(null), 0b0000_0010));

// Identifier of the second `string` entry of TypeIdMap.
private immutable hexStringTag = 0b001_01_101;

// Extracts the D type from a TypeId instance.
private template GetType(U) {
    alias GetType = U.Type;
}

/// Get tag for type T.
///
/// Useful for comparison with tag field of Value struct.
///
/// Example:
/// -----------------------------------
/// Value v = "zzz";
/// assert(v.tag == GetTypeId!string);
/// -----------------------------------
template GetTypeId(T) {
    ///
    enum GetTypeId = TypeIdMap[staticIndexOf!(T, staticMap!(GetType, TypeIdMap))].Id;
}

/*
  Generates the source of the storage union used by Value.

  The union gets one member per primitive type (named after its type
  character), one per string type ('Z', 'H') and one per array element
  type (named 'B' followed by the element type character).
  The generated text also declares the member `u` of that union type.
*/
string generateUnion() {
    string code = "union U {";
    foreach (t; PrimitiveTagValueTypes) {
        code ~= t.ValueType.stringof ~ " " ~ t.ch ~ ";";
    }
    foreach (t; StringTagValueTypes) {
        code ~= t.ValueType.stringof ~ " " ~ t.ch ~ ";";
    }
    foreach (t; ArrayElementTagValueTypes) {
        code ~= t.ValueType.stringof ~ "[] " ~ 'B' ~ t.ch ~ ";";
    }
    code ~= "}; U u;";
    return code;
}

template ArrayOf(T) {
    alias T[] ArrayOf;
}

/*
  Generates the source of the opAssign overloads of Value:
  one per primitive type, one for string (stored as 'Z'),
  and one per supported array type.

  Each overload stores the value in the matching union member and
  updates both `_tag` and `bam_typeid`.
*/
string injectOpAssign() {
    string code;

    foreach (t; PrimitiveTagValueTypes) {
        code ~= "final void opAssign(" ~ t.ValueType.stringof ~ " value) {" ~
                "  this.u." ~ t.ch ~ " = value;" ~
                "  this._tag = " ~ to!string(GetTypeId!(t.ValueType)) ~ ";" ~
                "  this.bam_typeid = '" ~ t.ch ~ "';" ~
                "}";
    }

    code ~= "final void opAssign(string value) {" ~
            "  this.u.Z = value;" ~
            "  this._tag = " ~ to!string(GetTypeId!string) ~ ";" ~
            "  this.bam_typeid = 'Z';" ~
            "}";

    foreach (t; ArrayElementTagValueTypes) {
        code ~= "final void opAssign(" ~ t.ValueType.stringof ~ "[] value) {" ~
                "  this.u.B" ~ t.ch ~ " = value;" ~
                "  this._tag = " ~ to!string(GetTypeId!(ArrayOf!(t.ValueType))) ~ ";" ~
                "  this.bam_typeid = '" ~ t.ch ~ "';" ~
                "}";
    }

    return code;
}

/*
  Generates the source of the templated `opCast(T)` of Value.

  The generated body is a `static if` chain over the requested type T:
    - scalar T: switch over the primitive tags, converting with to!T;
    - array T:  switch over the array tags, converting with to!T;
    - string:   stored strings are returned as is, everything else
                is converted with to!string.
  A tag that has no case in the selected switch results in ConvException.
*/
string injectOpCast() {

    // switch over primitive tags; every case converts the stored scalar
    string injectSwitchPrimitive(string requested_type)
    {
        string sw = `switch (_tag) {`;

        foreach (t2; PrimitiveTagValueTypes) {
            sw ~= `case GetTypeId!` ~ t2.ValueType.stringof ~ `: ` ~
                  `  return to!T(u.` ~ t2.ch ~ `);`;
        }

        sw ~= `  default: throw new ConvException("Cannot convert Value to ` ~
              requested_type ~ `");` ~
              `}`;
        return sw;
    }

    // switch over array tags; every case converts the stored array
    string injectSwitchArrayElement(string requested_type)
    {
        string sw = `switch (_tag) {`;

        foreach (t2; ArrayElementTagValueTypes) {
            sw ~= `case GetTypeId!(` ~ t2.ValueType.stringof ~ `[]): ` ~
                  `  return to!T(u.B` ~ t2.ch ~ `);`;
        }

        sw ~= `  default: throw new ConvException("Cannot convert Value to ` ~
              requested_type ~ `");` ~
              `}`;
        return sw;
    }

    string chain = "static if";

    foreach (t; TypeTuple!(byte, ubyte, short, ushort, int, uint,
                           char, float, double, real, long, ulong))
    {
        chain ~= `(is(T == ` ~ t.stringof ~ `)) {` ~
                 injectSwitchPrimitive(t.stringof) ~
                 `} else static if`;
    }

    foreach (t; ArrayElementTagValueTypes) {
        chain ~= `(is(T == ` ~ t.ValueType.stringof ~ `[])) {` ~
                 injectSwitchArrayElement(t.ValueType.stringof ~ "[]") ~
                 `} else static if `;
    }

    chain ~= `(is(T == string)) {` ~
             `  if (is_string) {` ~
             `    return bam_typeid == 'Z' ? u.Z : u.H;` ~
             `  } else if (is_integer || is_float || is_character) {` ~
             `    ` ~ injectSwitchPrimitive("string") ~
             `  } else {` ~
                  injectSwitchArrayElement("string") ~
             `  }` ~
             `}`;

    return "final T opCast(T)() const {" ~ chain ~ "}";
}

/**
  Struct for representing tag values.

  Tagged union, allows to store
  8/16/32-bit integers, floats, chars, strings,
  and arrays of integers/floats.
*/
struct Value {

    /*
      Notice that having union first allows to do simple casts,
      without using opCast(). That's a bit hackish but
      allows for better speed.
    */
    private mixin(generateUnion());

    /**
      If this is an array, one of [cCsSiIf].
      Otherwise, one of [AcCsSiIfZH]

      See SAM/BAM specification for details.
    */
    public char bam_typeid;

    /*
      WARNING:

      Currently, type identifier for (u)int requires 8 bits.
      Fortunately, SAM/BAM specification doesn't use bigger integer types.
      However, in case of need to extend the hierarchy, the type
      should be changed from ubyte to something bigger.

      NOTE(review): a Value that was never assigned has _tag == 0, which
      satisfies is_integer and is_unsigned although no TypeIdMap entry
      has identifier 0 -- confirm that callers never rely on such values.
    */
    ubyte _tag;

    /// Designates the type of currently stored value.
    ///
    /// Supposed to be used externally for checking type with GetTypeId.
    ubyte tag() @property const {
        return _tag;
    }

    mixin(injectOpAssign());
    mixin(injectOpCast());

    /// Copies type information and payload of another value.
    final void opAssign(Value v) {
        bam_typeid = v.bam_typeid;
        _tag = v._tag;
        u = v.u;
    }

    /// ditto
    final void opAssign(typeof(null) n) {
        // only the tag changes; bam_typeid and the payload are left untouched
        _tag = GetTypeId!(typeof(null));
    }

    /// Converts this value to T and compares the result with $(D val).
    ///
    /// Returns: $(D false) if the conversion throws $(D ConvException).
    ///
    /// NOTE(review): for T == Value the conversion presumably yields the
    /// value itself, so the comparison would re-enter this template --
    /// confirm before comparing two Values directly.
    final bool opEquals(T)(const T val) {
        try {
            return to!T(this) == val;
        } catch (ConvException e) {
            return false;
        }
    }

    /// Same as conversion to string.
    string toString() const {
        return opCast!string();
    }

    /// Constructs a value by assigning $(D value) to it.
    this(T)(T value) {
        opAssign(value);
    }

    /// sets 'H' tag instead of default 'Z'. Is not expected to be used much.
    ///
    /// Throws: Exception (via enforce) if the value is not a string.
    void setHexadecimalFlag() {

        enforce(this.is_string);

        // The 'Z' and 'H' members of the union occupy the same storage,
        // so only the type information has to be updated.
        bam_typeid = 'H';
        _tag = hexStringTag;
    }

    /// Holds $(D null). Represents non-existing tag. Such values are used to remove tags.
    bool is_nothing() @property const { return _tag == GetTypeId!(typeof(null)); }

    /// char
    bool is_character() @property const { return _tag == GetTypeId!char; }

    /// float
    bool is_float() @property const { return _tag == GetTypeId!float; }

    /// ubyte[]/byte[]/ushort[]/short[]/uint[]/int[]/float[]
    bool is_numeric_array() @property const { return (_tag & 0b111) == 0b001; }

    /// ubyte[]/byte[]/ushort[]/short[]/uint[]/int[]
    bool is_array_of_integers() @property const { return (_tag & 0b1111) == 0b0001; }

    /// float[]
    bool is_array_of_floats() @property const { return (_tag & 0b1111) == 0b1001; }

    /// ubyte/byte/ushort/short/uint/int
    bool is_integer() @property const { return (_tag & 0b1111) == 0; }

    /// ubyte/ushort/uint
    bool is_unsigned() @property const { return (_tag & 0b11111) == 0; }

    /// byte/short/int
    bool is_signed() @property const { return (_tag & 0b11111) == 0b10000; }

    /// 'Z' or 'H' tag
    bool is_string() @property const { return (_tag & 0b111) == 0b101; }

    /// 'H' tag
    bool is_hexadecimal_string() @property const { return (_tag & 0b1101) == 0b1101; }

    /// Serializes value in MessagePack format
    ///
    /// Nothing is packed if the tag matches none of the cases below.
    public void toMsgpack(Packer)(ref Packer packer) const {
        // The union is the first member, so its address is reinterpreted
        // as a pointer to the type selected by the tag.
        switch (_tag) {
            case GetTypeId!byte: packer.pack(*cast(byte*)(&u)); break;
            case GetTypeId!ubyte: packer.pack(*cast(ubyte*)(&u)); break;
            case GetTypeId!short: packer.pack(*cast(short*)(&u)); break;
            case GetTypeId!ushort: packer.pack(*cast(ushort*)(&u)); break;
            case GetTypeId!int: packer.pack(*cast(int*)(&u)); break;
            case GetTypeId!uint: packer.pack(*cast(uint*)(&u)); break;

            case GetTypeId!float: packer.pack(*cast(float*)(&u)); break;
            case GetTypeId!string: packer.pack(*cast(char[]*)(&u)); break;
            case hexStringTag: packer.pack(*cast(char[]*)(&u)); break;
            // characters are packed as unsigned bytes
            case GetTypeId!char: packer.pack(*cast(ubyte*)(&u)); break;

            case GetTypeId!(byte[]): packer.pack(*cast(byte[]*)(&u)); break;
            case GetTypeId!(ubyte[]): packer.pack(*cast(ubyte[]*)(&u)); break;
            case GetTypeId!(short[]): packer.pack(*cast(short[]*)(&u)); break;
            case GetTypeId!(ushort[]): packer.pack(*cast(ushort[]*)(&u)); break;
            case GetTypeId!(int[]): packer.pack(*cast(int[]*)(&u)); break;
            case GetTypeId!(uint[]): packer.pack(*cast(uint[]*)(&u)); break;
            case GetTypeId!(float[]): packer.pack(*cast(float[]*)(&u)); break;

            case GetTypeId!(typeof(null)): packer.pack(null); break;
            default: break;
        }
    }

    /// SAM representation
    string toSam()() const {
        auto w = appender!(char[])();
        toSam((const(char)[] s) { w.put(s); });
        return cast(string)w.data;
    }

    /// ditto
    void toSam(Sink)(auto ref Sink sink) const
        if (isSomeSink!Sink)
    {
        if (is_integer) {
            // all integer widths are written with the "i:" prefix
            sink.write("i:");
            switch (_tag) {
                case GetTypeId!byte: sink.write(*cast(byte*)(&u)); break;
                case GetTypeId!ubyte: sink.write(*cast(ubyte*)(&u)); break;
                case GetTypeId!short: sink.write(*cast(short*)(&u)); break;
                case GetTypeId!ushort: sink.write(*cast(ushort*)(&u)); break;
                case GetTypeId!int: sink.write(*cast(int*)(&u)); break;
                case GetTypeId!uint: sink.write(*cast(uint*)(&u)); break;
                default: break;
            }
        } else if (is_numeric_array) {
            // "B:" + element type character + comma-separated elements
            sink.write("B:");
            sink.write(bam_typeid);
            sink.write(',');
            switch (_tag) {
                case GetTypeId!(byte[]): sink.writeArray(*cast(byte[]*)(&u), ','); break;
                case GetTypeId!(ubyte[]): sink.writeArray(*cast(ubyte[]*)(&u), ','); break;
                case GetTypeId!(short[]): sink.writeArray(*cast(short[]*)(&u), ','); break;
                case GetTypeId!(ushort[]): sink.writeArray(*cast(ushort[]*)(&u), ','); break;
                case GetTypeId!(int[]): sink.writeArray(*cast(int[]*)(&u), ','); break;
                case GetTypeId!(uint[]): sink.writeArray(*cast(uint[]*)(&u), ','); break;
                case GetTypeId!(float[]): sink.writeArray(*cast(float[]*)(&u), ','); break;
                default: break;
            }
        } else {
            switch (_tag) {
                case GetTypeId!float: sink.write("f:"); sink.write(*cast(float*)(&u)); break;
                case GetTypeId!string: sink.write("Z:"); sink.write(*cast(const(char)[]*)(&u)); break;
                case hexStringTag: sink.write("H:"); sink.write(*cast(const(char)[]*)(&u)); break;
                case GetTypeId!char: sink.write("A:"); sink.write(*cast(char*)(&u)); break;
                default: break;
            }
        }
    }

    /// JSON representation
    string toJson()() const {
        auto w = appender!(char[])();
        toJson((const(char)[] s) { w.put(s); });
        return cast(string)w.data;
    }

    /// ditto
    void toJson(Sink)(auto ref Sink sink) const
        if (isSomeSink!Sink)
    {
        // nothing is written for tags without a case (e.g. typeof(null))
        switch (_tag) {
            case GetTypeId!byte: sink.writeJson(*cast(byte*)(&u)); break;
            case GetTypeId!ubyte: sink.writeJson(*cast(ubyte*)(&u)); break;
            case GetTypeId!short: sink.writeJson(*cast(short*)(&u)); break;
            case GetTypeId!ushort: sink.writeJson(*cast(ushort*)(&u)); break;
            case GetTypeId!int: sink.writeJson(*cast(int*)(&u)); break;
            case GetTypeId!uint: sink.writeJson(*cast(uint*)(&u)); break;
            case GetTypeId!(byte[]): sink.writeJson(*cast(byte[]*)(&u)); break;
            case GetTypeId!(ubyte[]): sink.writeJson(*cast(ubyte[]*)(&u)); break;
            case GetTypeId!(short[]): sink.writeJson(*cast(short[]*)(&u)); break;
            case GetTypeId!(ushort[]): sink.writeJson(*cast(ushort[]*)(&u)); break;
            case GetTypeId!(int[]): sink.writeJson(*cast(int[]*)(&u)); break;
            case GetTypeId!(uint[]): sink.writeJson(*cast(uint[]*)(&u)); break;
            case GetTypeId!(float[]): sink.writeJson(*cast(float[]*)(&u)); break;
            case GetTypeId!float: sink.writeJson(*cast(float*)(&u)); break;
            case GetTypeId!string: sink.writeJson(*cast(string*)(&u)); break;
            case hexStringTag: sink.writeJson(*cast(string*)(&u)); break;
            case GetTypeId!char: sink.writeJson(*cast(char*)(&u)); break;
            default: break;
        }
    }
}

/**
  Reads one tag value from a byte buffer.

  Params:
      type   = type character: 'Z'/'H' for zero-terminated strings,
               'B' for arrays (element type character, 32-bit element
               count, then the elements), otherwise one of the
               characters listed in PrimitiveTagValueTypes
      bytes  = buffer containing the encoded value
      offset = position of the value in $(D bytes); on success it is
               advanced past the value (including the terminating zero
               of a string)

  Returns: the decoded value. Strings and arrays are slices of
           $(D bytes), not copies.

  Throws: UnknownTagTypeException for unrecognized type or array element
          type characters; Exception if $(D bytes) ends before the value
          is complete.
*/
Value readValueFromArray(char type, const(ubyte)[] bytes, ref size_t offset) {

    // Fails unless `count` bytes are available starting at `offset`.
    // `count` is 64-bit so that element count * element size cannot wrap.
    void requireBytes(ulong count) {
        enforce(offset <= bytes.length && bytes.length - offset >= count,
                "Unexpected end of data while reading tag value");
    }

    // Source of the switch over array element types.
    string readValueArrayTypeHelper() {
        string cases;
        foreach (c2t; ArrayElementTagValueTypes) {
            cases ~=
            "case '" ~ c2t.ch ~ "':" ~
            "  requireBytes(cast(ulong)length * " ~ c2t.ValueType.stringof ~ ".sizeof);" ~
            "  auto begin = offset;" ~
            "  auto end = offset + length * " ~ c2t.ValueType.stringof ~ ".sizeof;" ~
            "  offset = end;" ~
            "  return Value(cast(" ~ c2t.ValueType.stringof ~ "[])(bytes[begin .. end]));";
        }
        return "switch (elem_type) {" ~ cases ~
               "  default: throw new UnknownTagTypeException(to!string(elem_type));" ~
               "}";
    }

    // Source of the switch over primitive types.
    string readValuePrimitiveTypeHelper() {
        string cases;
        foreach (c2t; PrimitiveTagValueTypes) {
            cases ~= "case '" ~ c2t.ch ~ "':" ~
                     "  requireBytes(" ~ c2t.ValueType.stringof ~ ".sizeof);" ~
                     "  auto p = bytes.ptr + offset;" ~
                     "  auto value = *(cast(" ~ c2t.ValueType.stringof ~ "*)p);" ~
                     "  offset += value.sizeof;" ~
                     "  return Value(value);";
        }
        return "switch (type) {" ~ cases ~
               "  default: throw new UnknownTagTypeException(to!string(type));" ~
               "}";
    }

    if (type == 'Z' || type == 'H') {
        auto begin = offset;

        // look for the terminating zero without leaving the buffer
        size_t end = begin;
        while (end < bytes.length && bytes[end] != 0) {
            ++end;
        }
        enforce(end < bytes.length,
                "Unexpected end of data while reading tag value");
        offset = end + 1;

        // return string with stripped '\0'
        auto v = Value(cast(string)bytes[begin .. end]);
        if (type == 'H') {
            v.setHexadecimalFlag();
        }
        return v;
    } else if (type == 'B') {
        // element type character followed by 32-bit number of elements
        requireBytes(1 + uint.sizeof);
        char elem_type = cast(char)bytes[offset++];
        uint length = *(cast(uint*)(bytes.ptr + offset));
        offset += uint.sizeof;
        mixin(readValueArrayTypeHelper());
    } else {
        mixin(readValuePrimitiveTypeHelper());
    }
}