1 module bio.core.kmer;
2 
3 import bio.core.base;
4 import std.range;
5 
6 /// Represents k-mer of ACGT bases of length no more than 32.
7 struct KMer(uint K) 
8     if (K <= 32)
9 {
10     private ulong _id;
11 
12     static Base5 code2base(int code) {
13         return Base5("ACGT"[code]);
14     }
15 
16     static int char2code(char base) {
17         switch (base) {
18             case 'A': return 0;
19             case 'C': return 1;
20             case 'G': return 2;
21             case 'T': return 3;
22             default: return -1;
23         }
24     }
25 
26     /// Unique ID
27     ulong id() @property const {
28         return _id;
29     }
30 
31     /// Construct by ID
32     this(S)(S id) 
33         if (is(S == ulong))
34     {
35         _id = id;
36     }
37        
38     /// Construct from sequence. Takes bases from the provided sequence
39     /// until K symbols 'A/C/G/T' are found. That is, 'N' and other ambiguous
40     /// bases are skipped.
41     ///
42     /// If sequence does not contain at least K bases 'A/C/G/T', the result of
43     /// operation is undefined.
44     this(S)(S sequence) 
45         if (isInputRange!S) 
46     {
47         size_t i = 0;
48         foreach (nuc; sequence) {
49             _id <<= 2;
50             ++i;
51             switch (cast(char)nuc) {
52                 case 'A':
53                     break;
54                 case 'C':
55                     _id += 1;
56                     break;
57                 case 'G':
58                     _id += 2;
59                     break;
60                 case 'T':
61                     _id += 3;
62                     break;
63                 default:
64                     _id >>= 2;
65                     --i;
66                     break;
67             }
68 
69             if (i == K)
70                 break;
71         }
72     }
73 
74     struct KMerSequence {
75         this(ulong number) {
76             _n = number;
77         }
78 
79         private ulong _n;
80         private size_t _len = K;
81 
82         bool empty() @property const { return _len == 0; }
83         void popFront() { --_len; }
84         void popBack() { --_len; _n >>= 2; }
85 
86         Base5 opIndex(size_t i) const {
87             return code2base((_n >> (2 * (_len - i - 1))) & 3);
88         }
89 
90         size_t length() @property const { return _len; }
91         Base5 front() @property const { return opIndex(0); }
92         Base5 back() @property const { return opIndex(_len - 1); }
93         KMerSequence save() @property const { 
94             KMerSequence _seq = void;
95             _seq._n = _n;
96             _seq._len = _len;
97             return _seq;
98         }
99     }
100 
101     /// Sequence corresponding to the k-mer
102     KMerSequence sequence() @property const {
103         return KMerSequence(_id);
104     }
105 }
106 
107 unittest {
108     import std.algorithm;
109     auto kmer = KMer!10("AACGTACGTG");
110     assert(equal(kmer.sequence, "AACGTACGTG"));
111 
112     assert(KMer!5(KMer!5(0b1011001001UL).sequence).id == 0b1011001001UL);
113 }