btllib
Loading...
Searching...
No Matches
nthash_kmer.hpp
1#pragma once
2
3#include <array>
4#include <cstdint>
5#include <cstring>
6#include <deque>
7#include <memory>
8#include <string>
9#include <vector>
10
11#include <btllib/hashing_internals.hpp>
12#include <btllib/status.hpp>
13
14namespace btllib::hashing_internals {
15
22inline uint64_t
23base_forward_hash(const char* seq, unsigned k)
24{
25 uint64_t h_val = 0;
26 for (unsigned i = 0; i < k - 3; i += 4) {
27 h_val = srol(h_val, 4);
28 uint8_t loc = 0;
29 loc += 64 * CONVERT_TAB[(unsigned char)seq[i]]; // NOLINT
30 loc += 16 * CONVERT_TAB[(unsigned char)seq[i + 1]]; // NOLINT
31 loc += 4 * CONVERT_TAB[(unsigned char)seq[i + 2]];
32 loc += CONVERT_TAB[(unsigned char)seq[i + 3]];
33 h_val ^= TETRAMER_TAB[loc];
34 }
35 const unsigned remainder = k % 4;
36 if (remainder > 0) {
37 h_val = srol(h_val, remainder);
38 }
39 if (remainder == 3) {
40 uint8_t trimer_loc = 0;
41 trimer_loc += 16 * CONVERT_TAB[(unsigned char)seq[k - 3]]; // NOLINT
42 trimer_loc += 4 * CONVERT_TAB[(unsigned char)seq[k - 2]];
43 trimer_loc += CONVERT_TAB[(unsigned char)seq[k - 1]];
44 h_val ^= TRIMER_TAB[trimer_loc];
45 } else if (remainder == 2) {
46 uint8_t dimer_loc = 0;
47 dimer_loc += 4 * CONVERT_TAB[(unsigned char)seq[k - 2]];
48 dimer_loc += CONVERT_TAB[(unsigned char)seq[k - 1]];
49 h_val ^= DIMER_TAB[dimer_loc];
50 } else if (remainder == 1) {
51 h_val ^= SEED_TAB[(unsigned char)seq[k - 1]];
52 }
53 return h_val;
54}
55
65inline uint64_t
66next_forward_hash(uint64_t fh_val,
67 unsigned k,
68 unsigned char char_out,
69 unsigned char char_in)
70{
71 uint64_t h_val = srol(fh_val);
72 h_val ^= SEED_TAB[char_in];
73 h_val ^= srol_table(char_out, k);
74 return h_val;
75}
76
85inline uint64_t
86prev_forward_hash(uint64_t fh_val,
87 unsigned k,
88 unsigned char char_out,
89 unsigned char char_in)
90{
91 uint64_t h_val = fh_val ^ srol_table(char_in, k);
92 h_val ^= SEED_TAB[char_out];
93 h_val = sror(h_val);
94 return h_val;
95}
96
104inline uint64_t
105base_reverse_hash(const char* seq, unsigned k)
106{
107 uint64_t h_val = 0;
108 const unsigned remainder = k % 4;
109 if (remainder == 3) {
110 uint8_t trimer_loc = 0;
111 trimer_loc += 16 * RC_CONVERT_TAB[(unsigned char)seq[k - 1]]; // NOLINT
112 trimer_loc += 4 * RC_CONVERT_TAB[(unsigned char)seq[k - 2]];
113 trimer_loc += RC_CONVERT_TAB[(unsigned char)seq[k - 3]];
114 h_val ^= TRIMER_TAB[trimer_loc];
115 } else if (remainder == 2) {
116 uint8_t dimer_loc = 0;
117 dimer_loc += 4 * RC_CONVERT_TAB[(unsigned char)seq[k - 1]];
118 dimer_loc += RC_CONVERT_TAB[(unsigned char)seq[k - 2]];
119 h_val ^= DIMER_TAB[dimer_loc];
120 } else if (remainder == 1) {
121 h_val ^= SEED_TAB[(unsigned char)seq[k - 1] & CP_OFF];
122 }
123 for (int i = (int)(k - remainder) - 1; i >= 3; i -= 4) {
124 h_val = srol(h_val, 4);
125 uint8_t loc = 0;
126 loc += 64 * RC_CONVERT_TAB[(unsigned char)seq[i]]; // NOLINT
127 loc += 16 * RC_CONVERT_TAB[(unsigned char)seq[i - 1]]; // NOLINT
128 loc += 4 * RC_CONVERT_TAB[(unsigned char)seq[i - 2]];
129 loc += RC_CONVERT_TAB[(unsigned char)seq[i - 3]];
130 h_val ^= TETRAMER_TAB[loc];
131 }
132 return h_val;
133}
134
145inline uint64_t
146next_reverse_hash(uint64_t rh_val,
147 unsigned k,
148 unsigned char char_out,
149 unsigned char char_in)
150{
151 uint64_t h_val = rh_val ^ srol_table(char_in & CP_OFF, k);
152 h_val ^= SEED_TAB[char_out & CP_OFF];
153 h_val = sror(h_val);
154 return h_val;
155}
156
165inline uint64_t
166prev_reverse_hash(uint64_t rh_val,
167 unsigned k,
168 unsigned char char_out,
169 unsigned char char_in)
170{
171 uint64_t h_val = srol(rh_val);
172 h_val ^= SEED_TAB[char_in & CP_OFF];
173 h_val ^= srol_table(char_out & CP_OFF, k);
174 return h_val;
175}
176
191inline void
192sub_hash(uint64_t fh_val,
193 uint64_t rh_val,
194 const char* kmer_seq,
195 const std::vector<unsigned>& positions,
196 const std::vector<unsigned char>& new_bases,
197 unsigned k,
198 unsigned m,
199 uint64_t* h_val)
200{
201 uint64_t b_val = 0;
202
203 for (size_t i = 0; i < positions.size(); i++) {
204 const auto pos = positions[i];
205 const auto new_base = new_bases[i];
206
207 fh_val ^= srol_table((unsigned char)kmer_seq[pos], k - 1 - pos);
208 fh_val ^= srol_table(new_base, k - 1 - pos);
209
210 rh_val ^= srol_table((unsigned char)kmer_seq[pos] & CP_OFF, pos);
211 rh_val ^= srol_table(new_base & CP_OFF, pos);
212 }
213
214 b_val = canonical(fh_val, rh_val);
215 extend_hashes(b_val, k, m, h_val);
216}
217
218} // namespace btllib::hashing_internals
219
220namespace btllib {
221
222using hashing_internals::base_forward_hash;
223using hashing_internals::base_reverse_hash;
224using hashing_internals::extend_hashes;
225using hashing_internals::next_forward_hash;
226using hashing_internals::next_reverse_hash;
227using hashing_internals::prev_forward_hash;
228using hashing_internals::prev_reverse_hash;
229using hashing_internals::SEED_N;
230using hashing_internals::SEED_TAB;
231using hashing_internals::sub_hash;
232
237{
238
239public:
248 NtHash(const char* seq,
249 size_t seq_len,
250 hashing_internals::NUM_HASHES_TYPE num_hashes,
251 hashing_internals::K_TYPE k,
252 size_t pos = 0)
253 : seq(seq)
254 , seq_len(seq_len)
255 , num_hashes(num_hashes)
256 , k(k)
257 , pos(pos)
258 , initialized(false)
259 , hash_arr(new uint64_t[num_hashes])
260 {
261 check_error(k == 0, "NtHash: k must be greater than 0");
262 check_error(this->seq_len < k,
263 "NtHash: sequence length (" + std::to_string(this->seq_len) +
264 ") is smaller than k (" + std::to_string(k) + ")");
265 check_error(pos > this->seq_len - k,
266 "NtHash: passed position (" + std::to_string(pos) +
267 ") is larger than sequence length (" +
268 std::to_string(this->seq_len) + ")");
269 }
270
278 NtHash(const std::string& seq,
279 hashing_internals::NUM_HASHES_TYPE num_hashes,
280 hashing_internals::K_TYPE k,
281 size_t pos = 0)
282 : NtHash(seq.data(), seq.size(), num_hashes, k, pos)
283 {
284 }
285
286 NtHash(const NtHash& obj)
287 : seq(obj.seq)
288 , seq_len(obj.seq_len)
289 , num_hashes(obj.num_hashes)
290 , k(obj.k)
291 , pos(obj.pos)
292 , initialized(obj.initialized)
293 , fwd_hash(obj.fwd_hash)
294 , rev_hash(obj.rev_hash)
295 , hash_arr(new uint64_t[obj.num_hashes])
296 {
297 std::memcpy(
298 hash_arr.get(), obj.hash_arr.get(), num_hashes * sizeof(uint64_t));
299 }
300
301 NtHash(NtHash&&) = default;
302
315 bool roll()
316 {
317 if (!initialized) {
318 return init();
319 }
320 if (pos >= seq_len - k) {
321 return false;
322 }
323 if (hashing_internals::SEED_TAB[(unsigned char)seq[pos + k]] ==
324 hashing_internals::SEED_N) {
325 pos += k;
326 return init();
327 }
328 fwd_hash = next_forward_hash(fwd_hash, k, seq[pos], seq[pos + k]);
329 rev_hash = next_reverse_hash(rev_hash, k, seq[pos], seq[pos + k]);
330 extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
331 ++pos;
332 return true;
333 }
334
340 {
341 if (!initialized) {
342 return init();
343 }
344 if (pos == 0) {
345 return false;
346 }
347 if (SEED_TAB[(unsigned char)seq[pos - 1]] == SEED_N && pos >= k) {
348 pos -= k;
349 return init();
350 }
351 if (SEED_TAB[(unsigned char)seq[pos - 1]] == SEED_N) {
352 return false;
353 }
354 fwd_hash = prev_forward_hash(fwd_hash, k, seq[pos + k - 1], seq[pos - 1]);
355 rev_hash = prev_reverse_hash(rev_hash, k, seq[pos + k - 1], seq[pos - 1]);
356 extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
357 --pos;
358 return true;
359 }
360
367 bool peek()
368 {
369 if (pos >= seq_len - k) {
370 return false;
371 }
372 return peek(seq[pos + k]);
373 }
374
380 {
381 if (pos == 0) {
382 return false;
383 }
384 return peek_back(seq[pos - 1]);
385 }
386
393 bool peek(char char_in)
394 {
395 if (!initialized) {
396 return init();
397 }
398 if (SEED_TAB[(unsigned char)char_in] == SEED_N) {
399 return false;
400 }
401 const uint64_t fwd = next_forward_hash(fwd_hash, k, seq[pos], char_in);
402 const uint64_t rev = next_reverse_hash(rev_hash, k, seq[pos], char_in);
403 extend_hashes(fwd, rev, k, num_hashes, hash_arr.get());
404 return true;
405 }
406
411 bool peek_back(char char_in)
412 {
413 if (!initialized) {
414 return init();
415 }
416 if (SEED_TAB[(unsigned char)char_in] == SEED_N) {
417 return false;
418 }
419 const unsigned char char_out = seq[pos + k - 1];
420 const uint64_t fwd = prev_forward_hash(fwd_hash, k, char_out, char_in);
421 const uint64_t rev = prev_reverse_hash(rev_hash, k, char_out, char_in);
422 extend_hashes(fwd, rev, k, num_hashes, hash_arr.get());
423 return true;
424 }
425
426 void sub(const std::vector<unsigned>& positions,
427 const std::vector<unsigned char>& new_bases)
428 {
429 sub_hash(fwd_hash,
430 rev_hash,
431 seq + pos,
432 positions,
433 new_bases,
434 get_k(),
435 get_hash_num(),
436 hash_arr.get());
437 }
438
443 const uint64_t* hashes() const { return hash_arr.get(); }
444
450 size_t get_pos() const { return pos; }
451
456 hashing_internals::NUM_HASHES_TYPE get_hash_num() const { return num_hashes; }
457
462 hashing_internals::K_TYPE get_k() const { return k; }
463
468 uint64_t get_forward_hash() const { return fwd_hash; }
469
474 uint64_t get_reverse_hash() const { return rev_hash; }
475
476private:
477 const char* seq;
478 const size_t seq_len;
479 hashing_internals::NUM_HASHES_TYPE num_hashes;
480 hashing_internals::K_TYPE k;
481 size_t pos;
482 bool initialized;
483 uint64_t fwd_hash = 0;
484 uint64_t rev_hash = 0;
485 std::unique_ptr<uint64_t[]> hash_arr;
486
491 bool init()
492 {
493 bool has_n = true;
494 while (pos <= seq_len - k + 1 && has_n) {
495 has_n = false;
496 for (unsigned i = 0; i < k && pos <= seq_len - k + 1; i++) {
497 if (SEED_TAB[(unsigned char)seq[pos + k - i - 1]] == SEED_N) {
498 pos += k - i;
499 has_n = true;
500 }
501 }
502 }
503 if (pos > seq_len - k) {
504 return false;
505 }
506 fwd_hash = base_forward_hash(seq + pos, k);
507 rev_hash = base_reverse_hash(seq + pos, k);
508 extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
509 initialized = true;
510 return true;
511 }
512};
513
521{
522
523public:
532 BlindNtHash(const std::string& seq,
533 hashing_internals::NUM_HASHES_TYPE num_hashes,
534 hashing_internals::K_TYPE k,
535 long pos = 0)
536 : seq(seq.data() + pos, seq.data() + pos + k)
537 , num_hashes(num_hashes)
538 , pos(pos)
539 , fwd_hash(base_forward_hash(seq.data(), k))
540 , rev_hash(base_reverse_hash(seq.data(), k))
541 , hash_arr(new uint64_t[num_hashes])
542 {
543 check_error(k == 0, "BlindNtHash: k must be greater than 0");
544 extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
545 }
546
547 BlindNtHash(const BlindNtHash& obj)
548 : seq(obj.seq)
549 , num_hashes(obj.num_hashes)
550 , pos(obj.pos)
551 , fwd_hash(obj.fwd_hash)
552 , rev_hash(obj.rev_hash)
553 , hash_arr(new uint64_t[obj.num_hashes])
554 {
555 std::memcpy(
556 hash_arr.get(), obj.hash_arr.get(), num_hashes * sizeof(uint64_t));
557 }
558
559 BlindNtHash(BlindNtHash&&) = default;
560
567 void roll(char char_in)
568 {
569 fwd_hash = next_forward_hash(fwd_hash, seq.size(), seq.front(), char_in);
570 rev_hash = next_reverse_hash(rev_hash, seq.size(), seq.front(), char_in);
571 extend_hashes(fwd_hash, rev_hash, seq.size(), num_hashes, hash_arr.get());
572 seq.pop_front();
573 seq.push_back(char_in);
574 ++pos;
575 }
576
580 void roll_back(char char_in)
581 {
582 fwd_hash = prev_forward_hash(fwd_hash, seq.size(), seq.back(), char_in);
583 rev_hash = prev_reverse_hash(rev_hash, seq.size(), seq.back(), char_in);
584 extend_hashes(fwd_hash, rev_hash, seq.size(), num_hashes, hash_arr.get());
585 seq.pop_back();
586 seq.push_front(char_in);
587 --pos;
588 }
589
593 void peek(char char_in)
594 {
595 const hashing_internals::K_TYPE k = seq.size();
596 const uint64_t fwd = next_forward_hash(fwd_hash, k, seq.front(), char_in);
597 const uint64_t rev = next_reverse_hash(rev_hash, k, seq.front(), char_in);
598 extend_hashes(fwd, rev, seq.size(), num_hashes, hash_arr.get());
599 }
600
604 void peek_back(char char_in)
605 {
606 const hashing_internals::K_TYPE k = seq.size();
607 const uint64_t fwd = prev_forward_hash(fwd_hash, k, seq.back(), char_in);
608 const uint64_t rev = prev_reverse_hash(rev_hash, k, seq.back(), char_in);
609 extend_hashes(fwd, rev, seq.size(), num_hashes, hash_arr.get());
610 }
611
616 const uint64_t* hashes() const { return hash_arr.get(); }
617
623 long get_pos() const { return pos; }
624
629 hashing_internals::NUM_HASHES_TYPE get_hash_num() const { return num_hashes; }
630
635 hashing_internals::K_TYPE get_k() const { return seq.size(); }
636
641 uint64_t get_forward_hash() const { return fwd_hash; }
642
647 uint64_t get_reverse_hash() const { return rev_hash; }
648
649private:
650 std::deque<char> seq;
651 hashing_internals::NUM_HASHES_TYPE num_hashes;
652 long pos;
653 uint64_t fwd_hash;
654 uint64_t rev_hash;
655 std::unique_ptr<uint64_t[]> hash_arr;
656};
657
658} // namespace btllib
Definition nthash_kmer.hpp:521
long get_pos() const
Definition nthash_kmer.hpp:623
uint64_t get_forward_hash() const
Definition nthash_kmer.hpp:641
hashing_internals::K_TYPE get_k() const
Definition nthash_kmer.hpp:635
void peek(char char_in)
Definition nthash_kmer.hpp:593
uint64_t get_reverse_hash() const
Definition nthash_kmer.hpp:647
void roll(char char_in)
Definition nthash_kmer.hpp:567
const uint64_t * hashes() const
Definition nthash_kmer.hpp:616
hashing_internals::NUM_HASHES_TYPE get_hash_num() const
Definition nthash_kmer.hpp:629
BlindNtHash(const std::string &seq, hashing_internals::NUM_HASHES_TYPE num_hashes, hashing_internals::K_TYPE k, long pos=0)
Definition nthash_kmer.hpp:532
void peek_back(char char_in)
Definition nthash_kmer.hpp:604
void roll_back(char char_in)
Definition nthash_kmer.hpp:580
Definition nthash_kmer.hpp:237
bool peek()
Definition nthash_kmer.hpp:367
bool peek_back()
Definition nthash_kmer.hpp:379
bool peek(char char_in)
Definition nthash_kmer.hpp:393
bool roll()
Definition nthash_kmer.hpp:315
const uint64_t * hashes() const
Definition nthash_kmer.hpp:443
hashing_internals::NUM_HASHES_TYPE get_hash_num() const
Definition nthash_kmer.hpp:456
hashing_internals::K_TYPE get_k() const
Definition nthash_kmer.hpp:462
NtHash(const char *seq, size_t seq_len, hashing_internals::NUM_HASHES_TYPE num_hashes, hashing_internals::K_TYPE k, size_t pos=0)
Definition nthash_kmer.hpp:248
bool peek_back(char char_in)
Definition nthash_kmer.hpp:411
uint64_t get_reverse_hash() const
Definition nthash_kmer.hpp:474
size_t get_pos() const
Definition nthash_kmer.hpp:450
NtHash(const std::string &seq, hashing_internals::NUM_HASHES_TYPE num_hashes, hashing_internals::K_TYPE k, size_t pos=0)
Definition nthash_kmer.hpp:278
bool roll_back()
Definition nthash_kmer.hpp:339
uint64_t get_forward_hash() const
Definition nthash_kmer.hpp:468
Definition aahash.hpp:12
void check_error(bool condition, const std::string &msg)