11#include <btllib/hashing_internals.hpp>
12#include <btllib/status.hpp>
14namespace btllib::hashing_internals {
23base_forward_hash(
const char* seq,
unsigned k)
26 for (
unsigned i = 0; i < k - 3; i += 4) {
27 h_val = srol(h_val, 4);
29 loc += 64 * CONVERT_TAB[(
unsigned char)seq[i]];
30 loc += 16 * CONVERT_TAB[(
unsigned char)seq[i + 1]];
31 loc += 4 * CONVERT_TAB[(
unsigned char)seq[i + 2]];
32 loc += CONVERT_TAB[(
unsigned char)seq[i + 3]];
33 h_val ^= TETRAMER_TAB[loc];
35 const unsigned remainder = k % 4;
37 h_val = srol(h_val, remainder);
40 uint8_t trimer_loc = 0;
41 trimer_loc += 16 * CONVERT_TAB[(
unsigned char)seq[k - 3]];
42 trimer_loc += 4 * CONVERT_TAB[(
unsigned char)seq[k - 2]];
43 trimer_loc += CONVERT_TAB[(
unsigned char)seq[k - 1]];
44 h_val ^= TRIMER_TAB[trimer_loc];
45 }
else if (remainder == 2) {
46 uint8_t dimer_loc = 0;
47 dimer_loc += 4 * CONVERT_TAB[(
unsigned char)seq[k - 2]];
48 dimer_loc += CONVERT_TAB[(
unsigned char)seq[k - 1]];
49 h_val ^= DIMER_TAB[dimer_loc];
50 }
else if (remainder == 1) {
51 h_val ^= SEED_TAB[(
unsigned char)seq[k - 1]];
66next_forward_hash(uint64_t fh_val,
68 unsigned char char_out,
69 unsigned char char_in)
71 uint64_t h_val = srol(fh_val);
72 h_val ^= SEED_TAB[char_in];
73 h_val ^= srol_table(char_out, k);
86prev_forward_hash(uint64_t fh_val,
88 unsigned char char_out,
89 unsigned char char_in)
91 uint64_t h_val = fh_val ^ srol_table(char_in, k);
92 h_val ^= SEED_TAB[char_out];
105base_reverse_hash(
const char* seq,
unsigned k)
108 const unsigned remainder = k % 4;
109 if (remainder == 3) {
110 uint8_t trimer_loc = 0;
111 trimer_loc += 16 * RC_CONVERT_TAB[(
unsigned char)seq[k - 1]];
112 trimer_loc += 4 * RC_CONVERT_TAB[(
unsigned char)seq[k - 2]];
113 trimer_loc += RC_CONVERT_TAB[(
unsigned char)seq[k - 3]];
114 h_val ^= TRIMER_TAB[trimer_loc];
115 }
else if (remainder == 2) {
116 uint8_t dimer_loc = 0;
117 dimer_loc += 4 * RC_CONVERT_TAB[(
unsigned char)seq[k - 1]];
118 dimer_loc += RC_CONVERT_TAB[(
unsigned char)seq[k - 2]];
119 h_val ^= DIMER_TAB[dimer_loc];
120 }
else if (remainder == 1) {
121 h_val ^= SEED_TAB[(
unsigned char)seq[k - 1] & CP_OFF];
123 for (
int i = (
int)(k - remainder) - 1; i >= 3; i -= 4) {
124 h_val = srol(h_val, 4);
126 loc += 64 * RC_CONVERT_TAB[(
unsigned char)seq[i]];
127 loc += 16 * RC_CONVERT_TAB[(
unsigned char)seq[i - 1]];
128 loc += 4 * RC_CONVERT_TAB[(
unsigned char)seq[i - 2]];
129 loc += RC_CONVERT_TAB[(
unsigned char)seq[i - 3]];
130 h_val ^= TETRAMER_TAB[loc];
146next_reverse_hash(uint64_t rh_val,
148 unsigned char char_out,
149 unsigned char char_in)
151 uint64_t h_val = rh_val ^ srol_table(char_in & CP_OFF, k);
152 h_val ^= SEED_TAB[char_out & CP_OFF];
166prev_reverse_hash(uint64_t rh_val,
168 unsigned char char_out,
169 unsigned char char_in)
171 uint64_t h_val = srol(rh_val);
172 h_val ^= SEED_TAB[char_in & CP_OFF];
173 h_val ^= srol_table(char_out & CP_OFF, k);
192sub_hash(uint64_t fh_val,
194 const char* kmer_seq,
195 const std::vector<unsigned>& positions,
196 const std::vector<unsigned char>& new_bases,
203 for (
size_t i = 0; i < positions.size(); i++) {
204 const auto pos = positions[i];
205 const auto new_base = new_bases[i];
207 fh_val ^= srol_table((
unsigned char)kmer_seq[pos], k - 1 - pos);
208 fh_val ^= srol_table(new_base, k - 1 - pos);
210 rh_val ^= srol_table((
unsigned char)kmer_seq[pos] & CP_OFF, pos);
211 rh_val ^= srol_table(new_base & CP_OFF, pos);
214 b_val = canonical(fh_val, rh_val);
215 extend_hashes(b_val, k, m, h_val);
222using hashing_internals::base_forward_hash;
223using hashing_internals::base_reverse_hash;
224using hashing_internals::extend_hashes;
225using hashing_internals::next_forward_hash;
226using hashing_internals::next_reverse_hash;
227using hashing_internals::prev_forward_hash;
228using hashing_internals::prev_reverse_hash;
229using hashing_internals::SEED_N;
230using hashing_internals::SEED_TAB;
231using hashing_internals::sub_hash;
250 hashing_internals::NUM_HASHES_TYPE num_hashes,
251 hashing_internals::K_TYPE k,
255 , num_hashes(num_hashes)
259 , hash_arr(new uint64_t[num_hashes])
261 check_error(k == 0,
"NtHash: k must be greater than 0");
263 "NtHash: sequence length (" + std::to_string(this->seq_len) +
264 ") is smaller than k (" + std::to_string(k) +
")");
266 "NtHash: passed position (" + std::to_string(pos) +
267 ") is larger than sequence length (" +
268 std::to_string(this->seq_len) +
")");
279 hashing_internals::NUM_HASHES_TYPE num_hashes,
280 hashing_internals::K_TYPE k,
282 :
NtHash(seq.data(), seq.size(), num_hashes, k, pos)
288 , seq_len(obj.seq_len)
289 , num_hashes(obj.num_hashes)
292 , initialized(obj.initialized)
293 , fwd_hash(obj.fwd_hash)
294 , rev_hash(obj.rev_hash)
295 , hash_arr(new uint64_t[obj.num_hashes])
298 hash_arr.get(), obj.hash_arr.get(), num_hashes *
sizeof(uint64_t));
320 if (pos >= seq_len - k) {
323 if (hashing_internals::SEED_TAB[(
unsigned char)seq[pos + k]] ==
324 hashing_internals::SEED_N) {
328 fwd_hash = next_forward_hash(fwd_hash, k, seq[pos], seq[pos + k]);
329 rev_hash = next_reverse_hash(rev_hash, k, seq[pos], seq[pos + k]);
330 extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
347 if (SEED_TAB[(
unsigned char)seq[pos - 1]] == SEED_N && pos >= k) {
351 if (SEED_TAB[(
unsigned char)seq[pos - 1]] == SEED_N) {
354 fwd_hash = prev_forward_hash(fwd_hash, k, seq[pos + k - 1], seq[pos - 1]);
355 rev_hash = prev_reverse_hash(rev_hash, k, seq[pos + k - 1], seq[pos - 1]);
356 extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
369 if (pos >= seq_len - k) {
372 return peek(seq[pos + k]);
398 if (SEED_TAB[(
unsigned char)char_in] == SEED_N) {
401 const uint64_t fwd = next_forward_hash(fwd_hash, k, seq[pos], char_in);
402 const uint64_t rev = next_reverse_hash(rev_hash, k, seq[pos], char_in);
403 extend_hashes(fwd, rev, k, num_hashes, hash_arr.get());
416 if (SEED_TAB[(
unsigned char)char_in] == SEED_N) {
419 const unsigned char char_out = seq[pos + k - 1];
420 const uint64_t fwd = prev_forward_hash(fwd_hash, k, char_out, char_in);
421 const uint64_t rev = prev_reverse_hash(rev_hash, k, char_out, char_in);
422 extend_hashes(fwd, rev, k, num_hashes, hash_arr.get());
426 void sub(
const std::vector<unsigned>& positions,
427 const std::vector<unsigned char>& new_bases)
443 const uint64_t*
hashes()
const {
return hash_arr.get(); }
456 hashing_internals::NUM_HASHES_TYPE
get_hash_num()
const {
return num_hashes; }
462 hashing_internals::K_TYPE
get_k()
const {
return k; }
478 const size_t seq_len;
479 hashing_internals::NUM_HASHES_TYPE num_hashes;
480 hashing_internals::K_TYPE k;
483 uint64_t fwd_hash = 0;
484 uint64_t rev_hash = 0;
485 std::unique_ptr<uint64_t[]> hash_arr;
494 while (pos <= seq_len - k + 1 && has_n) {
496 for (
unsigned i = 0; i < k && pos <= seq_len - k + 1; i++) {
497 if (SEED_TAB[(
unsigned char)seq[pos + k - i - 1]] == SEED_N) {
503 if (pos > seq_len - k) {
506 fwd_hash = base_forward_hash(seq + pos, k);
507 rev_hash = base_reverse_hash(seq + pos, k);
508 extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
533 hashing_internals::NUM_HASHES_TYPE num_hashes,
534 hashing_internals::K_TYPE k,
536 : seq(seq.data() + pos, seq.data() + pos + k)
537 , num_hashes(num_hashes)
539 , fwd_hash(base_forward_hash(seq.data(), k))
540 , rev_hash(base_reverse_hash(seq.data(), k))
541 , hash_arr(new uint64_t[num_hashes])
543 check_error(k == 0,
"BlindNtHash: k must be greater than 0");
544 extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
549 , num_hashes(obj.num_hashes)
551 , fwd_hash(obj.fwd_hash)
552 , rev_hash(obj.rev_hash)
553 , hash_arr(new uint64_t[obj.num_hashes])
556 hash_arr.get(), obj.hash_arr.get(), num_hashes *
sizeof(uint64_t));
569 fwd_hash = next_forward_hash(fwd_hash, seq.size(), seq.front(), char_in);
570 rev_hash = next_reverse_hash(rev_hash, seq.size(), seq.front(), char_in);
571 extend_hashes(fwd_hash, rev_hash, seq.size(), num_hashes, hash_arr.get());
573 seq.push_back(char_in);
582 fwd_hash = prev_forward_hash(fwd_hash, seq.size(), seq.back(), char_in);
583 rev_hash = prev_reverse_hash(rev_hash, seq.size(), seq.back(), char_in);
584 extend_hashes(fwd_hash, rev_hash, seq.size(), num_hashes, hash_arr.get());
586 seq.push_front(char_in);
595 const hashing_internals::K_TYPE k = seq.size();
596 const uint64_t fwd = next_forward_hash(fwd_hash, k, seq.front(), char_in);
597 const uint64_t rev = next_reverse_hash(rev_hash, k, seq.front(), char_in);
598 extend_hashes(fwd, rev, seq.size(), num_hashes, hash_arr.get());
606 const hashing_internals::K_TYPE k = seq.size();
607 const uint64_t fwd = prev_forward_hash(fwd_hash, k, seq.back(), char_in);
608 const uint64_t rev = prev_reverse_hash(rev_hash, k, seq.back(), char_in);
609 extend_hashes(fwd, rev, seq.size(), num_hashes, hash_arr.get());
616 const uint64_t*
hashes()
const {
return hash_arr.get(); }
629 hashing_internals::NUM_HASHES_TYPE
get_hash_num()
const {
return num_hashes; }
635 hashing_internals::K_TYPE
get_k()
const {
return seq.size(); }
650 std::deque<char> seq;
651 hashing_internals::NUM_HASHES_TYPE num_hashes;
655 std::unique_ptr<uint64_t[]> hash_arr;
Definition nthash_kmer.hpp:521
long get_pos() const
Definition nthash_kmer.hpp:623
uint64_t get_forward_hash() const
Definition nthash_kmer.hpp:641
hashing_internals::K_TYPE get_k() const
Definition nthash_kmer.hpp:635
void peek(char char_in)
Definition nthash_kmer.hpp:593
uint64_t get_reverse_hash() const
Definition nthash_kmer.hpp:647
void roll(char char_in)
Definition nthash_kmer.hpp:567
const uint64_t * hashes() const
Definition nthash_kmer.hpp:616
hashing_internals::NUM_HASHES_TYPE get_hash_num() const
Definition nthash_kmer.hpp:629
BlindNtHash(const std::string &seq, hashing_internals::NUM_HASHES_TYPE num_hashes, hashing_internals::K_TYPE k, long pos=0)
Definition nthash_kmer.hpp:532
void peek_back(char char_in)
Definition nthash_kmer.hpp:604
void roll_back(char char_in)
Definition nthash_kmer.hpp:580
Definition nthash_kmer.hpp:237
bool peek()
Definition nthash_kmer.hpp:367
bool peek_back()
Definition nthash_kmer.hpp:379
bool peek(char char_in)
Definition nthash_kmer.hpp:393
bool roll()
Definition nthash_kmer.hpp:315
const uint64_t * hashes() const
Definition nthash_kmer.hpp:443
hashing_internals::NUM_HASHES_TYPE get_hash_num() const
Definition nthash_kmer.hpp:456
hashing_internals::K_TYPE get_k() const
Definition nthash_kmer.hpp:462
NtHash(const char *seq, size_t seq_len, hashing_internals::NUM_HASHES_TYPE num_hashes, hashing_internals::K_TYPE k, size_t pos=0)
Definition nthash_kmer.hpp:248
bool peek_back(char char_in)
Definition nthash_kmer.hpp:411
uint64_t get_reverse_hash() const
Definition nthash_kmer.hpp:474
size_t get_pos() const
Definition nthash_kmer.hpp:450
NtHash(const std::string &seq, hashing_internals::NUM_HASHES_TYPE num_hashes, hashing_internals::K_TYPE k, size_t pos=0)
Definition nthash_kmer.hpp:278
bool roll_back()
Definition nthash_kmer.hpp:339
uint64_t get_forward_hash() const
Definition nthash_kmer.hpp:468
void check_error(bool condition, const std::string &msg)