btllib
Loading...
Searching...
No Matches
nthash_seed.hpp
1#pragma once
2
3#include <array>
4#include <cstdint>
5#include <cstring>
6#include <deque>
7#include <memory>
8#include <string>
9#include <string_view>
10#include <vector>
11
12#include <btllib/hashing_internals.hpp>
13#include <btllib/status.hpp>
14
15namespace btllib::hashing_internals {
16
17using SpacedSeed = std::vector<unsigned>;
18using SpacedSeedBlocks = std::vector<std::array<unsigned, 2>>;
19using SpacedSeedMonomers = std::vector<unsigned>;
20
21inline void
22get_blocks(const std::vector<std::string>& seed_strings,
23 std::vector<SpacedSeedBlocks>& blocks,
24 std::vector<SpacedSeedMonomers>& monomers)
25{
26 for (const auto& seed_string : seed_strings) {
27 const char pad = seed_string[seed_string.length() - 1] == '1' ? '0' : '1';
28 const std::string padded_string = seed_string + pad;
29 SpacedSeedBlocks care_blocks, ignore_blocks;
30 std::vector<unsigned> care_monos, ignore_monos;
31 unsigned i_start = 0;
32 bool is_care_block = padded_string[0] == '1';
33 for (unsigned pos = 0; pos < padded_string.length(); pos++) {
34 if (is_care_block && padded_string[pos] == '0') {
35 if (pos - i_start == 1) {
36 care_monos.push_back(i_start);
37 } else {
38 const std::array<unsigned, 2> block{ { i_start, pos } };
39 care_blocks.push_back(block);
40 }
41 i_start = pos;
42 is_care_block = false;
43 } else if (!is_care_block && padded_string[pos] == '1') {
44 if (pos - i_start == 1) {
45 ignore_monos.push_back(i_start);
46 } else {
47 const std::array<unsigned, 2> block{ { i_start, pos } };
48 ignore_blocks.push_back(block);
49 }
50 i_start = pos;
51 is_care_block = true;
52 }
53 }
54 const unsigned num_cares = care_blocks.size() * 2 + care_monos.size();
55 const unsigned num_ignores =
56 ignore_blocks.size() * 2 + ignore_monos.size() + 2;
57 if (num_ignores < num_cares) {
58 const unsigned string_end = seed_string.length();
59 const std::array<unsigned, 2> block{ { 0, string_end } };
60 ignore_blocks.push_back(block);
61 blocks.push_back(ignore_blocks);
62 monomers.push_back(ignore_monos);
63 } else {
64 blocks.push_back(care_blocks);
65 monomers.push_back(care_monos);
66 }
67 }
68}
69
70inline void
71parsed_seeds_to_blocks(const std::vector<std::vector<unsigned>>& seeds,
72 unsigned k,
73 std::vector<SpacedSeedBlocks>& blocks,
74 std::vector<SpacedSeedMonomers>& monomers)
75{
76 std::vector<std::string> seed_strings;
77 for (const auto& seed : seeds) {
78 std::string seed_string(k, '1');
79 for (const auto& i : seed) {
80 seed_string[i] = '0';
81 }
82 seed_strings.push_back(seed_string);
83 }
84 get_blocks(seed_strings, blocks, monomers);
85}
86
87inline void
88check_seeds(const std::vector<std::string>& seeds, unsigned k)
89{
90 for (const auto& seed : seeds) {
91 btllib::check_error(seed.length() != k,
92 "SeedNtHash: Spaced seed string length (" +
93 std::to_string(seed.length()) + ") not equal to k=" +
94 std::to_string(k) + " in " + seed);
95 const std::string reversed(seed.rbegin(), seed.rend());
97 seed != reversed,
98 "SeedNtHash: Seed " + seed +
99 " is not symmetric, reverse-complement hashing will be inconsistent");
100 }
101}
102
127inline bool
128ntmsm64(const char* kmer_seq,
129 const std::vector<SpacedSeedBlocks>& seeds_blocks,
130 const std::vector<SpacedSeedMonomers>& seeds_monomers,
131 unsigned k,
132 unsigned m,
133 unsigned m2,
134 uint64_t* fh_nomonos,
135 uint64_t* rh_nomonos,
136 uint64_t* fh_val,
137 uint64_t* rh_val,
138 unsigned& loc_n,
139 uint64_t* h_val)
140{
141 unsigned i_base;
142 uint64_t fh_seed, rh_seed;
143 for (unsigned i_seed = 0; i_seed < m; i_seed++) {
144 fh_seed = 0;
145 rh_seed = 0;
146 for (const auto& block : seeds_blocks[i_seed]) {
147 uint8_t fh_loc, rh_loc, d;
148 uint64_t x;
149 unsigned i = block[0];
150 unsigned j = block[1];
151 switch (j - i) {
152 case 2: {
153 fh_loc = (CONVERT_TAB[(unsigned char)kmer_seq[i]] << 2) | // NOLINT
154 (CONVERT_TAB[(unsigned char)kmer_seq[i + 1]]); // NOLINT
155 rh_loc =
156 (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 2) | // NOLINT
157 (RC_CONVERT_TAB[(unsigned char)kmer_seq[i]]); // NOLINT
158 x = DIMER_TAB[fh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
159 d = k - i - 2; // NOLINT
160 fh_seed ^= d > 0 ? srol(x, d) : x;
161 x = DIMER_TAB[rh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
162 d = i;
163 rh_seed ^= d > 0 ? srol(x, d) : x;
164 } break;
165 case 3: {
166 fh_loc =
167 (CONVERT_TAB[(unsigned char)kmer_seq[i]] << 4) | // NOLINT
168 (CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 2) | // NOLINT
169 (CONVERT_TAB[(unsigned char)kmer_seq[i + 2]]); // NOLINT
170 rh_loc =
171 (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 2]] << 4) | // NOLINT
172 (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 2) | // NOLINT
173 (RC_CONVERT_TAB[(unsigned char)kmer_seq[i]]);
174 x = TRIMER_TAB[fh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
175 d = k - i - 3; // NOLINT
176 fh_seed ^= d > 0 ? srol(x, d) : x;
177 x = TRIMER_TAB[rh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
178 d = i;
179 rh_seed ^= d > 0 ? srol(x, d) : x;
180 } break;
181 case 4: {
182 fh_loc =
183 (CONVERT_TAB[(unsigned char)kmer_seq[i]] << 6) | // NOLINT
184 (CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 4) | // NOLINT
185 (CONVERT_TAB[(unsigned char)kmer_seq[i + 2]] << 2) | // NOLINT
186 (CONVERT_TAB[(unsigned char)kmer_seq[i + 3]]); // NOLINT
187 rh_loc =
188 (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 3]] << 6) | // NOLINT
189 (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 2]] << 4) | // NOLINT
190 (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 2) | // NOLINT
191 (RC_CONVERT_TAB[(unsigned char)kmer_seq[i]]);
192 x = TETRAMER_TAB[fh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
193 d = k - i - 4; // NOLINT
194 fh_seed ^= d > 0 ? srol(x, d) : x;
195 x = TETRAMER_TAB[rh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
196 d = i;
197 rh_seed ^= d > 0 ? srol(x, d) : x;
198 } break;
199 default: {
200 for (unsigned pos = block[0]; pos < block[1]; pos++) {
201 if (kmer_seq[pos] == SEED_N) {
202 loc_n = pos;
203 return false;
204 }
205 fh_seed ^= srol_table((unsigned char)kmer_seq[pos], k - 1 - pos);
206 rh_seed ^= srol_table((unsigned char)kmer_seq[pos] & CP_OFF, pos);
207 }
208 }
209 }
210 }
211 fh_nomonos[i_seed] = fh_seed;
212 rh_nomonos[i_seed] = rh_seed;
213 for (const auto& pos : seeds_monomers[i_seed]) {
214 fh_seed ^= srol_table((unsigned char)kmer_seq[pos], k - 1 - pos);
215 rh_seed ^= srol_table((unsigned char)kmer_seq[pos] & CP_OFF, pos);
216 }
217 fh_val[i_seed] = fh_seed;
218 rh_val[i_seed] = rh_seed;
219 i_base = i_seed * m2;
220 h_val[i_base] = canonical(fh_seed, rh_seed);
221 for (unsigned i_hash = 1; i_hash < m2; i_hash++) {
222 h_val[i_base + i_hash] = h_val[i_base] * (i_hash ^ k * MULTISEED);
223 h_val[i_base + i_hash] ^= h_val[i_base + i_hash] >> MULTISHIFT;
224 }
225 }
226 return true;
227}
228
229#define NTMSM64(ROL_HANDLING, IN_HANDLING, OUT_HANDLING, ROR_HANDLING) \
230 unsigned char char_out, char_in; \
231 uint64_t fh_seed, rh_seed; \
232 unsigned i_out, i_in, i_base; \
233 for (unsigned i_seed = 0; i_seed < m; i_seed++) { \
234 ROL_HANDLING /* NOLINT(bugprone-macro-parentheses) */ \
235 for (const auto& block : seeds_blocks[i_seed]) \
236 { \
237 IN_HANDLING \
238 OUT_HANDLING \
239 fh_seed ^= srol_table(char_out, k - i_out); \
240 fh_seed ^= srol_table(char_in, k - i_in); \
241 rh_seed ^= srol_table(char_out & CP_OFF, i_out); \
242 rh_seed ^= srol_table(char_in & CP_OFF, i_in); \
243 } \
244 ROR_HANDLING /* NOLINT(bugprone-macro-parentheses) */ \
245 fh_nomonos[i_seed] = fh_seed; \
246 rh_nomonos[i_seed] = rh_seed; \
247 for (const auto& pos : seeds_monomers[i_seed]) { \
248 fh_seed ^= srol_table((unsigned char)kmer_seq[pos + 1], k - 1 - pos); \
249 rh_seed ^= srol_table((unsigned char)kmer_seq[pos + 1] & CP_OFF, pos); \
250 } \
251 fh_val[i_seed] = fh_seed; \
252 rh_val[i_seed] = rh_seed; \
253 i_base = i_seed * m2; \
254 h_val[i_base] = canonical(fh_seed, rh_seed); \
255 for (unsigned i_hash = 1; i_hash < m2; i_hash++) { \
256 h_val[i_base + i_hash] = h_val[i_base] * (i_hash ^ k * MULTISEED); \
257 h_val[i_base + i_hash] ^= h_val[i_base + i_hash] >> MULTISHIFT; \
258 } \
259 }
260
282inline void
283ntmsm64(const char* kmer_seq,
284 const std::vector<SpacedSeedBlocks>& seeds_blocks,
285 const std::vector<SpacedSeedMonomers>& seeds_monomers,
286 unsigned k,
287 unsigned m,
288 unsigned m2,
289 uint64_t* fh_nomonos,
290 uint64_t* rh_nomonos,
291 uint64_t* fh_val,
292 uint64_t* rh_val,
293 uint64_t* h_val)
294{
295 NTMSM64(fh_seed = srol(fh_nomonos[i_seed]); rh_seed = rh_nomonos[i_seed];
296 , i_in = block[1];
297 char_in = (unsigned char)kmer_seq[i_in];
298 , i_out = block[0];
299 char_out = (unsigned char)kmer_seq[i_out];
300 , rh_seed = sror(rh_seed);)
301}
302
303inline void
304ntmsm64(const std::deque<char>& kmer_seq,
305 const std::vector<SpacedSeedBlocks>& seeds_blocks,
306 const std::vector<SpacedSeedMonomers>& seeds_monomers,
307 unsigned k,
308 unsigned m,
309 unsigned m2,
310 uint64_t* fh_nomonos,
311 uint64_t* rh_nomonos,
312 uint64_t* fh_val,
313 uint64_t* rh_val,
314 uint64_t* h_val)
315{
316 NTMSM64(fh_seed = srol(fh_nomonos[i_seed]); rh_seed = rh_nomonos[i_seed];
317 , i_in = block[1];
318 char_in = (unsigned char)kmer_seq[i_in];
319 , i_out = block[0];
320 char_out = (unsigned char)kmer_seq[i_out];
321 , rh_seed = sror(rh_seed);)
322}
323
345inline void
346ntmsm64l(const char* kmer_seq,
347 const std::vector<SpacedSeedBlocks>& seeds_blocks,
348 const std::vector<SpacedSeedMonomers>& seeds_monomers,
349 unsigned k,
350 unsigned m,
351 unsigned m2,
352 uint64_t* fh_nomonos,
353 uint64_t* rh_nomonos,
354 uint64_t* fh_val,
355 uint64_t* rh_val,
356 uint64_t* h_val)
357{
358 NTMSM64(fh_seed = fh_nomonos[i_seed]; rh_seed = srol(rh_nomonos[i_seed]);
359 , i_in = block[0];
360 char_in = (unsigned char)kmer_seq[i_in];
361 , i_out = block[1];
362 char_out = (unsigned char)kmer_seq[i_out];
363 , fh_seed = sror(fh_seed);)
364}
365
366inline void
367ntmsm64l(const std::deque<char>& kmer_seq,
368 const std::vector<SpacedSeedBlocks>& seeds_blocks,
369 const std::vector<SpacedSeedMonomers>& seeds_monomers,
370 unsigned k,
371 unsigned m,
372 unsigned m2,
373 uint64_t* fh_nomonos,
374 uint64_t* rh_nomonos,
375 uint64_t* fh_val,
376 uint64_t* rh_val,
377 uint64_t* h_val)
378{
379 NTMSM64(fh_seed = fh_nomonos[i_seed]; rh_seed = srol(rh_nomonos[i_seed]);
380 , i_in = block[0];
381 char_in = (unsigned char)kmer_seq[i_in];
382 , i_out = block[1];
383 char_out = (unsigned char)kmer_seq[i_out];
384 , fh_seed = sror(fh_seed);)
385}
386
408inline void
409ntmsm64(const char* kmer_seq,
410 char in,
411 const std::vector<SpacedSeedBlocks>& seeds_blocks,
412 const std::vector<SpacedSeedMonomers>& seeds_monomers,
413 unsigned k,
414 unsigned m,
415 unsigned m2,
416 uint64_t* fh_nomonos,
417 uint64_t* rh_nomonos,
418 uint64_t* fh_val,
419 uint64_t* rh_val,
420 uint64_t* h_val)
421{
422 NTMSM64(
423 fh_seed = srol(fh_nomonos[i_seed]); rh_seed = rh_nomonos[i_seed];
424 , i_in = block[1];
425 if (i_in > k - 1) { char_in = in; } else {
426 char_in = (unsigned char)kmer_seq[i_in];
427 },
428 i_out = block[0];
429 char_out = (unsigned char)kmer_seq[i_out];
430 , rh_seed = sror(rh_seed);)
431}
432
454inline void
455ntmsm64l(const char* kmer_seq,
456 char in,
457 const std::vector<SpacedSeedBlocks>& seeds_blocks,
458 const std::vector<SpacedSeedMonomers>& seeds_monomers,
459 unsigned k,
460 unsigned m,
461 unsigned m2,
462 uint64_t* fh_nomonos,
463 uint64_t* rh_nomonos,
464 uint64_t* fh_val,
465 uint64_t* rh_val,
466 uint64_t* h_val)
467{
468 NTMSM64(
469 fh_seed = fh_nomonos[i_seed]; rh_seed = srol(rh_nomonos[i_seed]);
470 , i_in = block[0];
471 if (i_in > k - 1) { char_in = in; } else {
472 char_in = (unsigned char)kmer_seq[i_in];
473 },
474 i_out = block[1];
475 char_out = (unsigned char)kmer_seq[i_out];
476 , fh_seed = sror(fh_seed);)
477}
478
479} // namespace btllib::hashing_internals
480
481namespace btllib {
482
483using hashing_internals::check_seeds;
484using hashing_internals::get_blocks;
485using hashing_internals::ntmsm64;
486using hashing_internals::ntmsm64l;
487using hashing_internals::parsed_seeds_to_blocks;
488using hashing_internals::SEED_N;
489using hashing_internals::SEED_TAB;
490
495inline std::vector<std::vector<unsigned>>
496parse_seeds(const std::vector<std::string>& seed_strings)
497{
498 std::vector<std::vector<unsigned>> seed_set;
499 for (const auto& seed_string : seed_strings) {
500 std::vector<unsigned> seed;
501 size_t pos = 0;
502 for (const auto& c : seed_string) {
503 if (c != '1') {
504 seed.push_back(pos);
505 }
506 ++pos;
507 }
508 seed_set.push_back(seed);
509 }
510 return seed_set;
511}
512
517{
518
519public:
530 SeedNtHash(const char* seq,
531 size_t seq_len,
532 const std::vector<std::string>& seeds,
533 hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
534 hashing_internals::K_TYPE k,
535 size_t pos = 0)
536 : seq(seq)
537 , seq_len(seq_len)
538 , num_hashes_per_seed(num_hashes_per_seed)
539 , k(k)
540 , pos(pos)
541 , initialized(false)
542 , fwd_hash_nomonos(new uint64_t[seeds.size()])
543 , rev_hash_nomonos(new uint64_t[seeds.size()])
544 , fwd_hash(new uint64_t[seeds.size()])
545 , rev_hash(new uint64_t[seeds.size()])
546 , hash_arr(new uint64_t[num_hashes_per_seed * seeds.size()])
547 {
548 check_seeds(seeds, k);
549 check_error(seeds[0].size() != k,
550 "SeedNtHash: k should be equal to seed string lengths");
551 get_blocks(seeds, blocks, monomers);
552 }
553
563 SeedNtHash(const std::string& seq,
564 const std::vector<std::string>& seeds,
565 hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
566 hashing_internals::K_TYPE k,
567 size_t pos = 0)
568 : SeedNtHash(seq.data(), seq.size(), seeds, num_hashes_per_seed, k, pos)
569 {
570 }
571
582 SeedNtHash(const char* seq,
583 size_t seq_len,
584 const std::vector<std::vector<unsigned>>& seeds,
585 hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
586 hashing_internals::K_TYPE k,
587 size_t pos = 0)
588 : seq(seq)
589 , seq_len(seq_len)
590 , num_hashes_per_seed(num_hashes_per_seed)
591 , k(k)
592 , pos(pos)
593 , initialized(false)
594 , fwd_hash_nomonos(new uint64_t[seeds.size()])
595 , rev_hash_nomonos(new uint64_t[seeds.size()])
596 , fwd_hash(new uint64_t[seeds.size()])
597 , rev_hash(new uint64_t[seeds.size()])
598 , hash_arr(new uint64_t[num_hashes_per_seed * seeds.size()])
599 {
600 parsed_seeds_to_blocks(seeds, k, blocks, monomers);
601 }
602
612 SeedNtHash(const std::string& seq,
613 const std::vector<std::vector<unsigned>>& seeds,
614 hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
615 hashing_internals::K_TYPE k,
616 size_t pos = 0)
617 : SeedNtHash(seq.data(), seq.size(), seeds, num_hashes_per_seed, k, pos)
618 {
619 }
620
621 SeedNtHash(const SeedNtHash& obj)
622 : seq(obj.seq)
623 , seq_len(obj.seq_len)
624 , num_hashes_per_seed(obj.num_hashes_per_seed)
625 , k(obj.k)
626 , pos(obj.pos)
627 , initialized(obj.initialized)
628 , blocks(obj.blocks)
629 , monomers(obj.monomers)
630 , fwd_hash_nomonos(new uint64_t[obj.blocks.size()])
631 , rev_hash_nomonos(new uint64_t[obj.blocks.size()])
632 , fwd_hash(new uint64_t[obj.blocks.size()])
633 , rev_hash(new uint64_t[obj.blocks.size()])
634 , hash_arr(new uint64_t[obj.num_hashes_per_seed * obj.blocks.size()])
635 {
636 std::memcpy(fwd_hash_nomonos.get(),
637 obj.fwd_hash_nomonos.get(),
638 obj.blocks.size() * sizeof(uint64_t));
639 std::memcpy(rev_hash_nomonos.get(),
640 obj.rev_hash_nomonos.get(),
641 obj.blocks.size() * sizeof(uint64_t));
642 std::memcpy(
643 fwd_hash.get(), obj.fwd_hash.get(), obj.blocks.size() * sizeof(uint64_t));
644 std::memcpy(
645 rev_hash.get(), obj.rev_hash.get(), obj.blocks.size() * sizeof(uint64_t));
646 std::memcpy(hash_arr.get(),
647 obj.hash_arr.get(),
648 obj.num_hashes_per_seed * obj.blocks.size() * sizeof(uint64_t));
649 }
650
651 SeedNtHash(SeedNtHash&&) = default;
652
658 bool roll()
659 {
660 if (!initialized) {
661 return init();
662 }
663 if (pos >= seq_len - k) {
664 return false;
665 }
666 if (SEED_TAB[(unsigned char)seq[pos + k]] == SEED_N) {
667 pos += k;
668 return init();
669 }
670 ntmsm64(seq + pos,
671 blocks,
672 monomers,
673 k,
674 blocks.size(),
675 num_hashes_per_seed,
676 fwd_hash_nomonos.get(),
677 rev_hash_nomonos.get(),
678 fwd_hash.get(),
679 rev_hash.get(),
680 hash_arr.get());
681 ++pos;
682 return true;
683 }
684
690 {
691 if (!initialized) {
692 return init();
693 }
694 if (pos == 0) {
695 return false;
696 }
697 if (SEED_TAB[(unsigned char)seq[pos - 1]] == SEED_N && pos >= k) {
698 pos -= k;
699 return init();
700 }
701 if (SEED_TAB[(unsigned char)seq[pos - 1]] == SEED_N) {
702 return false;
703 }
704 ntmsm64l(seq + pos - 1,
705 blocks,
706 monomers,
707 k,
708 blocks.size(),
709 num_hashes_per_seed,
710 fwd_hash_nomonos.get(),
711 rev_hash_nomonos.get(),
712 fwd_hash.get(),
713 rev_hash.get(),
714 hash_arr.get());
715 --pos;
716 return true;
717 }
718
724 bool peek()
725 {
726 if (pos >= seq_len - k) {
727 return false;
728 }
729 return peek(seq[pos + k]);
730 }
731
737 {
738 if (pos == 0) {
739 return false;
740 }
741 return peek_back(seq[pos - 1]);
742 }
743
748 bool peek(char char_in)
749 {
750 if (!initialized) {
751 return init();
752 }
753 const std::unique_ptr<uint64_t[]> fwd_hash_nomonos_cpy(
754 new uint64_t[blocks.size()]);
755 const std::unique_ptr<uint64_t[]> rev_hash_nomonos_cpy(
756 new uint64_t[blocks.size()]);
757 const std::unique_ptr<uint64_t[]> fwd_hash_cpy(new uint64_t[blocks.size()]);
758 const std::unique_ptr<uint64_t[]> rev_hash_cpy(new uint64_t[blocks.size()]);
759 std::memcpy(fwd_hash_nomonos_cpy.get(),
760 fwd_hash_nomonos.get(),
761 blocks.size() * sizeof(uint64_t));
762 std::memcpy(rev_hash_nomonos_cpy.get(),
763 rev_hash_nomonos.get(),
764 blocks.size() * sizeof(uint64_t));
765 std::memcpy(
766 fwd_hash_cpy.get(), fwd_hash.get(), blocks.size() * sizeof(uint64_t));
767 std::memcpy(
768 rev_hash_cpy.get(), rev_hash.get(), blocks.size() * sizeof(uint64_t));
769 ntmsm64(seq + pos,
770 char_in,
771 blocks,
772 monomers,
773 k,
774 blocks.size(),
775 num_hashes_per_seed,
776 fwd_hash_nomonos_cpy.get(),
777 rev_hash_nomonos_cpy.get(),
778 fwd_hash_cpy.get(),
779 rev_hash_cpy.get(),
780 hash_arr.get());
781 return true;
782 }
783
788 bool peek_back(char char_in)
789 {
790 if (!initialized) {
791 return init();
792 }
793 const std::unique_ptr<uint64_t[]> fwd_hash_nomonos_cpy(
794 new uint64_t[blocks.size()]);
795 const std::unique_ptr<uint64_t[]> rev_hash_nomonos_cpy(
796 new uint64_t[blocks.size()]);
797 const std::unique_ptr<uint64_t[]> fwd_hash_cpy(new uint64_t[blocks.size()]);
798 const std::unique_ptr<uint64_t[]> rev_hash_cpy(new uint64_t[blocks.size()]);
799 std::memcpy(fwd_hash_nomonos_cpy.get(),
800 fwd_hash_nomonos.get(),
801 blocks.size() * sizeof(uint64_t));
802 std::memcpy(rev_hash_nomonos_cpy.get(),
803 rev_hash_nomonos.get(),
804 blocks.size() * sizeof(uint64_t));
805 std::memcpy(
806 fwd_hash_cpy.get(), fwd_hash.get(), blocks.size() * sizeof(uint64_t));
807 std::memcpy(
808 rev_hash_cpy.get(), rev_hash.get(), blocks.size() * sizeof(uint64_t));
809 ntmsm64l(seq + pos - 1,
810 char_in,
811 blocks,
812 monomers,
813 k,
814 blocks.size(),
815 num_hashes_per_seed,
816 fwd_hash_nomonos_cpy.get(),
817 rev_hash_nomonos_cpy.get(),
818 fwd_hash_cpy.get(),
819 rev_hash_cpy.get(),
820 hash_arr.get());
821 return true;
822 }
823
828 const uint64_t* hashes() const { return hash_arr.get(); }
829
835 size_t get_pos() const { return pos; }
836
841 unsigned get_hash_num() const { return num_hashes_per_seed * blocks.size(); }
842
847 hashing_internals::NUM_HASHES_TYPE get_hash_num_per_seed() const
848 {
849 return num_hashes_per_seed;
850 }
851
856 hashing_internals::K_TYPE get_k() const { return k; }
857
862 uint64_t* get_forward_hash() const { return fwd_hash.get(); }
863
868 uint64_t* get_reverse_hash() const { return rev_hash.get(); }
869
870private:
871 const char* seq;
872 const size_t seq_len;
873 hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed;
874 hashing_internals::K_TYPE k;
875 size_t pos;
876 bool initialized;
877 std::vector<hashing_internals::SpacedSeedBlocks> blocks;
878 std::vector<hashing_internals::SpacedSeedMonomers> monomers;
879 std::unique_ptr<uint64_t[]> fwd_hash_nomonos;
880 std::unique_ptr<uint64_t[]> rev_hash_nomonos;
881 std::unique_ptr<uint64_t[]> fwd_hash;
882 std::unique_ptr<uint64_t[]> rev_hash;
883 std::unique_ptr<uint64_t[]> hash_arr;
884
889 bool init()
890 {
891 unsigned pos_n = 0;
892 while (pos < seq_len - k + 1 && !ntmsm64(seq + pos,
893 blocks,
894 monomers,
895 k,
896 blocks.size(),
897 num_hashes_per_seed,
898 fwd_hash_nomonos.get(),
899 rev_hash_nomonos.get(),
900 fwd_hash.get(),
901 rev_hash.get(),
902 pos_n,
903 hash_arr.get())) {
904 pos += pos_n + 1;
905 }
906 if (pos > seq_len - k) {
907 return false;
908 }
909 initialized = true;
910 return true;
911 }
912};
913
919{
920
921public:
932 BlindSeedNtHash(const char* seq,
933 const std::vector<std::string>& seeds,
934 hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
935 hashing_internals::K_TYPE k,
936 long pos = 0)
937 : seq(seq + pos, seq + pos + k)
938 , num_hashes_per_seed(num_hashes_per_seed)
939 , k(k)
940 , pos(pos)
941 , fwd_hash_nomonos(new uint64_t[seeds.size()])
942 , rev_hash_nomonos(new uint64_t[seeds.size()])
943 , fwd_hash(new uint64_t[seeds.size()])
944 , rev_hash(new uint64_t[seeds.size()])
945 , hash_arr(new uint64_t[num_hashes_per_seed * seeds.size()])
946 {
947 check_seeds(seeds, k);
948 get_blocks(seeds, blocks, monomers);
949 unsigned pos_n = 0;
950 ntmsm64(seq + pos,
951 blocks,
952 monomers,
953 k,
954 blocks.size(),
955 num_hashes_per_seed,
956 fwd_hash_nomonos.get(),
957 rev_hash_nomonos.get(),
958 fwd_hash.get(),
959 rev_hash.get(),
960 pos_n,
961 hash_arr.get());
962 }
963
964 BlindSeedNtHash(const BlindSeedNtHash& seed_nthash)
965 : seq(seed_nthash.seq)
966 , num_hashes_per_seed(seed_nthash.num_hashes_per_seed)
967 , k(seed_nthash.k)
968 , pos(seed_nthash.pos)
969 , blocks(seed_nthash.blocks)
970 , monomers(seed_nthash.monomers)
971 , fwd_hash_nomonos(new uint64_t[seed_nthash.blocks.size()])
972 , rev_hash_nomonos(new uint64_t[seed_nthash.blocks.size()])
973 , fwd_hash(new uint64_t[seed_nthash.blocks.size()])
974 , rev_hash(new uint64_t[seed_nthash.blocks.size()])
975 , hash_arr(new uint64_t[num_hashes_per_seed * seed_nthash.blocks.size()])
976 {
977 std::memcpy(fwd_hash_nomonos.get(),
978 seed_nthash.fwd_hash_nomonos.get(),
979 seed_nthash.blocks.size() * sizeof(uint64_t));
980 std::memcpy(rev_hash_nomonos.get(),
981 seed_nthash.rev_hash_nomonos.get(),
982 seed_nthash.blocks.size() * sizeof(uint64_t));
983 std::memcpy(fwd_hash.get(),
984 seed_nthash.fwd_hash.get(),
985 seed_nthash.blocks.size() * sizeof(uint64_t));
986 std::memcpy(rev_hash.get(),
987 seed_nthash.rev_hash.get(),
988 seed_nthash.blocks.size() * sizeof(uint64_t));
989 std::memcpy(hash_arr.get(),
990 seed_nthash.hash_arr.get(),
991 num_hashes_per_seed * seed_nthash.blocks.size() *
992 sizeof(uint64_t));
993 }
994
995 BlindSeedNtHash(BlindSeedNtHash&&) = default;
996
1002 void roll(char char_in)
1003 {
1004 seq.push_back(char_in);
1005 ntmsm64(seq,
1006 blocks,
1007 monomers,
1008 k,
1009 blocks.size(),
1010 num_hashes_per_seed,
1011 fwd_hash_nomonos.get(),
1012 rev_hash_nomonos.get(),
1013 fwd_hash.get(),
1014 rev_hash.get(),
1015 hash_arr.get());
1016 seq.pop_front();
1017 ++pos;
1018 }
1019
1023 void roll_back(char char_in)
1024 {
1025 seq.push_front(char_in);
1026 ntmsm64l(seq,
1027 blocks,
1028 monomers,
1029 k,
1030 blocks.size(),
1031 num_hashes_per_seed,
1032 fwd_hash_nomonos.get(),
1033 rev_hash_nomonos.get(),
1034 fwd_hash.get(),
1035 rev_hash.get(),
1036 hash_arr.get());
1037 seq.pop_back();
1038 --pos;
1039 }
1040
1045 const uint64_t* hashes() const { return hash_arr.get(); }
1046
1052 long get_pos() const { return pos; }
1053
1058 unsigned get_hash_num() const { return num_hashes_per_seed * blocks.size(); }
1059
1064 hashing_internals::NUM_HASHES_TYPE get_hash_num_per_seed() const
1065 {
1066 return num_hashes_per_seed;
1067 }
1068
1073 hashing_internals::K_TYPE get_k() const { return k; }
1074
1079 uint64_t* get_forward_hash() const { return fwd_hash.get(); }
1080
1085 uint64_t* get_reverse_hash() const { return rev_hash.get(); }
1086
1087private:
1088 std::deque<char> seq;
1089 hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed;
1090 hashing_internals::K_TYPE k;
1091 long pos;
1092 std::vector<hashing_internals::SpacedSeedBlocks> blocks;
1093 std::vector<hashing_internals::SpacedSeedMonomers> monomers;
1094 std::unique_ptr<uint64_t[]> fwd_hash_nomonos;
1095 std::unique_ptr<uint64_t[]> rev_hash_nomonos;
1096 std::unique_ptr<uint64_t[]> fwd_hash;
1097 std::unique_ptr<uint64_t[]> rev_hash;
1098 std::unique_ptr<uint64_t[]> hash_arr;
1099};
1100
1101} // namespace btllib
Definition nthash_seed.hpp:919
void roll_back(char char_in)
Definition nthash_seed.hpp:1023
BlindSeedNtHash(const char *seq, const std::vector< std::string > &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, long pos=0)
Definition nthash_seed.hpp:932
void roll(char char_in)
Definition nthash_seed.hpp:1002
uint64_t * get_forward_hash() const
Definition nthash_seed.hpp:1079
hashing_internals::K_TYPE get_k() const
Definition nthash_seed.hpp:1073
long get_pos() const
Definition nthash_seed.hpp:1052
hashing_internals::NUM_HASHES_TYPE get_hash_num_per_seed() const
Definition nthash_seed.hpp:1064
unsigned get_hash_num() const
Definition nthash_seed.hpp:1058
uint64_t * get_reverse_hash() const
Definition nthash_seed.hpp:1085
const uint64_t * hashes() const
Definition nthash_seed.hpp:1045
Definition nthash_seed.hpp:517
bool peek_back(char char_in)
Definition nthash_seed.hpp:788
uint64_t * get_forward_hash() const
Definition nthash_seed.hpp:862
bool roll_back()
Definition nthash_seed.hpp:689
uint64_t * get_reverse_hash() const
Definition nthash_seed.hpp:868
bool peek_back()
Definition nthash_seed.hpp:736
size_t get_pos() const
Definition nthash_seed.hpp:835
SeedNtHash(const std::string &seq, const std::vector< std::string > &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, size_t pos=0)
Definition nthash_seed.hpp:563
SeedNtHash(const char *seq, size_t seq_len, const std::vector< std::vector< unsigned > > &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, size_t pos=0)
Definition nthash_seed.hpp:582
SeedNtHash(const std::string &seq, const std::vector< std::vector< unsigned > > &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, size_t pos=0)
Definition nthash_seed.hpp:612
bool peek()
Definition nthash_seed.hpp:724
bool roll()
Definition nthash_seed.hpp:658
unsigned get_hash_num() const
Definition nthash_seed.hpp:841
bool peek(char char_in)
Definition nthash_seed.hpp:748
SeedNtHash(const char *seq, size_t seq_len, const std::vector< std::string > &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, size_t pos=0)
Definition nthash_seed.hpp:530
hashing_internals::K_TYPE get_k() const
Definition nthash_seed.hpp:856
const uint64_t * hashes() const
Definition nthash_seed.hpp:828
hashing_internals::NUM_HASHES_TYPE get_hash_num_per_seed() const
Definition nthash_seed.hpp:847
Definition aahash.hpp:12
void check_error(bool condition, const std::string &msg)
void check_warning(bool condition, const std::string &msg)
std::vector< std::vector< unsigned > > parse_seeds(const std::vector< std::string > &seed_strings)
Definition nthash_seed.hpp:496