ntHash
Loading...
Searching...
No Matches
nthash.hpp
1#pragma once
2
#include <sys/types.h>

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <deque>
#include <memory>
#include <string>
#include <string_view>
#include <vector>
10
11namespace nthash {
12
/**
 * Identifier string for the hash function exposed by this header.
 *
 * Declared `inline constexpr` so that every translation unit including this
 * header refers to one and the same object instead of a private copy.
 */
inline constexpr const char* NTHASH_FN_NAME = "ntHash_v2";
19
/**
 * Integer and container aliases shared by the hasher classes.
 */
namespace typedefs {
/** Counter type for the number of hashes; 8 bits wide, so at most 255. */
using NUM_HASHES_TYPE = uint8_t;
/** Type used for k; 16 bits wide, so at most 65535. */
using K_TYPE = uint16_t;
/**
 * A list of two-element unsigned arrays.
 * NOTE(review): presumably each pair delimits one block of a spaced seed --
 * confirm against the implementation.
 */
using SpacedSeedBlocks = std::vector<std::array<unsigned, 2>>;
/**
 * A flat list of unsigned values.
 * NOTE(review): presumably single-position entries of a spaced seed --
 * confirm against the implementation.
 */
using SpacedSeedMonomers = std::vector<unsigned>;
} // namespace typedefs
30
// Forward declarations of the four hasher classes defined further down in
// this header.

/** Hasher whose members include a view of the sequence and a fixed k. */
class NtHash;

/** Hasher whose `roll`/`peek` operations are fed one character at a time. */
class BlindNtHash;

/** Hasher constructed from a sequence and a list of seeds. */
class SeedNtHash;

/** Seed-based hasher whose `roll` operations are fed one character at a time. */
class BlindSeedNtHash;
54
/**
 * Convert textual seeds into numeric form.
 *
 * Only the declaration lives in this header; the definition is elsewhere.
 * NOTE(review): presumably the result holds one inner vector per input
 * string -- confirm against the implementation.
 *
 * @param seed_strings Seeds in string form.
 * @return A vector of unsigned vectors derived from `seed_strings`.
 */
std::vector<std::vector<unsigned>>
parse_seeds(const std::vector<std::string>& seed_strings);
61
62class NtHash
63{
64
65public:
74 NtHash(const char* seq,
75 size_t seq_len,
76 typedefs::NUM_HASHES_TYPE num_hashes,
77 typedefs::K_TYPE k,
78 size_t pos = 0);
79
87 NtHash(const std::string& seq,
88 typedefs::NUM_HASHES_TYPE num_hashes,
89 typedefs::K_TYPE k,
90 size_t pos = 0)
91 : NtHash(seq.data(), seq.size(), num_hashes, k, pos)
92 {
93 }
94
95 NtHash(const NtHash& obj)
96 : seq(obj.seq)
97 , num_hashes(obj.num_hashes)
98 , k(obj.k)
99 , pos(obj.pos)
100 , initialized(obj.initialized)
101 , fwd_hash(obj.fwd_hash)
102 , rev_hash(obj.rev_hash)
103 , hash_arr(new uint64_t[obj.num_hashes])
104 {
105 std::memcpy(
106 hash_arr.get(), obj.hash_arr.get(), num_hashes * sizeof(uint64_t));
107 }
108
109 NtHash(NtHash&&) = default;
110
123 bool roll();
124
129 bool roll_back();
130
137 bool peek();
138
143 bool peek_back();
144
151 bool peek(char char_in);
152
157 bool peek_back(char char_in);
158
163 const uint64_t* hashes() const { return hash_arr.get(); }
164
170 size_t get_pos() const { return pos; }
171
176 typedefs::NUM_HASHES_TYPE get_hash_num() const { return num_hashes; }
177
182 typedefs::K_TYPE get_k() const { return k; }
183
188 uint64_t get_forward_hash() const { return fwd_hash; }
189
194 uint64_t get_reverse_hash() const { return rev_hash; }
195
196private:
197 std::string_view seq;
198 typedefs::NUM_HASHES_TYPE num_hashes;
199 typedefs::K_TYPE k;
200 size_t pos;
201 bool initialized;
202 uint64_t fwd_hash = 0;
203 uint64_t rev_hash = 0;
204 std::unique_ptr<uint64_t[]> hash_arr;
205
210 bool init();
211};
212
214{
215
216public:
225 BlindNtHash(const char* seq,
226 typedefs::NUM_HASHES_TYPE num_hashes,
227 typedefs::K_TYPE k,
228 ssize_t pos = 0);
229
230 BlindNtHash(const BlindNtHash& obj)
231 : seq(obj.seq)
232 , num_hashes(obj.num_hashes)
233 , pos(obj.pos)
234 , fwd_hash(obj.fwd_hash)
235 , rev_hash(obj.rev_hash)
236 , hash_arr(new uint64_t[obj.num_hashes])
237 {
238 std::memcpy(
239 hash_arr.get(), obj.hash_arr.get(), num_hashes * sizeof(uint64_t));
240 }
241
242 BlindNtHash(BlindNtHash&&) = default;
243
250 void roll(char char_in);
251
255 void roll_back(char char_in);
256
260 void peek(char char_in);
261
265 void peek_back(char char_in);
266
271 const uint64_t* hashes() const { return hash_arr.get(); }
272
278 ssize_t get_pos() const { return pos; }
279
284 typedefs::NUM_HASHES_TYPE get_hash_num() const { return num_hashes; }
285
290 typedefs::K_TYPE get_k() const { return seq.size(); }
291
296 uint64_t get_forward_hash() const { return fwd_hash; }
297
302 uint64_t get_reverse_hash() const { return rev_hash; }
303
304private:
305 std::deque<char> seq;
306 typedefs::NUM_HASHES_TYPE num_hashes;
307 ssize_t pos;
308 uint64_t fwd_hash = 0;
309 uint64_t rev_hash = 0;
310 std::unique_ptr<uint64_t[]> hash_arr;
311};
312
314{
315
316public:
327 SeedNtHash(const char* seq,
328 size_t seq_len,
329 const std::vector<std::string>& seeds,
330 typedefs::NUM_HASHES_TYPE num_hashes_per_seed,
331 typedefs::K_TYPE k,
332 size_t pos = 0);
333
343 SeedNtHash(const std::string& seq,
344 const std::vector<std::string>& seeds,
345 typedefs::NUM_HASHES_TYPE num_hashes_per_seed,
346 typedefs::K_TYPE k,
347 size_t pos = 0)
348 : SeedNtHash(seq.data(), seq.size(), seeds, num_hashes_per_seed, k, pos)
349 {
350 }
351
362 SeedNtHash(const char* seq,
363 size_t seq_len,
364 const std::vector<std::vector<unsigned>>& seeds,
365 typedefs::NUM_HASHES_TYPE num_hashes_per_seed,
366 typedefs::K_TYPE k,
367 size_t pos = 0);
368
378 SeedNtHash(const std::string& seq,
379 const std::vector<std::vector<unsigned>>& seeds,
380 typedefs::NUM_HASHES_TYPE num_hashes_per_seed,
381 typedefs::K_TYPE k,
382 size_t pos = 0)
383 : SeedNtHash(seq.data(), seq.size(), seeds, num_hashes_per_seed, k, pos)
384 {
385 }
386
387 SeedNtHash(const SeedNtHash& obj)
388 : seq(obj.seq)
389 , num_hashes_per_seed(obj.num_hashes_per_seed)
390 , k(obj.k)
391 , pos(obj.pos)
392 , initialized(obj.initialized)
393 , blocks(obj.blocks)
394 , monomers(obj.monomers)
395 , fwd_hash_nomonos(new uint64_t[obj.blocks.size()])
396 , rev_hash_nomonos(new uint64_t[obj.blocks.size()])
397 , fwd_hash(new uint64_t[obj.blocks.size()])
398 , rev_hash(new uint64_t[obj.blocks.size()])
399 , hash_arr(new uint64_t[obj.num_hashes_per_seed * obj.blocks.size()])
400 {
401 std::memcpy(fwd_hash_nomonos.get(),
402 obj.fwd_hash_nomonos.get(),
403 obj.blocks.size() * sizeof(uint64_t));
404 std::memcpy(rev_hash_nomonos.get(),
405 obj.rev_hash_nomonos.get(),
406 obj.blocks.size() * sizeof(uint64_t));
407 std::memcpy(
408 fwd_hash.get(), obj.fwd_hash.get(), obj.blocks.size() * sizeof(uint64_t));
409 std::memcpy(
410 rev_hash.get(), obj.rev_hash.get(), obj.blocks.size() * sizeof(uint64_t));
411 std::memcpy(hash_arr.get(),
412 obj.hash_arr.get(),
413 obj.num_hashes_per_seed * obj.blocks.size() * sizeof(uint64_t));
414 }
415
416 SeedNtHash(SeedNtHash&&) = default;
417
423 bool roll();
424
429 bool roll_back();
430
436 bool peek();
437
442 bool peek_back();
443
448 bool peek(char char_in);
449
454 bool peek_back(char char_in);
455
460 const uint64_t* hashes() const { return hash_arr.get(); }
461
467 size_t get_pos() const { return pos; }
468
473 unsigned get_hash_num() const { return num_hashes_per_seed * blocks.size(); }
474
479 typedefs::NUM_HASHES_TYPE get_hash_num_per_seed() const
480 {
481 return num_hashes_per_seed;
482 }
483
488 typedefs::K_TYPE get_k() const { return k; }
489
494 uint64_t* get_forward_hash() const { return fwd_hash.get(); }
495
500 uint64_t* get_reverse_hash() const { return rev_hash.get(); }
501
502private:
503 std::string_view seq;
504 typedefs::NUM_HASHES_TYPE num_hashes_per_seed;
505 typedefs::K_TYPE k;
506 size_t pos;
507 bool initialized;
508 std::vector<typedefs::SpacedSeedBlocks> blocks;
509 std::vector<typedefs::SpacedSeedMonomers> monomers;
510 std::unique_ptr<uint64_t[]> fwd_hash_nomonos;
511 std::unique_ptr<uint64_t[]> rev_hash_nomonos;
512 std::unique_ptr<uint64_t[]> fwd_hash;
513 std::unique_ptr<uint64_t[]> rev_hash;
514 std::unique_ptr<uint64_t[]> hash_arr;
515
520 bool init();
521};
522
524{
525
526public:
537 BlindSeedNtHash(const char* seq,
538 const std::vector<std::string>& seeds,
539 typedefs::NUM_HASHES_TYPE num_hashes_per_seed,
540 typedefs::K_TYPE k,
541 ssize_t pos = 0);
542
543 BlindSeedNtHash(const BlindSeedNtHash& seed_nthash)
544 : seq(seed_nthash.seq)
545 , num_hashes_per_seed(seed_nthash.num_hashes_per_seed)
546 , k(seed_nthash.k)
547 , pos(seed_nthash.pos)
548 , blocks(seed_nthash.blocks)
549 , monomers(seed_nthash.monomers)
550 , fwd_hash_nomonos(new uint64_t[seed_nthash.blocks.size()])
551 , rev_hash_nomonos(new uint64_t[seed_nthash.blocks.size()])
552 , fwd_hash(new uint64_t[seed_nthash.blocks.size()])
553 , rev_hash(new uint64_t[seed_nthash.blocks.size()])
554 , hash_arr(new uint64_t[num_hashes_per_seed * seed_nthash.blocks.size()])
555 {
556 std::memcpy(fwd_hash_nomonos.get(),
557 seed_nthash.fwd_hash_nomonos.get(),
558 seed_nthash.blocks.size() * sizeof(uint64_t));
559 std::memcpy(rev_hash_nomonos.get(),
560 seed_nthash.rev_hash_nomonos.get(),
561 seed_nthash.blocks.size() * sizeof(uint64_t));
562 std::memcpy(fwd_hash.get(),
563 seed_nthash.fwd_hash.get(),
564 seed_nthash.blocks.size() * sizeof(uint64_t));
565 std::memcpy(rev_hash.get(),
566 seed_nthash.rev_hash.get(),
567 seed_nthash.blocks.size() * sizeof(uint64_t));
568 std::memcpy(hash_arr.get(),
569 seed_nthash.hash_arr.get(),
570 num_hashes_per_seed * seed_nthash.blocks.size() *
571 sizeof(uint64_t));
572 }
573
574 BlindSeedNtHash(BlindSeedNtHash&&) = default;
575
581 void roll(char char_in);
582
586 void roll_back(char char_in);
587
592 const uint64_t* hashes() const { return hash_arr.get(); }
593
599 ssize_t get_pos() const { return pos; }
600
605 unsigned get_hash_num() const { return num_hashes_per_seed * blocks.size(); }
606
611 typedefs::NUM_HASHES_TYPE get_hash_num_per_seed() const
612 {
613 return num_hashes_per_seed;
614 }
615
620 typedefs::K_TYPE get_k() const { return k; }
621
626 uint64_t* get_forward_hash() const { return fwd_hash.get(); }
627
632 uint64_t* get_reverse_hash() const { return rev_hash.get(); }
633
634private:
635 std::deque<char> seq;
636 typedefs::NUM_HASHES_TYPE num_hashes_per_seed;
637 typedefs::K_TYPE k;
638 ssize_t pos;
639 std::vector<typedefs::SpacedSeedBlocks> blocks;
640 std::vector<typedefs::SpacedSeedMonomers> monomers;
641 std::unique_ptr<uint64_t[]> fwd_hash_nomonos;
642 std::unique_ptr<uint64_t[]> rev_hash_nomonos;
643 std::unique_ptr<uint64_t[]> fwd_hash;
644 std::unique_ptr<uint64_t[]> rev_hash;
645 std::unique_ptr<uint64_t[]> hash_arr;
646};
647
648} // namespace nthash
Definition nthash.hpp:214
void roll_back(char char_in)
typedefs::NUM_HASHES_TYPE get_hash_num() const
Definition nthash.hpp:284
void peek_back(char char_in)
uint64_t get_reverse_hash() const
Definition nthash.hpp:302
void roll(char char_in)
BlindNtHash(const char *seq, typedefs::NUM_HASHES_TYPE num_hashes, typedefs::K_TYPE k, ssize_t pos=0)
void peek(char char_in)
ssize_t get_pos() const
Definition nthash.hpp:278
uint64_t get_forward_hash() const
Definition nthash.hpp:296
typedefs::K_TYPE get_k() const
Definition nthash.hpp:290
const uint64_t * hashes() const
Definition nthash.hpp:271
Definition nthash.hpp:524
const uint64_t * hashes() const
Definition nthash.hpp:592
ssize_t get_pos() const
Definition nthash.hpp:599
unsigned get_hash_num() const
Definition nthash.hpp:605
typedefs::K_TYPE get_k() const
Definition nthash.hpp:620
void roll_back(char char_in)
void roll(char char_in)
uint64_t * get_reverse_hash() const
Definition nthash.hpp:632
BlindSeedNtHash(const char *seq, const std::vector< std::string > &seeds, typedefs::NUM_HASHES_TYPE num_hashes_per_seed, typedefs::K_TYPE k, ssize_t pos=0)
uint64_t * get_forward_hash() const
Definition nthash.hpp:626
typedefs::NUM_HASHES_TYPE get_hash_num_per_seed() const
Definition nthash.hpp:611
Definition nthash.hpp:63
typedefs::NUM_HASHES_TYPE get_hash_num() const
Definition nthash.hpp:176
uint64_t get_forward_hash() const
Definition nthash.hpp:188
bool peek(char char_in)
uint64_t get_reverse_hash() const
Definition nthash.hpp:194
NtHash(const std::string &seq, typedefs::NUM_HASHES_TYPE num_hashes, typedefs::K_TYPE k, size_t pos=0)
Definition nthash.hpp:87
NtHash(const char *seq, size_t seq_len, typedefs::NUM_HASHES_TYPE num_hashes, typedefs::K_TYPE k, size_t pos=0)
const uint64_t * hashes() const
Definition nthash.hpp:163
typedefs::K_TYPE get_k() const
Definition nthash.hpp:182
bool peek_back(char char_in)
size_t get_pos() const
Definition nthash.hpp:170
Definition nthash.hpp:314
size_t get_pos() const
Definition nthash.hpp:467
SeedNtHash(const std::string &seq, const std::vector< std::vector< unsigned > > &seeds, typedefs::NUM_HASHES_TYPE num_hashes_per_seed, typedefs::K_TYPE k, size_t pos=0)
Definition nthash.hpp:378
SeedNtHash(const char *seq, size_t seq_len, const std::vector< std::string > &seeds, typedefs::NUM_HASHES_TYPE num_hashes_per_seed, typedefs::K_TYPE k, size_t pos=0)
bool peek_back(char char_in)
uint64_t * get_forward_hash() const
Definition nthash.hpp:494
bool peek(char char_in)
const uint64_t * hashes() const
Definition nthash.hpp:460
unsigned get_hash_num() const
Definition nthash.hpp:473
SeedNtHash(const char *seq, size_t seq_len, const std::vector< std::vector< unsigned > > &seeds, typedefs::NUM_HASHES_TYPE num_hashes_per_seed, typedefs::K_TYPE k, size_t pos=0)
typedefs::K_TYPE get_k() const
Definition nthash.hpp:488
SeedNtHash(const std::string &seq, const std::vector< std::string > &seeds, typedefs::NUM_HASHES_TYPE num_hashes_per_seed, typedefs::K_TYPE k, size_t pos=0)
Definition nthash.hpp:343
uint64_t * get_reverse_hash() const
Definition nthash.hpp:500
typedefs::NUM_HASHES_TYPE get_hash_num_per_seed() const
Definition nthash.hpp:479