btllib
Loading...
Searching...
No Matches
counting_bloom_filter.hpp
1#ifndef BTLLIB_COUNTING_BLOOM_FILTER_HPP
2#define BTLLIB_COUNTING_BLOOM_FILTER_HPP
3
4#include "btllib/bloom_filter.hpp"
5#include "btllib/counting_bloom_filter.hpp"
6#include "btllib/nthash.hpp"
7#include "btllib/status.hpp"
8
9// clang-format off
10// NOLINTBEGIN llvm-include-order
11#include <limits>
12#include "cpptoml.h"
13// NOLINTEND llvm-include-order
14// clang-format on
15
16#include <atomic>
17#include <cmath>
18#include <cstdint>
19#include <fstream>
20#include <memory>
21#include <string>
22#include <vector>
23
24namespace btllib {
25
26// NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration)
27static const char* const COUNTING_BLOOM_FILTER_SIGNATURE =
28 "[BTLCountingBloomFilter_v5]";
29// NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration)
30static const char* const KMER_COUNTING_BLOOM_FILTER_SIGNATURE =
31 "[BTLKmerCountingBloomFilter_v5]";
32
33template<typename T>
34class KmerCountingBloomFilter;
35
41template<typename T>
43{
44
45public:
48
56 CountingBloomFilter(size_t bytes,
57 unsigned hash_num,
58 std::string hash_fn = "");
59
65 explicit CountingBloomFilter(const std::string& path);
66
69
70 CountingBloomFilter& operator=(const CountingBloomFilter&) = delete;
71 CountingBloomFilter& operator=(CountingBloomFilter&&) = delete;
72
80 void insert(const uint64_t* hashes, T n = 1);
81
88 void insert(const std::vector<uint64_t>& hashes, T n = 1)
89 {
90 insert(hashes.data(), n);
91 }
92
99 void remove(const uint64_t* hashes);
100
106 void remove(const std::vector<uint64_t>& hashes) { remove(hashes.data()); }
107
114 void clear(const uint64_t* hashes);
115
121 void clear(const std::vector<uint64_t>& hashes) { clear(hashes.data()); }
122
131 T contains(const uint64_t* hashes) const;
132
140 T contains(const std::vector<uint64_t>& hashes) const
141 {
142 return contains(hashes.data());
143 }
144
154 T contains_insert(const uint64_t* hashes, T n = 1);
155
164 T contains_insert(const std::vector<uint64_t>& hashes, T n = 1)
165 {
166 return contains_insert(hashes.data(), n);
167 }
168
179 T insert_contains(const uint64_t* hashes, T n = 1);
180
189 T insert_contains(const std::vector<uint64_t>& hashes, T n = 1)
190 {
191 return insert_contains(hashes.data(), n);
192 }
193
205 T insert_thresh_contains(const uint64_t* hashes, T threshold);
206
218 T insert_thresh_contains(const std::vector<uint64_t>& hashes,
219 const T threshold)
220 {
221 return insert_thresh_contains(hashes.data(), threshold);
222 }
223
235 T contains_insert_thresh(const uint64_t* hashes, T threshold);
236
246 T contains_insert_thresh(const std::vector<uint64_t>& hashes,
247 const T threshold)
248 {
249 return contains_insert_thresh(hashes.data(), threshold);
250 }
251
253 size_t get_bytes() const { return bytes; }
256 uint64_t get_pop_cnt(T threshold = 1) const;
258 double get_occupancy(T threshold = 1) const;
260 unsigned get_hash_num() const { return hash_num; }
267 double get_fpr(T threshold = 1) const;
269 const std::string& get_hash_fn() const { return hash_fn; }
270
276 void save(const std::string& path);
277
283 static bool is_bloom_file(const std::string& path)
284 {
285 return btllib::BloomFilter::check_file_signature(
286 path, COUNTING_BLOOM_FILTER_SIGNATURE);
287 }
288
289private:
290 CountingBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
291
292 void set(const uint64_t* hashes, T min_val, T new_val);
293
294 friend class KmerCountingBloomFilter<T>;
295
296 size_t bytes = 0;
297 size_t array_size = 0;
298 unsigned hash_num = 0;
299 std::string hash_fn;
300 std::unique_ptr<std::atomic<T>[]> array;
301};
302
308template<typename T>
310{
311
312public:
315
323 KmerCountingBloomFilter(size_t bytes, unsigned hash_num, unsigned k);
324
330 explicit KmerCountingBloomFilter(const std::string& path);
331
334
335 KmerCountingBloomFilter& operator=(const KmerCountingBloomFilter&) = delete;
337
344 void insert(const char* seq, size_t seq_len);
345
351 void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
352
360 void insert(const uint64_t* hashes, T n = 1)
361 {
362 counting_bloom_filter.insert(hashes, n);
363 }
364
371 void insert(const std::vector<uint64_t>& hashes, T n = 1)
372 {
373 counting_bloom_filter.insert(hashes, n);
374 }
375
382 void remove(const char* seq, size_t seq_len);
383
389 void remove(const std::string& seq) { remove(seq.c_str(), seq.size()); }
390
397 void remove(const uint64_t* hashes) { counting_bloom_filter.remove(hashes); }
398
404 void remove(const std::vector<uint64_t>& hashes)
405 {
406 counting_bloom_filter.remove(hashes);
407 }
408
415 void clear(const char* seq, size_t seq_len);
416
422 void clear(const std::string& seq) { clear(seq.c_str(), seq.size()); }
423
430 void clear(const uint64_t* hashes) { counting_bloom_filter.clear(hashes); }
431
437 void clear(const std::vector<uint64_t>& hashes)
438 {
439 counting_bloom_filter.clear(hashes);
440 }
441
450 uint64_t contains(const char* seq, size_t seq_len) const;
451
459 uint64_t contains(const std::string& seq) const
460 {
461 return contains(seq.c_str(), seq.size());
462 }
463
472 T contains(const uint64_t* hashes) const
473 {
474 return counting_bloom_filter.contains(hashes);
475 }
476
484 T contains(const std::vector<uint64_t>& hashes) const
485 {
486 return counting_bloom_filter.contains(hashes);
487 }
488
497 T contains_insert(const char* seq, size_t seq_len);
498
506 T contains_insert(const std::string& seq)
507 {
508 return contains_insert(seq.c_str(), seq.size());
509 }
510
520 T contains_insert(const uint64_t* hashes, T n = 1)
521 {
522 return counting_bloom_filter.contains_insert(hashes, n);
523 }
524
533 T contains_insert(const std::vector<uint64_t>& hashes, T n = 1)
534 {
535 return counting_bloom_filter.contains_insert(hashes, n);
536 }
537
546 T insert_contains(const char* seq, size_t seq_len);
547
555 T insert_contains(const std::string& seq)
556 {
557 return insert_contains(seq.c_str(), seq.size());
558 }
559
570 T insert_contains(const uint64_t* hashes, T n = 1)
571 {
572 return counting_bloom_filter.insert_contains(hashes, n);
573 }
574
583 T insert_contains(const std::vector<uint64_t>& hashes, T n = 1)
584 {
585 return counting_bloom_filter.insert_contains(hashes, n);
586 }
587
598 T insert_thresh_contains(const char* seq, size_t seq_len, T threshold);
599
609 T insert_thresh_contains(const std::string& seq, const T threshold)
610 {
611 return insert_thresh_contains(seq.c_str(), seq.size(), threshold);
612 }
613
625 T insert_thresh_contains(const uint64_t* hashes, const T threshold)
626 {
627 return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
628 }
629
641 T insert_thresh_contains(const std::vector<uint64_t>& hashes,
642 const T threshold)
643 {
644 return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
645 }
646
657 T contains_insert_thresh(const char* seq, size_t seq_len, T threshold);
658
668 T contains_insert_thresh(const std::string& seq, const T threshold)
669 {
670 return contains_insert_thresh(seq.c_str(), seq.size(), threshold);
671 }
672
684 T contains_insert_thresh(const uint64_t* hashes, const T threshold)
685 {
686 return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
687 }
688
698 T contains_insert_thresh(const std::vector<uint64_t>& hashes,
699 const T threshold)
700 {
701 return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
702 }
703
705 size_t get_bytes() const { return counting_bloom_filter.get_bytes(); }
707 uint64_t get_pop_cnt(T threshold = 1) const
708 {
709 return counting_bloom_filter.get_pop_cnt(threshold);
710 }
712 double get_occupancy(T threshold = 1) const
713 {
714 return counting_bloom_filter.get_occupancy(threshold);
715 }
717 unsigned get_hash_num() const { return counting_bloom_filter.get_hash_num(); }
724 double get_fpr(T threshold = 1) const
725 {
726 return counting_bloom_filter.get_fpr(threshold);
727 }
729 unsigned get_k() const { return k; }
731 const std::string& get_hash_fn() const
732 {
733 return counting_bloom_filter.get_hash_fn();
734 }
737 {
738 return counting_bloom_filter;
739 }
740
746 void save(const std::string& path);
747
754 static bool is_bloom_file(const std::string& path)
755 {
756 return btllib::BloomFilter::check_file_signature(
757 path, KMER_COUNTING_BLOOM_FILTER_SIGNATURE);
758 }
759
760private:
761 KmerCountingBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
762
763 unsigned k = 0;
764 CountingBloomFilter<T> counting_bloom_filter;
765};
766
767} // namespace btllib
768
769#include "counting_bloom_filter-inl.hpp"
770
771#endif
Definition counting_bloom_filter.hpp:43
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition counting_bloom_filter.hpp:218
void remove(const uint64_t *hashes)
Definition counting_bloom_filter-inl.hpp:89
double get_occupancy(T threshold=1) const
Definition counting_bloom_filter-inl.hpp:184
void insert(const uint64_t *hashes, T n=1)
Definition counting_bloom_filter-inl.hpp:82
const std::string & get_hash_fn() const
Definition counting_bloom_filter.hpp:269
static bool is_bloom_file(const std::string &path)
Definition counting_bloom_filter.hpp:283
double get_fpr(T threshold=1) const
Definition counting_bloom_filter-inl.hpp:191
uint64_t get_pop_cnt(T threshold=1) const
Definition counting_bloom_filter-inl.hpp:167
void save(const std::string &path)
Definition counting_bloom_filter-inl.hpp:232
T contains_insert_thresh(const uint64_t *hashes, T threshold)
Definition counting_bloom_filter-inl.hpp:155
T contains(const std::vector< uint64_t > &hashes) const
Definition counting_bloom_filter.hpp:140
void remove(const std::vector< uint64_t > &hashes)
Definition counting_bloom_filter.hpp:106
size_t get_bytes() const
Definition counting_bloom_filter.hpp:253
T contains_insert(const std::vector< uint64_t > &hashes, T n=1)
Definition counting_bloom_filter.hpp:164
unsigned get_hash_num() const
Definition counting_bloom_filter.hpp:260
void insert(const std::vector< uint64_t > &hashes, T n=1)
Definition counting_bloom_filter.hpp:88
T contains(const uint64_t *hashes) const
Definition counting_bloom_filter-inl.hpp:105
CountingBloomFilter()
Definition counting_bloom_filter.hpp:47
void clear(const std::vector< uint64_t > &hashes)
Definition counting_bloom_filter.hpp:121
void clear(const uint64_t *hashes)
Definition counting_bloom_filter-inl.hpp:97
T insert_contains(const std::vector< uint64_t > &hashes, T n=1)
Definition counting_bloom_filter.hpp:189
T insert_contains(const uint64_t *hashes, T n=1)
Definition counting_bloom_filter-inl.hpp:130
T contains_insert(const uint64_t *hashes, T n=1)
Definition counting_bloom_filter-inl.hpp:119
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition counting_bloom_filter.hpp:246
T insert_thresh_contains(const uint64_t *hashes, T threshold)
Definition counting_bloom_filter-inl.hpp:142
Definition counting_bloom_filter.hpp:310
uint64_t contains(const char *seq, size_t seq_len) const
Definition counting_bloom_filter-inl.hpp:299
void clear(const uint64_t *hashes)
Definition counting_bloom_filter.hpp:430
void clear(const std::string &seq)
Definition counting_bloom_filter.hpp:422
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition counting_bloom_filter.hpp:641
void clear(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:289
void save(const std::string &path)
Definition counting_bloom_filter-inl.hpp:388
void insert(const std::vector< uint64_t > &hashes, T n=1)
Definition counting_bloom_filter.hpp:371
CountingBloomFilter< T > & get_counting_bloom_filter()
Definition counting_bloom_filter.hpp:736
T insert_contains(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:323
T insert_thresh_contains(const std::string &seq, const T threshold)
Definition counting_bloom_filter.hpp:609
T contains_insert_thresh(const uint64_t *hashes, const T threshold)
Definition counting_bloom_filter.hpp:684
T insert_thresh_contains(const char *seq, size_t seq_len, T threshold)
Definition counting_bloom_filter-inl.hpp:335
T contains(const uint64_t *hashes) const
Definition counting_bloom_filter.hpp:472
void remove(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:279
T contains_insert(const std::string &seq)
Definition counting_bloom_filter.hpp:506
size_t get_bytes() const
Definition counting_bloom_filter.hpp:705
T contains_insert_thresh(const char *seq, size_t seq_len, T threshold)
Definition counting_bloom_filter-inl.hpp:350
T insert_thresh_contains(const uint64_t *hashes, const T threshold)
Definition counting_bloom_filter.hpp:625
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition counting_bloom_filter.hpp:698
T insert_contains(const std::string &seq)
Definition counting_bloom_filter.hpp:555
void remove(const std::vector< uint64_t > &hashes)
Definition counting_bloom_filter.hpp:404
const std::string & get_hash_fn() const
Definition counting_bloom_filter.hpp:731
T insert_contains(const std::vector< uint64_t > &hashes, T n=1)
Definition counting_bloom_filter.hpp:583
void insert(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:269
double get_occupancy(T threshold=1) const
Definition counting_bloom_filter.hpp:712
T contains_insert(const std::vector< uint64_t > &hashes, T n=1)
Definition counting_bloom_filter.hpp:533
T contains_insert(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:311
unsigned get_hash_num() const
Definition counting_bloom_filter.hpp:717
uint64_t get_pop_cnt(T threshold=1) const
Definition counting_bloom_filter.hpp:707
void insert(const uint64_t *hashes, T n=1)
Definition counting_bloom_filter.hpp:360
double get_fpr(T threshold=1) const
Definition counting_bloom_filter.hpp:724
void remove(const std::string &seq)
Definition counting_bloom_filter.hpp:389
unsigned get_k() const
Definition counting_bloom_filter.hpp:729
void insert(const std::string &seq)
Definition counting_bloom_filter.hpp:351
T insert_contains(const uint64_t *hashes, T n=1)
Definition counting_bloom_filter.hpp:570
uint64_t contains(const std::string &seq) const
Definition counting_bloom_filter.hpp:459
void remove(const uint64_t *hashes)
Definition counting_bloom_filter.hpp:397
void clear(const std::vector< uint64_t > &hashes)
Definition counting_bloom_filter.hpp:437
T contains_insert_thresh(const std::string &seq, const T threshold)
Definition counting_bloom_filter.hpp:668
static bool is_bloom_file(const std::string &path)
Definition counting_bloom_filter.hpp:754
KmerCountingBloomFilter()
Definition counting_bloom_filter.hpp:314
T contains_insert(const uint64_t *hashes, T n=1)
Definition counting_bloom_filter.hpp:520
T contains(const std::vector< uint64_t > &hashes) const
Definition counting_bloom_filter.hpp:484
Definition aahash.hpp:12