btllib
 All Classes Namespaces Functions Variables
counting_bloom_filter.hpp
1 #ifndef BTLLIB_COUNTING_BLOOM_FILTER_HPP
2 #define BTLLIB_COUNTING_BLOOM_FILTER_HPP
3 
4 #include "btllib/bloom_filter.hpp"
5 #include "btllib/counting_bloom_filter.hpp"
6 #include "btllib/nthash.hpp"
7 #include "btllib/status.hpp"
8 
9 // clang-format off
10 // NOLINTBEGIN llvm-include-order
11 #include <limits>
12 #include "cpptoml.h"
13 // NOLINTEND llvm-include-order
14 // clang-format on
15 
16 #include <atomic>
17 #include <cmath>
18 #include <cstdint>
19 #include <fstream>
20 #include <memory>
21 #include <string>
22 #include <vector>
23 
24 namespace btllib {
25 
26 // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration)
27 static const char* const COUNTING_BLOOM_FILTER_SIGNATURE =
28  "[BTLCountingBloomFilter_v5]";
29 // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration)
30 static const char* const KMER_COUNTING_BLOOM_FILTER_SIGNATURE =
31  "[BTLKmerCountingBloomFilter_v5]";
32 
33 template<typename T>
35 
41 template<typename T>
43 {
44 
45 public:
48 
56  CountingBloomFilter(size_t bytes,
57  unsigned hash_num,
58  std::string hash_fn = "");
59 
65  explicit CountingBloomFilter(const std::string& path);
66 
69 
70  CountingBloomFilter& operator=(const CountingBloomFilter&) = delete;
71  CountingBloomFilter& operator=(CountingBloomFilter&&) = delete;
72 
80  void insert(const uint64_t* hashes, T n = 1);
81 
88  void insert(const std::vector<uint64_t>& hashes, T n = 1)
89  {
90  insert(hashes.data(), n);
91  }
92 
99  void remove(const uint64_t* hashes);
100 
106  void remove(const std::vector<uint64_t>& hashes) { remove(hashes.data()); }
107 
114  void clear(const uint64_t* hashes);
115 
121  void clear(const std::vector<uint64_t>& hashes) { clear(hashes.data()); }
122 
131  T contains(const uint64_t* hashes) const;
132 
140  T contains(const std::vector<uint64_t>& hashes) const
141  {
142  return contains(hashes.data());
143  }
144 
154  T contains_insert(const uint64_t* hashes, T n = 1);
155 
164  T contains_insert(const std::vector<uint64_t>& hashes, T n = 1)
165  {
166  return contains_insert(hashes.data(), n);
167  }
168 
179  T insert_contains(const uint64_t* hashes, T n = 1);
180 
189  T insert_contains(const std::vector<uint64_t>& hashes, T n = 1)
190  {
191  return insert_contains(hashes.data(), n);
192  }
193 
205  T insert_thresh_contains(const uint64_t* hashes, T threshold);
206 
218  T insert_thresh_contains(const std::vector<uint64_t>& hashes,
219  const T threshold)
220  {
221  return insert_thresh_contains(hashes.data(), threshold);
222  }
223 
235  T contains_insert_thresh(const uint64_t* hashes, T threshold);
236 
246  T contains_insert_thresh(const std::vector<uint64_t>& hashes,
247  const T threshold)
248  {
249  return contains_insert_thresh(hashes.data(), threshold);
250  }
251 
253  size_t get_bytes() const { return bytes; }
256  uint64_t get_pop_cnt(T threshold = 1) const;
258  double get_occupancy(T threshold = 1) const;
260  unsigned get_hash_num() const { return hash_num; }
267  double get_fpr(T threshold = 1) const;
269  const std::string& get_hash_fn() const { return hash_fn; }
270 
276  void save(const std::string& path);
277 
283  static bool is_bloom_file(const std::string& path)
284  {
285  return btllib::BloomFilter::check_file_signature(
286  path, COUNTING_BLOOM_FILTER_SIGNATURE);
287  }
288 
289 private:
290  CountingBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
291 
292  void set(const uint64_t* hashes, T min_val, T new_val);
293 
294  friend class KmerCountingBloomFilter<T>;
295 
296  size_t bytes = 0;
297  size_t array_size = 0;
298  unsigned hash_num = 0;
299  std::string hash_fn;
300  std::unique_ptr<std::atomic<T>[]> array;
301 };
302 
308 template<typename T>
309 class KmerCountingBloomFilter
310 {
311 
312 public:
315 
323  KmerCountingBloomFilter(size_t bytes, unsigned hash_num, unsigned k);
324 
330  explicit KmerCountingBloomFilter(const std::string& path);
331 
334 
335  KmerCountingBloomFilter& operator=(const KmerCountingBloomFilter&) = delete;
336  KmerCountingBloomFilter& operator=(KmerCountingBloomFilter&&) = delete;
337 
344  void insert(const char* seq, size_t seq_len);
345 
351  void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
352 
360  void insert(const uint64_t* hashes, T n = 1)
361  {
362  counting_bloom_filter.insert(hashes, n);
363  }
364 
371  void insert(const std::vector<uint64_t>& hashes, T n = 1)
372  {
373  counting_bloom_filter.insert(hashes, n);
374  }
375 
382  void remove(const char* seq, size_t seq_len);
383 
389  void remove(const std::string& seq) { remove(seq.c_str(), seq.size()); }
390 
397  void remove(const uint64_t* hashes) { counting_bloom_filter.remove(hashes); }
398 
404  void remove(const std::vector<uint64_t>& hashes)
405  {
406  counting_bloom_filter.remove(hashes);
407  }
408 
415  void clear(const char* seq, size_t seq_len);
416 
422  void clear(const std::string& seq) { clear(seq.c_str(), seq.size()); }
423 
430  void clear(const uint64_t* hashes) { counting_bloom_filter.clear(hashes); }
431 
437  void clear(const std::vector<uint64_t>& hashes)
438  {
439  counting_bloom_filter.clear(hashes);
440  }
441 
450  uint64_t contains(const char* seq, size_t seq_len) const;
451 
459  uint64_t contains(const std::string& seq) const
460  {
461  return contains(seq.c_str(), seq.size());
462  }
463 
472  T contains(const uint64_t* hashes) const
473  {
474  return counting_bloom_filter.contains(hashes);
475  }
476 
484  T contains(const std::vector<uint64_t>& hashes) const
485  {
486  return counting_bloom_filter.contains(hashes);
487  }
488 
497  T contains_insert(const char* seq, size_t seq_len);
498 
506  T contains_insert(const std::string& seq)
507  {
508  return contains_insert(seq.c_str(), seq.size());
509  }
510 
520  T contains_insert(const uint64_t* hashes, T n = 1)
521  {
522  return counting_bloom_filter.contains_insert(hashes, n);
523  }
524 
533  T contains_insert(const std::vector<uint64_t>& hashes, T n = 1)
534  {
535  return counting_bloom_filter.contains_insert(hashes, n);
536  }
537 
546  T insert_contains(const char* seq, size_t seq_len);
547 
555  T insert_contains(const std::string& seq)
556  {
557  return insert_contains(seq.c_str(), seq.size());
558  }
559 
570  T insert_contains(const uint64_t* hashes, T n = 1)
571  {
572  return counting_bloom_filter.insert_contains(hashes, n);
573  }
574 
583  T insert_contains(const std::vector<uint64_t>& hashes, T n = 1)
584  {
585  return counting_bloom_filter.insert_contains(hashes, n);
586  }
587 
598  T insert_thresh_contains(const char* seq, size_t seq_len, T threshold);
599 
609  T insert_thresh_contains(const std::string& seq, const T threshold)
610  {
611  return insert_thresh_contains(seq.c_str(), seq.size(), threshold);
612  }
613 
625  T insert_thresh_contains(const uint64_t* hashes, const T threshold)
626  {
627  return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
628  }
629 
641  T insert_thresh_contains(const std::vector<uint64_t>& hashes,
642  const T threshold)
643  {
644  return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
645  }
646 
657  T contains_insert_thresh(const char* seq, size_t seq_len, T threshold);
658 
668  T contains_insert_thresh(const std::string& seq, const T threshold)
669  {
670  return contains_insert_thresh(seq.c_str(), seq.size(), threshold);
671  }
672 
684  T contains_insert_thresh(const uint64_t* hashes, const T threshold)
685  {
686  return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
687  }
688 
698  T contains_insert_thresh(const std::vector<uint64_t>& hashes,
699  const T threshold)
700  {
701  return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
702  }
703 
705  size_t get_bytes() const { return counting_bloom_filter.get_bytes(); }
707  uint64_t get_pop_cnt(T threshold = 1) const
708  {
709  return counting_bloom_filter.get_pop_cnt(threshold);
710  }
712  double get_occupancy(T threshold = 1) const
713  {
714  return counting_bloom_filter.get_occupancy(threshold);
715  }
717  unsigned get_hash_num() const { return counting_bloom_filter.get_hash_num(); }
724  double get_fpr(T threshold = 1) const
725  {
726  return counting_bloom_filter.get_fpr(threshold);
727  }
729  unsigned get_k() const { return k; }
731  const std::string& get_hash_fn() const
732  {
733  return counting_bloom_filter.get_hash_fn();
734  }
737  {
738  return counting_bloom_filter;
739  }
740 
746  void save(const std::string& path);
747 
754  static bool is_bloom_file(const std::string& path)
755  {
756  return btllib::BloomFilter::check_file_signature(
757  path, KMER_COUNTING_BLOOM_FILTER_SIGNATURE);
758  }
759 
760 private:
761  KmerCountingBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
762 
763  unsigned k = 0;
764  CountingBloomFilter<T> counting_bloom_filter;
765 };
766 
767 } // namespace btllib
768 
769 #include "counting_bloom_filter-inl.hpp"
770 
771 #endif
double get_occupancy(T threshold=1) const
Definition: counting_bloom_filter.hpp:712
double get_fpr(T threshold=1) const
Definition: counting_bloom_filter-inl.hpp:191
T contains_insert(const uint64_t *hashes, T n=1)
Definition: counting_bloom_filter-inl.hpp:119
static bool is_bloom_file(const std::string &path)
Definition: counting_bloom_filter.hpp:283
double get_fpr(T threshold=1) const
Definition: counting_bloom_filter.hpp:724
void save(const std::string &path)
Definition: counting_bloom_filter-inl.hpp:232
T contains_insert_thresh(const uint64_t *hashes, T threshold)
Definition: counting_bloom_filter-inl.hpp:155
void insert(const uint64_t *hashes, T n=1)
Definition: counting_bloom_filter-inl.hpp:82
size_t get_bytes() const
Definition: counting_bloom_filter.hpp:705
T insert_thresh_contains(const uint64_t *hashes, T threshold)
Definition: counting_bloom_filter-inl.hpp:142
T contains_insert(const std::vector< uint64_t > &hashes, T n=1)
Definition: counting_bloom_filter.hpp:533
void clear(const uint64_t *hashes)
Definition: counting_bloom_filter-inl.hpp:97
Definition: counting_bloom_filter.hpp:42
void clear(const char *seq, size_t seq_len)
Definition: counting_bloom_filter-inl.hpp:289
void insert(const std::string &seq)
Definition: counting_bloom_filter.hpp:351
const std::string & get_hash_fn() const
Definition: counting_bloom_filter.hpp:731
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:641
T contains(const std::vector< uint64_t > &hashes) const
Definition: counting_bloom_filter.hpp:140
T contains_insert_thresh(const char *seq, size_t seq_len, T threshold)
Definition: counting_bloom_filter-inl.hpp:350
uint64_t get_pop_cnt(T threshold=1) const
Definition: counting_bloom_filter.hpp:707
uint64_t contains(const char *seq, size_t seq_len) const
Definition: counting_bloom_filter-inl.hpp:299
T insert_contains(const uint64_t *hashes, T n=1)
Definition: counting_bloom_filter.hpp:570
CountingBloomFilter()
Definition: counting_bloom_filter.hpp:47
T contains_insert_thresh(const std::string &seq, const T threshold)
Definition: counting_bloom_filter.hpp:668
T insert_contains(const std::string &seq)
Definition: counting_bloom_filter.hpp:555
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:698
T contains(const std::vector< uint64_t > &hashes) const
Definition: counting_bloom_filter.hpp:484
void clear(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:430
T insert_thresh_contains(const uint64_t *hashes, const T threshold)
Definition: counting_bloom_filter.hpp:625
T contains(const uint64_t *hashes) const
Definition: counting_bloom_filter-inl.hpp:105
unsigned get_hash_num() const
Definition: counting_bloom_filter.hpp:260
void insert(const uint64_t *hashes, T n=1)
Definition: counting_bloom_filter.hpp:360
void clear(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:121
void clear(const std::string &seq)
Definition: counting_bloom_filter.hpp:422
KmerCountingBloomFilter()
Definition: counting_bloom_filter.hpp:314
T insert_contains(const char *seq, size_t seq_len)
Definition: counting_bloom_filter-inl.hpp:323
Definition: counting_bloom_filter.hpp:34
T contains_insert_thresh(const uint64_t *hashes, const T threshold)
Definition: counting_bloom_filter.hpp:684
const std::string & get_hash_fn() const
Definition: counting_bloom_filter.hpp:269
CountingBloomFilter< T > & get_counting_bloom_filter()
Definition: counting_bloom_filter.hpp:736
uint64_t get_pop_cnt(T threshold=1) const
Definition: counting_bloom_filter-inl.hpp:167
T insert_contains(const std::vector< uint64_t > &hashes, T n=1)
Definition: counting_bloom_filter.hpp:583
T contains_insert(const char *seq, size_t seq_len)
Definition: counting_bloom_filter-inl.hpp:311
double get_occupancy(T threshold=1) const
Definition: counting_bloom_filter-inl.hpp:184
uint64_t contains(const std::string &seq) const
Definition: counting_bloom_filter.hpp:459
T contains_insert(const uint64_t *hashes, T n=1)
Definition: counting_bloom_filter.hpp:520
T contains_insert(const std::vector< uint64_t > &hashes, T n=1)
Definition: counting_bloom_filter.hpp:164
T insert_thresh_contains(const std::string &seq, const T threshold)
Definition: counting_bloom_filter.hpp:609
T insert_thresh_contains(const char *seq, size_t seq_len, T threshold)
Definition: counting_bloom_filter-inl.hpp:335
T insert_contains(const uint64_t *hashes, T n=1)
Definition: counting_bloom_filter-inl.hpp:130
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:246
void insert(const char *seq, size_t seq_len)
Definition: counting_bloom_filter-inl.hpp:269
void insert(const std::vector< uint64_t > &hashes, T n=1)
Definition: counting_bloom_filter.hpp:88
unsigned get_k() const
Definition: counting_bloom_filter.hpp:729
T insert_contains(const std::vector< uint64_t > &hashes, T n=1)
Definition: counting_bloom_filter.hpp:189
void clear(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:437
unsigned get_hash_num() const
Definition: counting_bloom_filter.hpp:717
size_t get_bytes() const
Definition: counting_bloom_filter.hpp:253
static bool is_bloom_file(const std::string &path)
Definition: counting_bloom_filter.hpp:754
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:218
void insert(const std::vector< uint64_t > &hashes, T n=1)
Definition: counting_bloom_filter.hpp:371
T contains(const uint64_t *hashes) const
Definition: counting_bloom_filter.hpp:472
T contains_insert(const std::string &seq)
Definition: counting_bloom_filter.hpp:506
void save(const std::string &path)
Definition: counting_bloom_filter-inl.hpp:388