btllib
Loading...
Searching...
No Matches
bloom_filter.hpp
1#ifndef BTLLIB_BLOOM_FILTER_HPP
2#define BTLLIB_BLOOM_FILTER_HPP
3
4#include "btllib/nthash.hpp"
5
6// clang-format off
7// NOLINTBEGIN llvm-include-order
8#include <limits>
9#include "cpptoml.h"
10// NOLINTEND llvm-include-order
11// clang-format on
12
13#include <atomic>
14#include <climits>
15#include <cstdint>
16#include <fstream>
17#include <memory>
18#include <string>
19#include <vector>
20
21namespace btllib {
22
23static const uint8_t BIT_MASKS[CHAR_BIT] = {
24 // NOLINT
25 0x01, 0x02, 0x04, 0x08, // NOLINT
26 0x10, 0x20, 0x40, 0x80 // NOLINT
27};
28
29static const char* const BLOOM_FILTER_SIGNATURE = "[BTLBloomFilter_v6]";
30static const char* const KMER_BLOOM_FILTER_SIGNATURE =
31 "[BTLKmerBloomFilter_v6]";
32static const char* const SEED_BLOOM_FILTER_SIGNATURE =
33 "[BTLSeedBloomFilter_v6]";
34static const char* const HASH_FN = NTHASH_FN_NAME;
35
36static const unsigned MAX_HASH_VALUES = 1024;
37static const unsigned PLACEHOLDER_NEWLINES = 50;
38
40class BloomFilterInitializer
41{
42
43public:
44 BloomFilterInitializer(const std::string& path, const std::string& signature)
45 : path(path)
46 , ifs(path)
47 , table(parse_header(signature))
48 {
49 }
50
51 static bool check_file_signature(std::ifstream& ifs,
52 const std::string& expected_signature,
53 std::string& file_signature);
54
55 std::string path;
56 std::ifstream ifs;
57 std::shared_ptr<cpptoml::table> table;
58
59 BloomFilterInitializer(const BloomFilterInitializer&) = delete;
60 BloomFilterInitializer(BloomFilterInitializer&&) = default;
61
62 BloomFilterInitializer& operator=(const BloomFilterInitializer&) = delete;
63 BloomFilterInitializer& operator=(BloomFilterInitializer&&) = default;
64
65private:
68 std::shared_ptr<cpptoml::table> parse_header(const std::string& signature);
69};
71
73{
74
75public:
78
86 BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn = "");
87
93 explicit BloomFilter(const std::string& path);
94
95 BloomFilter(const BloomFilter&) = delete;
96 BloomFilter(BloomFilter&&) = delete;
97
98 BloomFilter& operator=(const BloomFilter&) = delete;
99 BloomFilter& operator=(BloomFilter&&) = delete;
100
107 void insert(const uint64_t* hashes);
108
114 void insert(const std::vector<uint64_t>& hashes) { insert(hashes.data()); }
115
124 bool contains(const uint64_t* hashes) const;
125
133 bool contains(const std::vector<uint64_t>& hashes) const
134 {
135 return contains(hashes.data());
136 }
137
146 bool contains_insert(const uint64_t* hashes);
147
155 bool contains_insert(const std::vector<uint64_t>& hashes)
156 {
157 return contains_insert(hashes.data());
158 }
159
161 size_t get_bytes() const { return bytes; }
163 uint64_t get_pop_cnt() const;
165 double get_occupancy() const;
167 unsigned get_hash_num() const { return hash_num; }
169 double get_fpr() const;
171 const std::string& get_hash_fn() const { return hash_fn; }
172
178 void save(const std::string& path);
179
180 static void save(const std::string& path,
181 const cpptoml::table& table,
182 const char* data,
183 size_t n);
184
190 static bool is_bloom_file(const std::string& path)
191 {
192 return check_file_signature(path, BLOOM_FILTER_SIGNATURE);
193 }
194
195 static bool check_file_signature(const std::string& path,
196 const std::string& signature);
197
198private:
199 BloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
200
201 friend class KmerBloomFilter;
202 friend class SeedBloomFilter;
203
204 size_t bytes = 0;
205 size_t array_size =
206 0; // Should be equal to bytes, but not guaranteed by standard
207 size_t array_bits = 0;
208 unsigned hash_num = 0;
209 std::string hash_fn;
210 std::unique_ptr<std::atomic<uint8_t>[]> array;
211};
212
217{
218
219public:
222
230 KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k);
231
237 explicit KmerBloomFilter(const std::string& path);
238
239 KmerBloomFilter(const KmerBloomFilter&) = delete;
241
242 KmerBloomFilter& operator=(const KmerBloomFilter&) = delete;
243 KmerBloomFilter& operator=(KmerBloomFilter&&) = delete;
244
251 void insert(const char* seq, size_t seq_len);
252
258 void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
259
266 void insert(const uint64_t* hashes) { bloom_filter.insert(hashes); }
267
273 void insert(const std::vector<uint64_t>& hashes)
274 {
275 bloom_filter.insert(hashes);
276 }
277
286 unsigned contains(const char* seq, size_t seq_len) const;
287
295 unsigned contains(const std::string& seq) const
296 {
297 return contains(seq.c_str(), seq.size());
298 }
299
306 bool contains(const uint64_t* hashes) const
307 {
308 return bloom_filter.contains(hashes);
309 }
310
316 bool contains(const std::vector<uint64_t>& hashes) const
317 {
318 return bloom_filter.contains(hashes);
319 }
320
329 unsigned contains_insert(const char* seq, size_t seq_len);
330
338 unsigned contains_insert(const std::string& seq)
339 {
340 return contains_insert(seq.c_str(), seq.size());
341 }
342
351 bool contains_insert(const uint64_t* hashes)
352 {
353 return bloom_filter.contains_insert(hashes);
354 }
355
363 bool contains_insert(const std::vector<uint64_t>& hashes)
364 {
365 return bloom_filter.contains_insert(hashes);
366 }
367
369 size_t get_bytes() const { return bloom_filter.get_bytes(); }
371 uint64_t get_pop_cnt() const { return bloom_filter.get_pop_cnt(); }
373 double get_occupancy() const { return bloom_filter.get_occupancy(); }
375 unsigned get_hash_num() const { return bloom_filter.get_hash_num(); }
377 double get_fpr() const { return bloom_filter.get_fpr(); }
379 unsigned get_k() const { return k; }
381 const std::string& get_hash_fn() const { return bloom_filter.get_hash_fn(); }
383 BloomFilter& get_bloom_filter() { return bloom_filter; }
384
390 void save(const std::string& path);
391
397 static bool is_bloom_file(const std::string& path)
398 {
399 return btllib::BloomFilter::check_file_signature(
400 path, KMER_BLOOM_FILTER_SIGNATURE);
401 }
402
403private:
404 KmerBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
405
406 friend class SeedBloomFilter;
407
408 unsigned k = 0;
409 BloomFilter bloom_filter;
410};
411
416{
417
418public:
421
430 SeedBloomFilter(size_t bytes,
431 unsigned k,
432 const std::vector<std::string>& seeds,
433 unsigned hash_num_per_seed);
434
440 explicit SeedBloomFilter(const std::string& path);
441
442 SeedBloomFilter(const SeedBloomFilter&) = delete;
444
445 SeedBloomFilter& operator=(const SeedBloomFilter&) = delete;
446 SeedBloomFilter& operator=(SeedBloomFilter&&) = delete;
447
454 void insert(const char* seq, size_t seq_len);
455
461 void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
462
469 void insert(const uint64_t* hashes) { kmer_bloom_filter.insert(hashes); }
470
476 void insert(const std::vector<uint64_t>& hashes)
477 {
478 kmer_bloom_filter.insert(hashes);
479 }
480
491 std::vector<std::vector<unsigned>> contains(const char* seq,
492 size_t seq_len) const;
493
503 std::vector<std::vector<unsigned>> contains(const std::string& seq) const
504 {
505 return contains(seq.c_str(), seq.size());
506 }
507
515 bool contains(const uint64_t* hashes) const
516 {
517 return kmer_bloom_filter.contains(hashes);
518 }
519
526 bool contains(const std::vector<uint64_t>& hashes) const
527 {
528 return kmer_bloom_filter.contains(hashes);
529 }
530
542 std::vector<std::vector<unsigned>> contains_insert(const char* seq,
543 size_t seq_len);
544
555 std::vector<std::vector<unsigned>> contains_insert(const std::string& seq)
556 {
557 return contains_insert(seq.c_str(), seq.size());
558 }
559
569 bool contains_insert(const uint64_t* hashes)
570 {
571 return kmer_bloom_filter.contains_insert(hashes);
572 }
573
582 bool contains_insert(const std::vector<uint64_t>& hashes)
583 {
584 return kmer_bloom_filter.contains_insert(hashes);
585 }
586
588 size_t get_bytes() const { return kmer_bloom_filter.get_bytes(); }
590 uint64_t get_pop_cnt() const { return kmer_bloom_filter.get_pop_cnt(); }
592 double get_occupancy() const { return kmer_bloom_filter.get_occupancy(); }
595 unsigned get_total_hash_num() const
596 {
597 return get_hash_num_per_seed() * get_seeds().size();
598 }
601 double get_fpr() const;
603 unsigned get_k() const { return kmer_bloom_filter.get_k(); }
605 const std::vector<std::string>& get_seeds() const { return seeds; }
608 const std::vector<btllib::hashing_internals::SpacedSeed>& get_parsed_seeds()
609 const
610 {
611 return parsed_seeds;
612 }
614 unsigned get_hash_num_per_seed() const
615 {
616 return kmer_bloom_filter.get_hash_num();
617 }
619 unsigned get_hash_num() const { return get_hash_num_per_seed(); }
621 const std::string& get_hash_fn() const
622 {
623 return kmer_bloom_filter.get_hash_fn();
624 }
626 KmerBloomFilter& get_kmer_bloom_filter() { return kmer_bloom_filter; }
627
633 void save(const std::string& path);
634
640 static bool is_bloom_file(const std::string& path)
641 {
642 return btllib::BloomFilter::check_file_signature(
643 path, SEED_BLOOM_FILTER_SIGNATURE);
644 }
645
646private:
647 SeedBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
648
649 std::vector<std::string> seeds;
650 std::vector<btllib::hashing_internals::SpacedSeed> parsed_seeds;
651 KmerBloomFilter kmer_bloom_filter;
652};
653
654} // namespace btllib
655
656#endif
Definition bloom_filter.hpp:73
bool contains(const uint64_t *hashes) const
bool contains(const std::vector< uint64_t > &hashes) const
Definition bloom_filter.hpp:133
void insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:114
void insert(const uint64_t *hashes)
static bool is_bloom_file(const std::string &path)
Definition bloom_filter.hpp:190
double get_fpr() const
const std::string & get_hash_fn() const
Definition bloom_filter.hpp:171
unsigned get_hash_num() const
Definition bloom_filter.hpp:167
BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn="")
void save(const std::string &path)
size_t get_bytes() const
Definition bloom_filter.hpp:161
double get_occupancy() const
bool contains_insert(const uint64_t *hashes)
BloomFilter()
Definition bloom_filter.hpp:77
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:155
uint64_t get_pop_cnt() const
BloomFilter(const std::string &path)
Definition bloom_filter.hpp:217
void insert(const char *seq, size_t seq_len)
unsigned contains_insert(const char *seq, size_t seq_len)
double get_fpr() const
Definition bloom_filter.hpp:377
BloomFilter & get_bloom_filter()
Definition bloom_filter.hpp:383
void insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:273
void insert(const std::string &seq)
Definition bloom_filter.hpp:258
unsigned get_hash_num() const
Definition bloom_filter.hpp:375
static bool is_bloom_file(const std::string &path)
Definition bloom_filter.hpp:397
unsigned contains(const char *seq, size_t seq_len) const
unsigned contains_insert(const std::string &seq)
Definition bloom_filter.hpp:338
KmerBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition bloom_filter.hpp:371
bool contains_insert(const uint64_t *hashes)
Definition bloom_filter.hpp:351
KmerBloomFilter()
Definition bloom_filter.hpp:221
bool contains(const uint64_t *hashes) const
Definition bloom_filter.hpp:306
void insert(const uint64_t *hashes)
Definition bloom_filter.hpp:266
const std::string & get_hash_fn() const
Definition bloom_filter.hpp:381
void save(const std::string &path)
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:363
size_t get_bytes() const
Definition bloom_filter.hpp:369
double get_occupancy() const
Definition bloom_filter.hpp:373
unsigned contains(const std::string &seq) const
Definition bloom_filter.hpp:295
unsigned get_k() const
Definition bloom_filter.hpp:379
bool contains(const std::vector< uint64_t > &hashes) const
Definition bloom_filter.hpp:316
KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k)
Definition bloom_filter.hpp:416
unsigned get_total_hash_num() const
Definition bloom_filter.hpp:595
double get_occupancy() const
Definition bloom_filter.hpp:592
bool contains(const uint64_t *hashes) const
Definition bloom_filter.hpp:515
std::vector< std::vector< unsigned > > contains_insert(const std::string &seq)
Definition bloom_filter.hpp:555
void insert(const char *seq, size_t seq_len)
bool contains(const std::vector< uint64_t > &hashes) const
Definition bloom_filter.hpp:526
std::vector< std::vector< unsigned > > contains_insert(const char *seq, size_t seq_len)
void save(const std::string &path)
SeedBloomFilter(size_t bytes, unsigned k, const std::vector< std::string > &seeds, unsigned hash_num_per_seed)
KmerBloomFilter & get_kmer_bloom_filter()
Definition bloom_filter.hpp:626
void insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:476
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:582
static bool is_bloom_file(const std::string &path)
Definition bloom_filter.hpp:640
unsigned get_hash_num_per_seed() const
Definition bloom_filter.hpp:614
const std::vector< btllib::hashing_internals::SpacedSeed > & get_parsed_seeds() const
Definition bloom_filter.hpp:608
SeedBloomFilter(const std::string &path)
double get_fpr() const
uint64_t get_pop_cnt() const
Definition bloom_filter.hpp:590
size_t get_bytes() const
Definition bloom_filter.hpp:588
unsigned get_k() const
Definition bloom_filter.hpp:603
void insert(const uint64_t *hashes)
Definition bloom_filter.hpp:469
const std::vector< std::string > & get_seeds() const
Definition bloom_filter.hpp:605
const std::string & get_hash_fn() const
Definition bloom_filter.hpp:621
std::vector< std::vector< unsigned > > contains(const char *seq, size_t seq_len) const
SeedBloomFilter()
Definition bloom_filter.hpp:420
unsigned get_hash_num() const
Definition bloom_filter.hpp:619
std::vector< std::vector< unsigned > > contains(const std::string &seq) const
Definition bloom_filter.hpp:503
void insert(const std::string &seq)
Definition bloom_filter.hpp:461
bool contains_insert(const uint64_t *hashes)
Definition bloom_filter.hpp:569
Definition aahash.hpp:12