1#ifndef BTLLIB_BLOOM_FILTER_HPP
2#define BTLLIB_BLOOM_FILTER_HPP
4#include "btllib/nthash.hpp"
23static const uint8_t BIT_MASKS[CHAR_BIT] = {
25 0x01, 0x02, 0x04, 0x08,
26 0x10, 0x20, 0x40, 0x80
/// Signature string for plain Bloom filter files; elsewhere in this file it is
/// passed to check_file_signature() together with a path.
29static const char*
const BLOOM_FILTER_SIGNATURE =
"[BTLBloomFilter_v6]";
/// Signature string for k-mer Bloom filter files; passed to
/// btllib::BloomFilter::check_file_signature() elsewhere in this file.
30static const char*
const KMER_BLOOM_FILTER_SIGNATURE =
31 "[BTLKmerBloomFilter_v6]";
/// Signature string for seed Bloom filter files; passed to
/// btllib::BloomFilter::check_file_signature() elsewhere in this file.
32static const char*
const SEED_BLOOM_FILTER_SIGNATURE =
33 "[BTLSeedBloomFilter_v6]";
/// Hash function name, taken from NTHASH_FN_NAME.
/// NOTE(review): NTHASH_FN_NAME is not defined in this view; presumably it
/// comes from "btllib/nthash.hpp" -- confirm.
34static const char*
const HASH_FN = NTHASH_FN_NAME;
/// Constant 1024. NOTE(review): no use site is visible here; presumably an
/// upper bound on the number of hash values handled per element -- confirm.
36static const unsigned MAX_HASH_VALUES = 1024;
/// Constant 50. NOTE(review): no use site is visible here; purpose to be
/// confirmed against the implementation.
37static const unsigned PLACEHOLDER_NEWLINES = 50;
40class BloomFilterInitializer
44 BloomFilterInitializer(
const std::string& path,
const std::string& signature)
47 , table(parse_header(signature))
/// Declaration only; the definition is not part of this view.
/// @param ifs                input file stream to examine
/// @param expected_signature signature the caller expects
/// @param file_signature     non-const reference; presumably receives the
///                           signature actually read from the stream -- confirm
/// @return bool; presumably true when the signatures match -- confirm
51 static bool check_file_signature(std::ifstream& ifs,
52 const std::string& expected_signature,
53 std::string& file_signature);
57 std::shared_ptr<cpptoml::table> table;
/// Copy construction is disabled.
59 BloomFilterInitializer(
const BloomFilterInitializer&) =
delete;
/// Move construction uses the compiler-generated default.
60 BloomFilterInitializer(BloomFilterInitializer&&) =
default;
/// Copy assignment is disabled.
62 BloomFilterInitializer& operator=(
const BloomFilterInitializer&) =
delete;
/// Move assignment uses the compiler-generated default.
63 BloomFilterInitializer& operator=(BloomFilterInitializer&&) =
default;
/// Declaration only. The constructor's initializer list calls
/// parse_header(signature) to initialize the `table` member.
/// @param signature signature string forwarded from the constructor
/// @return shared pointer to a cpptoml::table
/// NOTE(review): presumably the table holds the parsed file header -- confirm
/// against the definition.
68 std::shared_ptr<cpptoml::table> parse_header(
const std::string& signature);
/// Constructor declaration; the definition is not part of this view.
/// @param bytes    size argument (presumably the filter size in bytes -- confirm)
/// @param hash_num number of hash values (presumably per inserted element --
///                 confirm)
/// @param hash_fn  hash function name; defaults to the empty string
86 BloomFilter(
size_t bytes,
unsigned hash_num, std::string hash_fn =
"");
/// Vector overload of insert: forwards the vector's underlying array
/// (hashes.data()) to the pointer-taking insert overload.
/// NOTE(review): the vector's size is not passed along; presumably the callee
/// reads hash_num values, so callers must supply at least that many -- confirm.
114 void insert(
const std::vector<uint64_t>& hashes) {
insert(hashes.data()); }
133 bool contains(
const std::vector<uint64_t>& hashes)
const
/// Declaration only; the definition is not part of this view.
/// @param path destination path (presumably the file the filter is written
///             to -- confirm)
178 void save(
const std::string& path);
180 static void save(
const std::string& path,
181 const cpptoml::table& table,
192 return check_file_signature(path, BLOOM_FILTER_SIGNATURE);
/// Declaration only. Elsewhere in this file it is called with a path and one
/// of the *_BLOOM_FILTER_SIGNATURE constants, and its result is returned
/// directly by the caller.
/// @param path      path of the file to examine
/// @param signature signature to compare against
/// @return bool; presumably true when the file carries that signature --
///         confirm
195 static bool check_file_signature(
const std::string& path,
196 const std::string& signature);
/// Constructor declaration taking a shared BloomFilterInitializer by const
/// reference; the definition is not part of this view.
/// NOTE(review): presumably used to build a filter from an already-parsed
/// file -- confirm.
199 BloomFilter(
const std::shared_ptr<BloomFilterInitializer>& bfi);
/// Bit-count member, zero by default (presumably the number of bits in
/// `array` -- confirm).
207 size_t array_bits = 0;
/// Number of hash values, zero by default.
208 unsigned hash_num = 0;
/// Uniquely-owned array of atomic bytes.
210 std::unique_ptr<std::atomic<uint8_t>[]> array;
/// Declaration only; the definition is not part of this view.
/// @param seq     pointer to the sequence characters
/// @param seq_len length argument (presumably the number of characters in
///                seq -- confirm)
251 void insert(
const char* seq,
size_t seq_len);
/// String overload of insert: forwards seq.c_str() and seq.size() to the
/// (const char*, size_t) overload.
258 void insert(
const std::string& seq) {
insert(seq.c_str(), seq.size()); }
273 void insert(
const std::vector<uint64_t>& hashes)
275 bloom_filter.
insert(hashes);
/// Declaration only; const member, definition not part of this view.
/// @param seq     pointer to the sequence characters
/// @param seq_len length argument (presumably the number of characters in
///                seq -- confirm)
/// @return unsigned; presumably a count of matching k-mers -- confirm
286 unsigned contains(
const char* seq,
size_t seq_len)
const;
297 return contains(seq.c_str(), seq.size());
308 return bloom_filter.
contains(hashes);
316 bool contains(
const std::vector<uint64_t>& hashes)
const
318 return bloom_filter.
contains(hashes);
/// Accessor: returns the value of the member `k`.
379 unsigned get_k()
const {
return k; }
/// Declaration only; the definition is not part of this view.
/// @param path destination path (presumably the file the filter is written
///             to -- confirm)
390 void save(
const std::string& path);
399 return btllib::BloomFilter::check_file_signature(
400 path, KMER_BLOOM_FILTER_SIGNATURE);
432 const std::vector<std::string>& seeds,
433 unsigned hash_num_per_seed);
/// Declaration only; the definition is not part of this view.
/// @param seq     pointer to the sequence characters
/// @param seq_len length argument (presumably the number of characters in
///                seq -- confirm)
454 void insert(
const char* seq,
size_t seq_len);
/// String overload of insert: forwards seq.c_str() and seq.size() to the
/// (const char*, size_t) overload.
461 void insert(
const std::string& seq) {
insert(seq.c_str(), seq.size()); }
/// Pointer overload of insert: passes the hash array straight through to
/// kmer_bloom_filter.insert().
/// NOTE(review): no length is passed; the number of values read is decided by
/// the callee -- confirm what callers must provide.
469 void insert(
const uint64_t* hashes) { kmer_bloom_filter.
insert(hashes); }
476 void insert(
const std::vector<uint64_t>& hashes)
478 kmer_bloom_filter.
insert(hashes);
491 std::vector<std::vector<unsigned>>
contains(
const char* seq,
492 size_t seq_len)
const;
503 std::vector<std::vector<unsigned>>
contains(
const std::string& seq)
const
505 return contains(seq.c_str(), seq.size());
517 return kmer_bloom_filter.
contains(hashes);
526 bool contains(
const std::vector<uint64_t>& hashes)
const
528 return kmer_bloom_filter.
contains(hashes);
/// Accessor: returns a const reference to the `seeds` member (no copy made).
605 const std::vector<std::string>&
get_seeds()
const {
return seeds; }
/// Declaration only; the definition is not part of this view.
/// @param path destination path (presumably the file the filter is written
///             to -- confirm)
633 void save(
const std::string& path);
642 return btllib::BloomFilter::check_file_signature(
643 path, SEED_BLOOM_FILTER_SIGNATURE);
/// Seed strings; exposed read-only through get_seeds().
649 std::vector<std::string> seeds;
/// SpacedSeed objects (presumably the parsed form of `seeds` -- confirm).
650 std::vector<btllib::hashing_internals::SpacedSeed> parsed_seeds;
Definition bloom_filter.hpp:73
bool contains(const uint64_t *hashes) const
bool contains(const std::vector< uint64_t > &hashes) const
Definition bloom_filter.hpp:133
void insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:114
void insert(const uint64_t *hashes)
static bool is_bloom_file(const std::string &path)
Definition bloom_filter.hpp:190
const std::string & get_hash_fn() const
Definition bloom_filter.hpp:171
unsigned get_hash_num() const
Definition bloom_filter.hpp:167
BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn="")
void save(const std::string &path)
size_t get_bytes() const
Definition bloom_filter.hpp:161
double get_occupancy() const
bool contains_insert(const uint64_t *hashes)
BloomFilter()
Definition bloom_filter.hpp:77
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:155
uint64_t get_pop_cnt() const
BloomFilter(const std::string &path)
Definition bloom_filter.hpp:217
void insert(const char *seq, size_t seq_len)
unsigned contains_insert(const char *seq, size_t seq_len)
double get_fpr() const
Definition bloom_filter.hpp:377
BloomFilter & get_bloom_filter()
Definition bloom_filter.hpp:383
void insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:273
void insert(const std::string &seq)
Definition bloom_filter.hpp:258
unsigned get_hash_num() const
Definition bloom_filter.hpp:375
static bool is_bloom_file(const std::string &path)
Definition bloom_filter.hpp:397
unsigned contains(const char *seq, size_t seq_len) const
unsigned contains_insert(const std::string &seq)
Definition bloom_filter.hpp:338
KmerBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition bloom_filter.hpp:371
bool contains_insert(const uint64_t *hashes)
Definition bloom_filter.hpp:351
KmerBloomFilter()
Definition bloom_filter.hpp:221
bool contains(const uint64_t *hashes) const
Definition bloom_filter.hpp:306
void insert(const uint64_t *hashes)
Definition bloom_filter.hpp:266
const std::string & get_hash_fn() const
Definition bloom_filter.hpp:381
void save(const std::string &path)
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:363
size_t get_bytes() const
Definition bloom_filter.hpp:369
double get_occupancy() const
Definition bloom_filter.hpp:373
unsigned contains(const std::string &seq) const
Definition bloom_filter.hpp:295
unsigned get_k() const
Definition bloom_filter.hpp:379
bool contains(const std::vector< uint64_t > &hashes) const
Definition bloom_filter.hpp:316
KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k)
Definition bloom_filter.hpp:416
unsigned get_total_hash_num() const
Definition bloom_filter.hpp:595
double get_occupancy() const
Definition bloom_filter.hpp:592
bool contains(const uint64_t *hashes) const
Definition bloom_filter.hpp:515
std::vector< std::vector< unsigned > > contains_insert(const std::string &seq)
Definition bloom_filter.hpp:555
void insert(const char *seq, size_t seq_len)
bool contains(const std::vector< uint64_t > &hashes) const
Definition bloom_filter.hpp:526
std::vector< std::vector< unsigned > > contains_insert(const char *seq, size_t seq_len)
void save(const std::string &path)
SeedBloomFilter(size_t bytes, unsigned k, const std::vector< std::string > &seeds, unsigned hash_num_per_seed)
KmerBloomFilter & get_kmer_bloom_filter()
Definition bloom_filter.hpp:626
void insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:476
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition bloom_filter.hpp:582
static bool is_bloom_file(const std::string &path)
Definition bloom_filter.hpp:640
unsigned get_hash_num_per_seed() const
Definition bloom_filter.hpp:614
const std::vector< btllib::hashing_internals::SpacedSeed > & get_parsed_seeds() const
Definition bloom_filter.hpp:608
SeedBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition bloom_filter.hpp:590
size_t get_bytes() const
Definition bloom_filter.hpp:588
unsigned get_k() const
Definition bloom_filter.hpp:603
void insert(const uint64_t *hashes)
Definition bloom_filter.hpp:469
const std::vector< std::string > & get_seeds() const
Definition bloom_filter.hpp:605
const std::string & get_hash_fn() const
Definition bloom_filter.hpp:621
std::vector< std::vector< unsigned > > contains(const char *seq, size_t seq_len) const
SeedBloomFilter()
Definition bloom_filter.hpp:420
unsigned get_hash_num() const
Definition bloom_filter.hpp:619
std::vector< std::vector< unsigned > > contains(const std::string &seq) const
Definition bloom_filter.hpp:503
void insert(const std::string &seq)
Definition bloom_filter.hpp:461
bool contains_insert(const uint64_t *hashes)
Definition bloom_filter.hpp:569