1#ifndef BTLLIB_COUNTING_BLOOM_FILTER_INL_HPP
2#define BTLLIB_COUNTING_BLOOM_FILTER_INL_HPP
4#include "btllib/bloom_filter.hpp"
5#include "btllib/counting_bloom_filter.hpp"
6#include "btllib/nthash.hpp"
7#include "btllib/status.hpp"
22using CountingBloomFilter8 = CountingBloomFilter<uint8_t>;
23using CountingBloomFilter16 = CountingBloomFilter<uint16_t>;
24using CountingBloomFilter32 = CountingBloomFilter<uint32_t>;
26using KmerCountingBloomFilter8 = KmerCountingBloomFilter<uint8_t>;
27using KmerCountingBloomFilter16 = KmerCountingBloomFilter<uint16_t>;
28using KmerCountingBloomFilter32 = KmerCountingBloomFilter<uint32_t>;
35 size_t(std::ceil(double(bytes) / sizeof(uint64_t)) * sizeof(uint64_t)))
36 , array_size(get_bytes() / sizeof(array[0]))
38 , hash_fn(std::move(hash_fn))
39 , array(new std::atomic<T>[array_size])
41 check_error(bytes == 0,
"CountingBloomFilter: memory budget must be >0!");
43 "CountingBloomFilter: number of hash values must be >0!");
45 hash_num > MAX_HASH_VALUES,
46 "CountingBloomFilter: number of hash values cannot be over 1024!");
47 check_warning(
sizeof(uint8_t) !=
sizeof(std::atomic<uint8_t>),
48 "Atomic primitives take extra memory. CountingBloomFilter will "
50 std::to_string(bytes) +
" for bit array.");
51 std::memset((
void*)array.get(), 0, array_size *
sizeof(array[0]));
62 bool update_done =
false;
65 for (
size_t i = 0; i < hash_num; ++i) {
66 tmp_min_val = min_val;
67 update_done |= array[hashes[i] % array_size].compare_exchange_strong(
68 tmp_min_val, new_val);
73 min_val = contains(hashes);
74 if (min_val == std::numeric_limits<T>::max()) {
84 contains_insert(hashes, n);
91 T min_val = contains(hashes);
92 set(hashes, min_val, min_val > 1 ? min_val - 1 : 0);
99 T min_val = contains(hashes);
100 set(hashes, min_val, 0);
107 T min = array[hashes[0] % array_size];
108 for (
size_t i = 1; i < hash_num; ++i) {
109 const size_t idx = hashes[i] % array_size;
110 if (array[idx] < min) {
121 const auto count = contains(hashes);
122 if (count <= std::numeric_limits<T>::max() - n) {
123 set(hashes, count, count + n);
132 const auto count = contains(hashes);
133 if (count <= std::numeric_limits<T>::max() + n) {
134 set(hashes, count, count + n);
137 return std::numeric_limits<T>::max();
145 const auto count = contains(hashes);
146 if (count < threshold) {
147 set(hashes, count, count + 1);
158 const auto count = contains(hashes);
159 if (count < threshold) {
160 set(hashes, count, count + 1);
169 uint64_t pop_cnt = 0;
173#pragma omp parallel for reduction(+ : pop_cnt)
174 for (
size_t i = 0; i < array_size; ++i) {
175 if (array[i] >= threshold) {
186 return double(get_pop_cnt(threshold)) / double(array_size);
193 return std::pow(get_occupancy(threshold),
double(hash_num));
199 std::make_shared<BloomFilterInitializer>(path,
200 COUNTING_BLOOM_FILTER_SIGNATURE))
206 const std::shared_ptr<BloomFilterInitializer>& bfi)
207 : bytes(*bfi->table->get_as<decltype(bytes)>(
"bytes"))
208 , array_size(bytes / sizeof(array[0]))
209 , hash_num(*(bfi->table->get_as<decltype(hash_num)>(
"hash_num")))
210 , hash_fn(bfi->table->contains(
"hash_fn")
211 ? *(bfi->table->get_as<decltype(hash_fn)>(
"hash_fn"))
213 , array(new std::atomic<T>[array_size])
215 check_warning(
sizeof(uint8_t) !=
sizeof(std::atomic<uint8_t>),
216 "Atomic primitives take extra memory. CountingBloomFilter will "
218 std::to_string(bytes) +
" for bit array.");
219 const auto loaded_counter_bits =
220 *(bfi->table->get_as<
size_t>(
"counter_bits"));
221 check_error(
sizeof(array[0]) * CHAR_BIT != loaded_counter_bits,
222 "CountingBloomFilter" +
223 std::to_string(
sizeof(array[0]) * CHAR_BIT) +
224 " tried to load a file of CountingBloomFilter" +
225 std::to_string(loaded_counter_bits));
226 bfi->ifs.read((
char*)array.get(),
227 std::streamsize(array_size *
sizeof(array[0])));
238 auto root = cpptoml::make_table();
242 auto header = cpptoml::make_table();
243 header->insert(
"bytes", get_bytes());
244 header->insert(
"hash_num", get_hash_num());
245 if (!hash_fn.empty()) {
246 header->insert(
"hash_fn", hash_fn);
248 header->insert(
"counter_bits",
size_t(
sizeof(array[0]) * CHAR_BIT));
249 std::string header_string = COUNTING_BLOOM_FILTER_SIGNATURE;
251 header_string.substr(1, header_string.size() - 2);
252 root->insert(header_string, header);
255 path, *root, (
char*)array.get(), array_size *
sizeof(array[0]));
263 , counting_bloom_filter(bytes, hash_num, HASH_FN)
271 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
272 while (nthash.
roll()) {
273 counting_bloom_filter.insert(nthash.
hashes());
281 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
282 while (nthash.
roll()) {
283 counting_bloom_filter.remove(nthash.
hashes());
291 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
292 while (nthash.
roll()) {
293 counting_bloom_filter.clear(nthash.
hashes());
302 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
303 while (nthash.
roll()) {
304 sum += counting_bloom_filter.contains(nthash.
hashes());
314 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
315 while (nthash.
roll()) {
316 sum += counting_bloom_filter.contains_insert(nthash.
hashes());
326 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
327 while (nthash.
roll()) {
328 sum += counting_bloom_filter.insert_contains(nthash.
hashes());
340 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
341 while (nthash.
roll()) {
343 counting_bloom_filter.insert_thresh_contains(nthash.
hashes(), threshold);
355 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
356 while (nthash.
roll()) {
358 counting_bloom_filter.contains_insert_thresh(nthash.
hashes(), threshold);
365 const std::string& path)
367 std::make_shared<BloomFilterInitializer>(
369 KMER_COUNTING_BLOOM_FILTER_SIGNATURE))
375 const std::shared_ptr<BloomFilterInitializer>& bfi)
376 : k(*(bfi->table->get_as<decltype(k)>(
"k")))
377 , counting_bloom_filter(bfi)
379 check_error(counting_bloom_filter.hash_fn != HASH_FN,
380 "KmerCountingBloomFilter: loaded hash function (" +
381 counting_bloom_filter.hash_fn +
382 ") is different from the one used by default (" + HASH_FN +
394 auto root = cpptoml::make_table();
398 auto header = cpptoml::make_table();
399 header->insert(
"bytes", get_bytes());
400 header->insert(
"hash_num", get_hash_num());
401 header->insert(
"hash_fn", get_hash_fn());
402 header->insert(
"counter_bits",
403 size_t(
sizeof(counting_bloom_filter.array[0]) * CHAR_BIT));
404 header->insert(
"k", k);
405 std::string header_string = KMER_COUNTING_BLOOM_FILTER_SIGNATURE;
407 header_string.substr(1, header_string.size() - 2);
408 root->insert(header_string, header);
412 (
char*)counting_bloom_filter.array.get(),
413 counting_bloom_filter.array_size *
414 sizeof(counting_bloom_filter.array[0]));
void save(const std::string &path)
Definition counting_bloom_filter.hpp:43
void remove(const uint64_t *hashes)
Definition counting_bloom_filter-inl.hpp:89
double get_occupancy(T threshold=1) const
Definition counting_bloom_filter-inl.hpp:184
void insert(const uint64_t *hashes, T n=1)
Definition counting_bloom_filter-inl.hpp:82
double get_fpr(T threshold=1) const
Definition counting_bloom_filter-inl.hpp:191
uint64_t get_pop_cnt(T threshold=1) const
Definition counting_bloom_filter-inl.hpp:167
void save(const std::string &path)
Definition counting_bloom_filter-inl.hpp:232
T contains_insert_thresh(const uint64_t *hashes, T threshold)
Definition counting_bloom_filter-inl.hpp:155
T contains(const uint64_t *hashes) const
Definition counting_bloom_filter-inl.hpp:105
CountingBloomFilter()
Definition counting_bloom_filter.hpp:47
void clear(const uint64_t *hashes)
Definition counting_bloom_filter-inl.hpp:97
T insert_contains(const uint64_t *hashes, T n=1)
Definition counting_bloom_filter-inl.hpp:130
T contains_insert(const uint64_t *hashes, T n=1)
Definition counting_bloom_filter-inl.hpp:119
T insert_thresh_contains(const uint64_t *hashes, T threshold)
Definition counting_bloom_filter-inl.hpp:142
Definition counting_bloom_filter.hpp:310
uint64_t contains(const char *seq, size_t seq_len) const
Definition counting_bloom_filter-inl.hpp:299
void clear(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:289
void save(const std::string &path)
Definition counting_bloom_filter-inl.hpp:388
T insert_contains(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:323
T insert_thresh_contains(const char *seq, size_t seq_len, T threshold)
Definition counting_bloom_filter-inl.hpp:335
void remove(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:279
T contains_insert_thresh(const char *seq, size_t seq_len, T threshold)
Definition counting_bloom_filter-inl.hpp:350
void insert(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:269
T contains_insert(const char *seq, size_t seq_len)
Definition counting_bloom_filter-inl.hpp:311
KmerCountingBloomFilter()
Definition counting_bloom_filter.hpp:314
Definition nthash_kmer.hpp:237
bool roll()
Definition nthash_kmer.hpp:315
const uint64_t * hashes() const
Definition nthash_kmer.hpp:443
void check_error(bool condition, const std::string &msg)
void check_warning(bool condition, const std::string &msg)