1#ifndef BTLLIB_MI_BLOOM_FILTER_INL_HPP
2#define BTLLIB_MI_BLOOM_FILTER_INL_HPP
4#include "btllib/mi_bloom_filter.hpp"
5#include "btllib/nthash.hpp"
6#include "btllib/status.hpp"
13#include <sdsl/bit_vector_il.hpp>
14#include <sdsl/rank_support.hpp>
18MIBloomFilterInitializer::check_file_signature(
20 const std::string& expected_signature,
21 std::string& file_signature)
23 std::getline(ifs, file_signature);
24 return file_signature == expected_signature;
27std::shared_ptr<cpptoml::table>
28MIBloomFilterInitializer::parse_header(
const std::string& expected_signature)
32 "MIBloomFilterInitializer: failed to open " + path);
34 std::string file_signature;
35 if (!check_file_signature(ifs_id_arr, expected_signature, file_signature)) {
36 log_error(std::string(
"File signature does not match (possibly version "
37 "mismatch) for file:\n") +
38 path +
'\n' +
"Expected signature:\t" + expected_signature +
39 '\n' +
"File signature: \t" + file_signature);
40 std::exit(EXIT_FAILURE);
46 std::string toml_buffer(file_signature +
'\n');
48 bool header_end_found =
false;
49 while (
bool(std::getline(ifs_id_arr, line))) {
50 toml_buffer.append(line +
'\n');
51 if (line ==
"[HeaderEnd]") {
52 header_end_found =
true;
56 if (!header_end_found) {
57 log_error(
"Pre-built multi-index Bloom filter does not have the correct "
59 std::exit(EXIT_FAILURE);
61 for (
unsigned i = 0; i < PLACEHOLDER_NEWLINES_MIBF; i++) {
62 std::getline(ifs_id_arr, line);
66 std::istringstream toml_stream(toml_buffer);
67 cpptoml::parser toml_parser(toml_stream);
68 const auto header_config = toml_parser.parse();
71 const auto header_string =
72 file_signature.substr(1, file_signature.size() - 2);
73 return header_config->get_table(header_string);
77MIBloomFilter<T>::MIBloomFilter(
const std::string& path)
78 : MIBloomFilter<T>::MIBloomFilter(
79 std::make_shared<MIBloomFilterInitializer>(path,
80 MI_BLOOM_FILTER_SIGNATURE))
85inline MIBloomFilter<T>::MIBloomFilter(
86 const std::shared_ptr<MIBloomFilterInitializer>& mibfi)
88 *(mibfi->table->get_as<decltype(id_array_size)>(
"id_array_size")))
89 , kmer_size(*(mibfi->table->get_as<decltype(kmer_size)>(
"kmer_size")))
90 , hash_num(*(mibfi->table->get_as<decltype(hash_num)>(
"hash_num")))
91 , hash_fn(mibfi->table->contains(
"hash_fn")
92 ? *(mibfi->table->get_as<decltype(hash_fn)>(
"hash_fn"))
95 , id_array(new std::atomic<T>[id_array_size])
96 , bv_insertion_completed(
97 static_cast<bool>(*(mibfi->table->get_as<int>(
"bv_insertion_completed"))))
98 , id_insertion_completed(
99 static_cast<bool>(*(mibfi->table->get_as<int>(
"id_insertion_completed"))))
102 mibfi->ifs_id_arr.read((
char*)id_array.get(),
103 std::streamsize(id_array_size *
sizeof(T)));
105 sdsl::load_from_file(il_bit_vector, mibfi->path +
".sdsl");
106 bv_rank_support = sdsl::rank_support_il<1>(&il_bit_vector);
109 counts_array = std::unique_ptr<std::atomic<uint16_t>[]>(
110 new std::atomic<uint16_t>[id_array_size]);
112 (
void*)counts_array.get(), 0, id_array_size *
sizeof(counts_array[0]));
115 "MIBloomFilter: Bit vector size: " + std::to_string(il_bit_vector.size()) +
116 "\nPopcount: " + std::to_string(get_pop_cnt()));
120inline MIBloomFilter<T>::MIBloomFilter(
size_t bv_size,
125 , hash_fn(std::move(hash_fn))
127 bit_vector = sdsl::bit_vector(bv_size);
131inline MIBloomFilter<T>::MIBloomFilter(sdsl::bit_vector& bit_vector,
134 : bit_vector(bit_vector)
136 , hash_fn(std::move(hash_fn))
138 complete_bv_insertion();
143MIBloomFilter<T>::insert_bv(
const uint64_t* hashes)
145 assert(!bv_insertion_completed);
147 for (
unsigned i = 0; i < hash_num; ++i) {
148 uint64_t pos = hashes[i] % bit_vector.size();
149 uint64_t* data_index = bit_vector.data() + (pos >> 6);
150 uint64_t bit_mask_value = (uint64_t)1 << (pos & 0x3F);
151 (void)(__sync_fetch_and_or(data_index, bit_mask_value) >>
158MIBloomFilter<T>::bv_contains(
const uint64_t* hashes)
160 assert(bv_insertion_completed);
161 for (
unsigned i = 0; i < hash_num; i++) {
162 uint64_t pos = hashes[i] % il_bit_vector.size();
163 if (il_bit_vector[pos] == 0) {
171MIBloomFilter<T>::complete_bv_insertion()
173 assert(!id_insertion_completed);
174 bv_insertion_completed =
true;
176 il_bit_vector = sdsl::bit_vector_il<BLOCKSIZE>(bit_vector);
177 bv_rank_support = sdsl::rank_support_il<1>(&il_bit_vector);
178 id_array_size = get_pop_cnt();
180 std::unique_ptr<std::atomic<T>[]>(
new std::atomic<T>[id_array_size]);
181 std::memset((
void*)id_array.get(), 0, id_array_size *
sizeof(std::atomic<T>));
182 counts_array = std::unique_ptr<std::atomic<uint16_t>[]>(
183 new std::atomic<uint16_t>[id_array_size]);
185 (
void*)counts_array.get(), 0, id_array_size *
sizeof(counts_array[0]));
189MIBloomFilter<T>::insert_id(
const uint64_t* hashes,
const T&
id)
191 assert(bv_insertion_completed && !id_insertion_completed);
193 uint32_t rand = std::rand();
194 for (
unsigned i = 0; i < hash_num; ++i) {
195 uint64_t rank = get_rank_pos(hashes[i]);
196 uint16_t count = ++counts_array[rank];
197 T random_num = (rand ^ hashes[i]) % count;
198 if (random_num == count - 1) {
205MIBloomFilter<T>::get_id(
const uint64_t* hashes)
207 return get_data(get_rank_pos(hashes));
211MIBloomFilter<T>::insert_saturation(
const uint64_t* hashes,
const T&
id)
213 assert(id_insertion_completed);
214 std::vector<uint64_t> rank_pos = get_rank_pos(hashes);
215 std::vector<T> results = get_data(rank_pos);
216 std::vector<T> replacement_ids(hash_num);
217 bool value_found =
false;
218 std::vector<T> seen_set(hash_num);
220 for (
unsigned i = 0; i < hash_num; i++) {
221 T current_result = results[i] & (btllib::MIBloomFilter<T>::ANTI_MASK &
222 btllib::MIBloomFilter<T>::ANTI_STRAND);
224 if (current_result ==
id) {
229 if (find(seen_set.begin(), seen_set.end(), current_result) ==
231 seen_set.push_back(current_result);
235 replacement_ids.push_back(current_result);
240 uint64_t replacement_pos = id_array_size;
241 T min_count = std::numeric_limits<T>::min();
242 for (
unsigned i = 0; i < hash_num; i++) {
243 T current_result = results[i] & btllib::MIBloomFilter<T>::ANTI_MASK;
244 if (find(replacement_ids.begin(),
245 replacement_ids.end(),
246 current_result) != replacement_ids.end()) {
247 if (min_count < counts_array[rank_pos[i]]) {
248 min_count = counts_array[rank_pos[i]];
249 replacement_pos = rank_pos[i];
253 if (replacement_pos != id_array_size) {
254 set_data(replacement_pos,
id);
255 ++counts_array[replacement_pos];
257 set_saturated(hashes);
263MIBloomFilter<T>::set_data(
const uint64_t& pos,
const T&
id)
267 old_value = id_array[pos];
268 }
while (!(id_array[pos].compare_exchange_strong(
269 old_value, old_value > MASK ? (
id | MASK) : id)));
273MIBloomFilter<T>::set_saturated(
const uint64_t* hashes)
275 for (
unsigned i = 0; i < hash_num; ++i) {
276 uint64_t pos = bv_rank_support(hashes[i] % il_bit_vector.size());
277 id_array[pos].fetch_or(MASK);
281inline std::vector<uint64_t>
282MIBloomFilter<T>::get_rank_pos(
const uint64_t* hashes)
const
284 std::vector<uint64_t> rank_pos(hash_num);
285 for (
unsigned i = 0; i < hash_num; ++i) {
286 uint64_t pos = hashes[i] % il_bit_vector.size();
287 rank_pos[i] = bv_rank_support(pos);
293MIBloomFilter<T>::get_data(
const std::vector<uint64_t>& rank_pos)
const
295 std::vector<T> results(hash_num);
296 for (
unsigned i = 0; i < hash_num; ++i) {
297 results[i] = id_array[rank_pos[i]];
303MIBloomFilter<T>::save(
const std::string& path,
304 const cpptoml::table& table,
308 std::ofstream ofs(path.c_str(), std::ios::out | std::ios::binary);
310 ofs << table <<
"[HeaderEnd]\n";
311 for (
unsigned i = 0; i < PLACEHOLDER_NEWLINES_MIBF; i++) {
313 ofs <<
" <binary data>";
318 ofs.write(data, std::streamsize(n));
323MIBloomFilter<T>::save(
const std::string& path)
329 auto root = cpptoml::make_table();
333 auto header = cpptoml::make_table();
334 header->insert(
"id_array_size", id_array_size);
335 header->insert(
"hash_num", get_hash_num());
336 header->insert(
"kmer_size", get_k());
337 header->insert(
"bv_insertion_completed",
338 static_cast<int>(bv_insertion_completed));
339 header->insert(
"id_insertion_completed",
340 static_cast<int>(id_insertion_completed));
342 if (!hash_fn.empty()) {
343 header->insert(
"hash_fn", get_hash_fn());
345 std::string header_string = MI_BLOOM_FILTER_SIGNATURE;
347 header_string.substr(1, header_string.size() - 2);
348 root->insert(header_string, header);
349 save(path, *root, (
char*)id_array.get(), id_array_size *
sizeof(id_array[0]));
350 sdsl::store_to_file(il_bit_vector, path +
".sdsl");
355MIBloomFilter<T>::get_pop_cnt()
357 assert(bv_insertion_completed);
358 size_t index = il_bit_vector.size() - 1;
359 while (il_bit_vector[index] == 0) {
362 return bv_rank_support(index) + 1;
366MIBloomFilter<T>::get_pop_saturated_cnt()
369 for (
size_t i = 0; i < id_array_size; ++i) {
370 if (id_array[i] >= MASK) {
378inline std::vector<size_t>
379MIBloomFilter<T>::get_id_occurence_count(
const bool& include_saturated)
382 assert(bv_insertion_completed);
385 std::vector<std::atomic<size_t>> count_vec(MASK - 1);
388#pragma omp parallel for default(none) \
389 shared(id_array_size, include_saturated, id_array, count_vec)
390 for (
size_t k = 0; k < id_array_size; k++) {
391 if (!include_saturated && id_array[k] > ANTI_MASK) {
394 count_vec[id_array[k] & ANTI_MASK].fetch_add(1);
399 std::vector<size_t> result;
400 bool has_trailing_zeros =
true;
401 for (
size_t i = MASK - 2; i != SIZE_MAX; i--) {
402 if (count_vec[i] != 0) {
403 has_trailing_zeros =
false;
405 if (!has_trailing_zeros) {
406 result.insert(result.begin(), count_vec[i].load());
415MIBloomFilter<T>::calc_optimal_size(
size_t entries,
419 auto non_64_approx_val =
420 size_t(-
double(entries) *
double(hash_num) / log(1.0 - occupancy));
421 const int magic = 64;
422 return non_64_approx_val + (magic - non_64_approx_val % magic);
void check_error(bool condition, const std::string &msg)
void log_info(const std::string &msg)
void check_file_accessibility(const std::string &filepath)
void log_error(const std::string &msg)