1#ifndef BTLLIB_MI_BLOOM_FILTER_HPP
2#define BTLLIB_MI_BLOOM_FILTER_HPP
17#include <sdsl/bit_vector_il.hpp>
18#include <sdsl/rank_support.hpp>
// Signature string constant for the multi-index Bloom filter.
// NOTE(review): presumably the value a file's signature is compared against
// when loading (see check_file_signature) — confirm in the implementation.
22static const char*
const MI_BLOOM_FILTER_SIGNATURE =
"[BTLMIBloomFilter_v2]";
// NOTE(review): the name suggests a number of placeholder newlines reserved in
// a saved file; its use is not visible in this header — confirm.
24static const unsigned PLACEHOLDER_NEWLINES_MIBF = 50;
// Move-only helper that holds an input file stream and a parsed cpptoml table.
// A MIBloomFilter can be constructed from a shared_ptr to one of these.
26class MIBloomFilterInitializer
// Builds an initializer from a file path and a signature string.
// `table` is initialized with the result of parse_header(signature); any other
// member initializers are not visible in this view.
31 MIBloomFilterInitializer(
const std::string& path,
32 const std::string& signature)
35 , table(parse_header(signature))
// Checks the signature of the file behind `ifs` against `expected_signature`.
// `file_signature` is a non-const reference, i.e. an output parameter.
// NOTE(review): presumably returns true on a match and stores the signature
// that was read into `file_signature` — confirm.
39 static bool check_file_signature(std::ifstream& ifs,
40 const std::string& expected_signature,
41 std::string& file_signature);
// Input stream member.
// NOTE(review): name suggests it is positioned at the ID array data — confirm.
44 std::ifstream ifs_id_arr;
// Parsed header table, produced by parse_header() in the constructor.
45 std::shared_ptr<cpptoml::table> table;
// Copying is disabled (the class owns a std::ifstream, which is not copyable).
47 MIBloomFilterInitializer(
const MIBloomFilterInitializer&) =
delete;
// Moving is allowed.
48 MIBloomFilterInitializer(MIBloomFilterInitializer&&) =
default;
// Copy assignment is disabled.
50 MIBloomFilterInitializer& operator=(
const MIBloomFilterInitializer&) =
delete;
// Move assignment is allowed.
51 MIBloomFilterInitializer& operator=(MIBloomFilterInitializer&&) =
default;
// Parses the header and returns it as a cpptoml table.
// NOTE(review): presumably validates the header against `signature` — confirm.
56 std::shared_ptr<cpptoml::table> parse_header(
const std::string& signature);
// Bit-layout constants for values of the ID type T.
// MASK has only the most significant bit of T set (assuming 8-bit bytes).
// NOTE(review): presumably the per-entry saturation flag (compare
// set_saturated / get_pop_saturated_cnt) — confirm.
64 static const T MASK = T(1) << (
sizeof(T) * 8 - 1);
// Bitwise complement of MASK; the cast brings the result of ~ back to T.
65 static const T ANTI_MASK = (T)~MASK;
// STRAND has only the second most significant bit of T set.
// NOTE(review): the name suggests a strand flag; its use is not visible here — confirm.
67 static const T STRAND = T(1) << (
sizeof(T) * 8 - 2);
// Bitwise complement of STRAND.
68 static const T ANTI_STRAND = (T)~STRAND;
// Every bit of T except the two top bits (MASK and STRAND).
70 static const T ID_MASK = ANTI_STRAND & ANTI_MASK;
// Used as the template argument of the sdsl::bit_vector_il member.
72 static const unsigned BLOCKSIZE = 512;
// Constructs a filter from a bit vector size and a number of hashes.
// `hash_fn` is optional and defaults to an empty string.
// NOTE(review): presumably `bv_size` is in bits — confirm.
85 MIBloomFilter(
size_t bv_size,
unsigned hash_num, std::string hash_fn =
"");
// Constructs a filter from an existing sdsl::bit_vector (taken by non-const
// reference). `hash_fn` is optional and defaults to an empty string.
// NOTE(review): a parameter between these two lines is not visible in this
// view (likely the hash count) — confirm against the full declaration.
94 MIBloomFilter(sdsl::bit_vector& bit_vector,
96 std::string hash_fn =
"");
// Constructs a filter from a file path; explicit to prevent implicit
// conversion from std::string.
// NOTE(review): presumably loads a filter previously written by save() — confirm.
103 explicit MIBloomFilter(
const std::string& path);
// Declared here, defined elsewhere.
// NOTE(review): presumably finalizes the bit vector stage (sets
// bv_insertion_completed and builds the rank structures) — confirm.
110 void complete_bv_insertion();
// Marks the ID insertion stage as finished by setting id_insertion_completed.
115 void complete_id_insertion() { id_insertion_completed =
true; }
// Inserts an element, given as an array of hash values, into the bit vector.
// NOTE(review): presumably reads hash_num values from `hashes` — confirm.
123 void insert_bv(
const uint64_t* hashes);
// Vector overload: forwards hashes.data() to the pointer overload.
// The vector's size is not passed along, so the caller must supply enough
// hash values for the pointer overload.
131 void insert_bv(
const std::vector<uint64_t>& hashes)
133 insert_bv(hashes.data());
// Queries the bit vector for an element given as an array of hash values.
// NOTE(review): presumably true only if all corresponding bits are set — confirm.
143 bool bv_contains(
const uint64_t* hashes);
// Vector overload: forwards hashes.data() to the pointer overload and returns
// its result.
152 bool bv_contains(
const std::vector<uint64_t>& hashes)
154 return bv_contains(hashes.data());
// Associates `id` with the element given as an array of hash values.
// NOTE(review): presumably requires the bit vector stage to be completed
// first (see complete_bv_insertion) — confirm.
166 void insert_id(
const uint64_t* hashes,
const T&
id);
// Vector overload: forwards hashes.data() and `id` to the pointer overload.
176 void insert_id(
const std::vector<uint64_t>& hashes,
const T&
id)
178 insert_id(hashes.data(),
id);
// Returns the IDs stored for the element given as an array of hash values.
// NOTE(review): presumably one entry per hash function — confirm.
186 std::vector<T> get_id(
const uint64_t* hashes);
// Vector overload: forwards hashes.data() to the pointer overload and returns
// its result.
193 std::vector<T> get_id(
const std::vector<uint64_t>& hashes)
195 return get_id(hashes.data());
// Declared here, defined elsewhere; takes an element as an array of hash
// values and an ID.
// NOTE(review): presumably handles saturation of the element's slots (compare
// set_saturated and the MASK bit) — confirm.
204 void insert_saturation(
const uint64_t* hashes,
const T&
id);
// Vector overload: forwards hashes.data() and `id` to the pointer overload.
212 void insert_saturation(
const std::vector<uint64_t>& hashes,
const T&
id)
214 insert_saturation(hashes.data(),
id);
// Saves the filter to the given path.
// NOTE(review): on-disk layout is not visible in this header — confirm.
223 void save(
const std::string& path);
// NOTE(review): presumably the number of set bits in the bit vector — confirm.
226 uint64_t get_pop_cnt();
// NOTE(review): presumably the number of ID entries flagged as saturated — confirm.
230 uint64_t get_pop_saturated_cnt();
// Returns the hash_num member.
233 unsigned get_hash_num()
const {
return hash_num; }
// Returns the kmer_size member.
236 unsigned get_k()
const {
return kmer_size; }
// Returns a const reference to the hash_fn member (valid only while this
// object is alive).
239 const std::string& get_hash_fn()
const {
return hash_fn; }
// Returns a vector of counts.
// NOTE(review): presumably indexed by ID, with `include_saturated` selecting
// whether saturated entries are counted — confirm.
242 std::vector<size_t> get_id_occurence_count(
const bool& include_saturated);
// Static size calculation taking an entry count.
// NOTE(review): the rest of the parameter list is not visible in this view.
246 static size_t calc_optimal_size(
size_t entries,
// Constructs a filter from a shared MIBloomFilterInitializer.
251 MIBloomFilter(
const std::shared_ptr<MIBloomFilterInitializer>& mibfi);
// Static save overload taking a path and a cpptoml table.
// NOTE(review): the remaining parameters are not visible in this view.
252 static void save(
const std::string& path,
253 const cpptoml::table& table,
// Array overload: returns rank positions for an array of hash values.
// NOTE(review): presumably one position per hash function — confirm.
256 std::vector<uint64_t> get_rank_pos(
const uint64_t* hashes)
const;
// Single-hash overload: reduces `hash` modulo the interleaved bit vector's
// size and returns the rank support's result for that position.
257 uint64_t get_rank_pos(
const uint64_t hash)
const
259 return bv_rank_support(hash % il_bit_vector.size());
// Returns the stored values for a list of rank positions.
261 std::vector<T> get_data(
const std::vector<uint64_t>& rank_pos)
const;
// Returns the value at index `rank` of id_array. The element is a
// std::atomic<T>, so this is an atomic load via its conversion to T.
262 T get_data(
const uint64_t& rank)
const {
return id_array[rank]; }
// NOTE(review): presumably stores `id` at position `pos` of id_array — confirm.
263 void set_data(
const uint64_t& pos,
const T&
id);
// NOTE(review): presumably sets the MASK bit on the entries addressed by
// `hashes` — confirm.
264 void set_saturated(
const uint64_t* hashes);
// NOTE(review): presumably the number of elements in id_array — confirm.
266 size_t id_array_size = 0;
// Value returned by get_k().
268 unsigned kmer_size = 0;
// Value returned by get_hash_num().
269 unsigned hash_num = 0;
// Plain SDSL bit vector.
// NOTE(review): presumably the one written to during the insert_bv stage — confirm.
272 sdsl::bit_vector bit_vector;
// Interleaved SDSL bit vector with block size BLOCKSIZE; its size() is the
// modulus used by get_rank_pos().
273 sdsl::bit_vector_il<BLOCKSIZE> il_bit_vector;
// Rank support over 1-bits; called by get_rank_pos() to turn a bit position
// into a rank.
274 sdsl::rank_support_il<1> bv_rank_support;
// Owned array of atomic 16-bit counters.
// NOTE(review): what is counted is not visible in this header — confirm.
275 std::unique_ptr<std::atomic<uint16_t>[]> counts_array;
// Owned array of atomic ID values; read by get_data(rank).
276 std::unique_ptr<std::atomic<T>[]> id_array;
// Stage flags, both initially false. id_insertion_completed is set by
// complete_id_insertion().
// NOTE(review): bv_insertion_completed is presumably set by complete_bv_insertion() — confirm.
278 bool bv_insertion_completed =
false, id_insertion_completed =
false;
283#include "mi_bloom_filter-inl.hpp"