btllib
Loading...
Searching...
No Matches
mi_bloom_filter.hpp
1#ifndef BTLLIB_MI_BLOOM_FILTER_HPP
2#define BTLLIB_MI_BLOOM_FILTER_HPP
3
4#include "nthash.hpp"
5#include "status.hpp"
6
7// clang-format off
8// NOLINTBEGIN llvm-include-order
9#include <limits>
10#include "cpptoml.h"
11// NOLINTEND llvm-include-order
12// clang-format on
13
14#include <climits>
15#include <cstdlib>
16
17#include <sdsl/bit_vector_il.hpp>
18#include <sdsl/rank_support.hpp>
19
20namespace btllib {
21
22static const char* const MI_BLOOM_FILTER_SIGNATURE = "[BTLMIBloomFilter_v2]";
23
24static const unsigned PLACEHOLDER_NEWLINES_MIBF = 50;
25
26class MIBloomFilterInitializer
27{
28
30public:
31 MIBloomFilterInitializer(const std::string& path,
32 const std::string& signature)
33 : path(path)
34 , ifs_id_arr(path)
35 , table(parse_header(signature))
36 {
37 }
38
39 static bool check_file_signature(std::ifstream& ifs,
40 const std::string& expected_signature,
41 std::string& file_signature);
42
43 std::string path;
44 std::ifstream ifs_id_arr;
45 std::shared_ptr<cpptoml::table> table;
46
47 MIBloomFilterInitializer(const MIBloomFilterInitializer&) = delete;
48 MIBloomFilterInitializer(MIBloomFilterInitializer&&) = default;
49
50 MIBloomFilterInitializer& operator=(const MIBloomFilterInitializer&) = delete;
51 MIBloomFilterInitializer& operator=(MIBloomFilterInitializer&&) = default;
52
53private:
56 std::shared_ptr<cpptoml::table> parse_header(const std::string& signature);
57};
59
60template<typename T>
61class MIBloomFilter
62{
63public:
64 static const T MASK = T(1) << (sizeof(T) * 8 - 1);
65 static const T ANTI_MASK = (T)~MASK;
66
67 static const T STRAND = T(1) << (sizeof(T) * 8 - 2);
68 static const T ANTI_STRAND = (T)~STRAND;
69
70 static const T ID_MASK = ANTI_STRAND & ANTI_MASK;
71
72 static const unsigned BLOCKSIZE = 512;
73
76 MIBloomFilter() {}
77
85 MIBloomFilter(size_t bv_size, unsigned hash_num, std::string hash_fn = "");
86
94 MIBloomFilter(sdsl::bit_vector& bit_vector,
95 unsigned hash_num,
96 std::string hash_fn = "");
97
103 explicit MIBloomFilter(const std::string& path);
104
110 void complete_bv_insertion();
111
115 void complete_id_insertion() { id_insertion_completed = true; }
116
123 void insert_bv(const uint64_t* hashes);
124
131 void insert_bv(const std::vector<uint64_t>& hashes)
132 {
133 insert_bv(hashes.data());
134 }
135
143 bool bv_contains(const uint64_t* hashes);
144
152 bool bv_contains(const std::vector<uint64_t>& hashes)
153 {
154 return bv_contains(hashes.data());
155 }
156
166 void insert_id(const uint64_t* hashes, const T& id);
167
176 void insert_id(const std::vector<uint64_t>& hashes, const T& id)
177 {
178 insert_id(hashes.data(), id);
179 }
180
186 std::vector<T> get_id(const uint64_t* hashes);
187
193 std::vector<T> get_id(const std::vector<uint64_t>& hashes)
194 {
195 return get_id(hashes.data());
196 }
197
204 void insert_saturation(const uint64_t* hashes, const T& id);
205
212 void insert_saturation(const std::vector<uint64_t>& hashes, const T& id)
213 {
214 insert_saturation(hashes.data(), id);
215 }
216
223 void save(const std::string& path);
224
226 uint64_t get_pop_cnt();
227
230 uint64_t get_pop_saturated_cnt();
231
233 unsigned get_hash_num() const { return hash_num; }
234
236 unsigned get_k() const { return kmer_size; }
237
239 const std::string& get_hash_fn() const { return hash_fn; }
240
242 std::vector<size_t> get_id_occurence_count(const bool& include_saturated);
243
246 static size_t calc_optimal_size(size_t entries,
247 unsigned hash_num,
248 double occupancy);
249
250private:
251 MIBloomFilter(const std::shared_ptr<MIBloomFilterInitializer>& mibfi);
252 static void save(const std::string& path,
253 const cpptoml::table& table,
254 const char* data,
255 size_t n);
256 std::vector<uint64_t> get_rank_pos(const uint64_t* hashes) const;
257 uint64_t get_rank_pos(const uint64_t hash) const
258 {
259 return bv_rank_support(hash % il_bit_vector.size());
260 }
261 std::vector<T> get_data(const std::vector<uint64_t>& rank_pos) const;
262 T get_data(const uint64_t& rank) const { return id_array[rank]; }
263 void set_data(const uint64_t& pos, const T& id);
264 void set_saturated(const uint64_t* hashes);
265
266 size_t id_array_size = 0;
267 size_t bv_size = 0;
268 unsigned kmer_size = 0;
269 unsigned hash_num = 0;
270 std::string hash_fn;
271
272 sdsl::bit_vector bit_vector;
273 sdsl::bit_vector_il<BLOCKSIZE> il_bit_vector;
274 sdsl::rank_support_il<1> bv_rank_support;
275 std::unique_ptr<std::atomic<uint16_t>[]> counts_array;
276 std::unique_ptr<std::atomic<T>[]> id_array;
277
278 bool bv_insertion_completed = false, id_insertion_completed = false;
279};
280
281} // namespace btllib
282
283#include "mi_bloom_filter-inl.hpp"
284
285#endif
Definition aahash.hpp:12