btllib
 All Classes Namespaces Functions Variables
mi_bloom_filter.hpp
1 #ifndef BTLLIB_MI_BLOOM_FILTER_HPP
2 #define BTLLIB_MI_BLOOM_FILTER_HPP
3 
4 #include "nthash.hpp"
5 #include "status.hpp"
6 
7 // clang-format off
8 // NOLINTBEGIN llvm-include-order
9 #include <limits>
10 #include "cpptoml.h"
11 // NOLINTEND llvm-include-order
12 // clang-format on
13 
14 #include <climits>
15 #include <cstdlib>
16 
17 #include <sdsl/bit_vector_il.hpp>
18 #include <sdsl/rank_support.hpp>
19 
20 namespace btllib {
21 
22 static const char* const MI_BLOOM_FILTER_SIGNATURE = "[BTLMIBloomFilter_v2]";
23 
24 static const unsigned PLACEHOLDER_NEWLINES_MIBF = 50;
25 
26 class MIBloomFilterInitializer
27 {
28 
30 public:
31  MIBloomFilterInitializer(const std::string& path,
32  const std::string& signature)
33  : path(path)
34  , ifs_id_arr(path)
35  , table(parse_header(signature))
36  {
37  }
38 
39  static bool check_file_signature(std::ifstream& ifs,
40  const std::string& expected_signature,
41  std::string& file_signature);
42 
43  std::string path;
44  std::ifstream ifs_id_arr;
45  std::shared_ptr<cpptoml::table> table;
46 
47  MIBloomFilterInitializer(const MIBloomFilterInitializer&) = delete;
48  MIBloomFilterInitializer(MIBloomFilterInitializer&&) = default;
49 
50  MIBloomFilterInitializer& operator=(const MIBloomFilterInitializer&) = delete;
51  MIBloomFilterInitializer& operator=(MIBloomFilterInitializer&&) = default;
52 
53 private:
56  std::shared_ptr<cpptoml::table> parse_header(const std::string& signature);
57 };
59 
60 template<typename T>
61 class MIBloomFilter
62 {
63 public:
64  static const T MASK = T(1) << (sizeof(T) * 8 - 1);
65  static const T ANTI_MASK = (T)~MASK;
66 
67  static const T STRAND = T(1) << (sizeof(T) * 8 - 2);
68  static const T ANTI_STRAND = (T)~STRAND;
69 
70  static const T ID_MASK = ANTI_STRAND & ANTI_MASK;
71 
72  static const unsigned BLOCKSIZE = 512;
73 
76  MIBloomFilter() {}
77 
85  MIBloomFilter(size_t bv_size, unsigned hash_num, std::string hash_fn = "");
86 
94  MIBloomFilter(sdsl::bit_vector& bit_vector,
95  unsigned hash_num,
96  std::string hash_fn = "");
97 
103  explicit MIBloomFilter(const std::string& path);
104 
110  void complete_bv_insertion();
111 
115  void complete_id_insertion() { id_insertion_completed = true; }
116 
123  void insert_bv(const uint64_t* hashes);
124 
131  void insert_bv(const std::vector<uint64_t>& hashes)
132  {
133  insert_bv(hashes.data());
134  }
135 
143  bool bv_contains(const uint64_t* hashes);
144 
152  bool bv_contains(const std::vector<uint64_t>& hashes)
153  {
154  return bv_contains(hashes.data());
155  }
156 
166  void insert_id(const uint64_t* hashes, const T& id);
167 
176  void insert_id(const std::vector<uint64_t>& hashes, const T& id)
177  {
178  insert_id(hashes.data(), id);
179  }
180 
186  std::vector<T> get_id(const uint64_t* hashes);
187 
193  std::vector<T> get_id(const std::vector<uint64_t>& hashes)
194  {
195  return get_id(hashes.data());
196  }
197 
204  void insert_saturation(const uint64_t* hashes, const T& id);
205 
212  void insert_saturation(const std::vector<uint64_t>& hashes, const T& id)
213  {
214  insert_saturation(hashes.data(), id);
215  }
216 
223  void save(const std::string& path);
224 
226  uint64_t get_pop_cnt();
227 
230  uint64_t get_pop_saturated_cnt();
231 
233  unsigned get_hash_num() const { return hash_num; }
234 
236  unsigned get_k() const { return kmer_size; }
237 
239  const std::string& get_hash_fn() const { return hash_fn; }
240 
242  std::vector<size_t> get_id_occurence_count(const bool& include_saturated);
243 
246  static size_t calc_optimal_size(size_t entries,
247  unsigned hash_num,
248  double occupancy);
249 
250 private:
251  MIBloomFilter(const std::shared_ptr<MIBloomFilterInitializer>& mibfi);
252  static void save(const std::string& path,
253  const cpptoml::table& table,
254  const char* data,
255  size_t n);
256  std::vector<uint64_t> get_rank_pos(const uint64_t* hashes) const;
257  uint64_t get_rank_pos(const uint64_t hash) const
258  {
259  return bv_rank_support(hash % il_bit_vector.size());
260  }
261  std::vector<T> get_data(const std::vector<uint64_t>& rank_pos) const;
262  T get_data(const uint64_t& rank) const { return id_array[rank]; }
263  void set_data(const uint64_t& pos, const T& id);
264  void set_saturated(const uint64_t* hashes);
265 
266  size_t id_array_size = 0;
267  size_t bv_size = 0;
268  unsigned kmer_size = 0;
269  unsigned hash_num = 0;
270  std::string hash_fn;
271 
272  sdsl::bit_vector bit_vector;
273  sdsl::bit_vector_il<BLOCKSIZE> il_bit_vector;
274  sdsl::rank_support_il<1> bv_rank_support;
275  std::unique_ptr<std::atomic<uint16_t>[]> counts_array;
276  std::unique_ptr<std::atomic<T>[]> id_array;
277 
278  bool bv_insertion_completed = false, id_insertion_completed = false;
279 };
280 
281 } // namespace btllib
282 
283 #include "mi_bloom_filter-inl.hpp"
284 
285 #endif