btllib
 All Classes Namespaces Functions Variables
bloom_filter.hpp
1 #ifndef BTLLIB_BLOOM_FILTER_HPP
2 #define BTLLIB_BLOOM_FILTER_HPP
3 
4 #include "btllib/nthash.hpp"
5 
6 // clang-format off
7 // NOLINTBEGIN llvm-include-order
8 #include <limits>
9 #include "cpptoml.h"
10 // NOLINTEND llvm-include-order
11 // clang-format on
12 
13 #include <atomic>
14 #include <climits>
15 #include <cstdint>
16 #include <fstream>
17 #include <memory>
18 #include <string>
19 #include <vector>
20 
21 namespace btllib {
22 
23 static const uint8_t BIT_MASKS[CHAR_BIT] = {
24  // NOLINT
25  0x01, 0x02, 0x04, 0x08, // NOLINT
26  0x10, 0x20, 0x40, 0x80 // NOLINT
27 };
28 
29 static const char* const BLOOM_FILTER_SIGNATURE = "[BTLBloomFilter_v6]";
30 static const char* const KMER_BLOOM_FILTER_SIGNATURE =
31  "[BTLKmerBloomFilter_v6]";
32 static const char* const SEED_BLOOM_FILTER_SIGNATURE =
33  "[BTLSeedBloomFilter_v6]";
34 static const char* const HASH_FN = NTHASH_FN_NAME;
35 
36 static const unsigned MAX_HASH_VALUES = 1024;
37 static const unsigned PLACEHOLDER_NEWLINES = 50;
38 
40 class BloomFilterInitializer
41 {
42 
43 public:
44  BloomFilterInitializer(const std::string& path, const std::string& signature)
45  : path(path)
46  , ifs(path)
47  , table(parse_header(signature))
48  {
49  }
50 
51  static bool check_file_signature(std::ifstream& ifs,
52  const std::string& expected_signature,
53  std::string& file_signature);
54 
55  std::string path;
56  std::ifstream ifs;
57  std::shared_ptr<cpptoml::table> table;
58 
59  BloomFilterInitializer(const BloomFilterInitializer&) = delete;
60  BloomFilterInitializer(BloomFilterInitializer&&) = default;
61 
62  BloomFilterInitializer& operator=(const BloomFilterInitializer&) = delete;
63  BloomFilterInitializer& operator=(BloomFilterInitializer&&) = default;
64 
65 private:
68  std::shared_ptr<cpptoml::table> parse_header(const std::string& signature);
69 };
71 
73 {
74 
75 public:
78 
86  BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn = "");
87 
93  explicit BloomFilter(const std::string& path);
94 
95  BloomFilter(const BloomFilter&) = delete;
96  BloomFilter(BloomFilter&&) = delete;
97 
98  BloomFilter& operator=(const BloomFilter&) = delete;
99  BloomFilter& operator=(BloomFilter&&) = delete;
100 
107  void insert(const uint64_t* hashes);
108 
114  void insert(const std::vector<uint64_t>& hashes) { insert(hashes.data()); }
115 
124  bool contains(const uint64_t* hashes) const;
125 
133  bool contains(const std::vector<uint64_t>& hashes) const
134  {
135  return contains(hashes.data());
136  }
137 
146  bool contains_insert(const uint64_t* hashes);
147 
155  bool contains_insert(const std::vector<uint64_t>& hashes)
156  {
157  return contains_insert(hashes.data());
158  }
159 
161  size_t get_bytes() const { return bytes; }
163  uint64_t get_pop_cnt() const;
165  double get_occupancy() const;
167  unsigned get_hash_num() const { return hash_num; }
169  double get_fpr() const;
171  const std::string& get_hash_fn() const { return hash_fn; }
172 
178  void save(const std::string& path);
179 
180  static void save(const std::string& path,
181  const cpptoml::table& table,
182  const char* data,
183  size_t n);
184 
190  static bool is_bloom_file(const std::string& path)
191  {
192  return check_file_signature(path, BLOOM_FILTER_SIGNATURE);
193  }
194 
195  static bool check_file_signature(const std::string& path,
196  const std::string& signature);
197 
198 private:
199  BloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
200 
201  friend class KmerBloomFilter;
202  friend class SeedBloomFilter;
203 
204  size_t bytes = 0;
205  size_t array_size =
206  0; // Should be equal to bytes, but not guaranteed by standard
207  size_t array_bits = 0;
208  unsigned hash_num = 0;
209  std::string hash_fn;
210  std::unique_ptr<std::atomic<uint8_t>[]> array;
211 };
212 
217 {
218 
219 public:
222 
230  KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k);
231 
237  explicit KmerBloomFilter(const std::string& path);
238 
239  KmerBloomFilter(const KmerBloomFilter&) = delete;
240  KmerBloomFilter(KmerBloomFilter&&) = delete;
241 
242  KmerBloomFilter& operator=(const KmerBloomFilter&) = delete;
243  KmerBloomFilter& operator=(KmerBloomFilter&&) = delete;
244 
251  void insert(const char* seq, size_t seq_len);
252 
258  void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
259 
266  void insert(const uint64_t* hashes) { bloom_filter.insert(hashes); }
267 
273  void insert(const std::vector<uint64_t>& hashes)
274  {
275  bloom_filter.insert(hashes);
276  }
277 
286  unsigned contains(const char* seq, size_t seq_len) const;
287 
295  unsigned contains(const std::string& seq) const
296  {
297  return contains(seq.c_str(), seq.size());
298  }
299 
306  bool contains(const uint64_t* hashes) const
307  {
308  return bloom_filter.contains(hashes);
309  }
310 
316  bool contains(const std::vector<uint64_t>& hashes) const
317  {
318  return bloom_filter.contains(hashes);
319  }
320 
329  unsigned contains_insert(const char* seq, size_t seq_len);
330 
338  unsigned contains_insert(const std::string& seq)
339  {
340  return contains_insert(seq.c_str(), seq.size());
341  }
342 
351  bool contains_insert(const uint64_t* hashes)
352  {
353  return bloom_filter.contains_insert(hashes);
354  }
355 
363  bool contains_insert(const std::vector<uint64_t>& hashes)
364  {
365  return bloom_filter.contains_insert(hashes);
366  }
367 
369  size_t get_bytes() const { return bloom_filter.get_bytes(); }
371  uint64_t get_pop_cnt() const { return bloom_filter.get_pop_cnt(); }
373  double get_occupancy() const { return bloom_filter.get_occupancy(); }
375  unsigned get_hash_num() const { return bloom_filter.get_hash_num(); }
377  double get_fpr() const { return bloom_filter.get_fpr(); }
379  unsigned get_k() const { return k; }
381  const std::string& get_hash_fn() const { return bloom_filter.get_hash_fn(); }
383  BloomFilter& get_bloom_filter() { return bloom_filter; }
384 
390  void save(const std::string& path);
391 
397  static bool is_bloom_file(const std::string& path)
398  {
399  return btllib::BloomFilter::check_file_signature(
400  path, KMER_BLOOM_FILTER_SIGNATURE);
401  }
402 
403 private:
404  KmerBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
405 
406  friend class SeedBloomFilter;
407 
408  unsigned k = 0;
409  BloomFilter bloom_filter;
410 };
411 
416 {
417 
418 public:
421 
430  SeedBloomFilter(size_t bytes,
431  unsigned k,
432  const std::vector<std::string>& seeds,
433  unsigned hash_num_per_seed);
434 
440  explicit SeedBloomFilter(const std::string& path);
441 
442  SeedBloomFilter(const SeedBloomFilter&) = delete;
443  SeedBloomFilter(SeedBloomFilter&&) = delete;
444 
445  SeedBloomFilter& operator=(const SeedBloomFilter&) = delete;
446  SeedBloomFilter& operator=(SeedBloomFilter&&) = delete;
447 
454  void insert(const char* seq, size_t seq_len);
455 
461  void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
462 
469  void insert(const uint64_t* hashes) { kmer_bloom_filter.insert(hashes); }
470 
476  void insert(const std::vector<uint64_t>& hashes)
477  {
478  kmer_bloom_filter.insert(hashes);
479  }
480 
491  std::vector<std::vector<unsigned>> contains(const char* seq,
492  size_t seq_len) const;
493 
503  std::vector<std::vector<unsigned>> contains(const std::string& seq) const
504  {
505  return contains(seq.c_str(), seq.size());
506  }
507 
515  bool contains(const uint64_t* hashes) const
516  {
517  return kmer_bloom_filter.contains(hashes);
518  }
519 
526  bool contains(const std::vector<uint64_t>& hashes) const
527  {
528  return kmer_bloom_filter.contains(hashes);
529  }
530 
542  std::vector<std::vector<unsigned>> contains_insert(const char* seq,
543  size_t seq_len);
544 
555  std::vector<std::vector<unsigned>> contains_insert(const std::string& seq)
556  {
557  return contains_insert(seq.c_str(), seq.size());
558  }
559 
569  bool contains_insert(const uint64_t* hashes)
570  {
571  return kmer_bloom_filter.contains_insert(hashes);
572  }
573 
582  bool contains_insert(const std::vector<uint64_t>& hashes)
583  {
584  return kmer_bloom_filter.contains_insert(hashes);
585  }
586 
588  size_t get_bytes() const { return kmer_bloom_filter.get_bytes(); }
590  uint64_t get_pop_cnt() const { return kmer_bloom_filter.get_pop_cnt(); }
592  double get_occupancy() const { return kmer_bloom_filter.get_occupancy(); }
595  unsigned get_total_hash_num() const
596  {
597  return get_hash_num_per_seed() * get_seeds().size();
598  }
601  double get_fpr() const;
603  unsigned get_k() const { return kmer_bloom_filter.get_k(); }
605  const std::vector<std::string>& get_seeds() const { return seeds; }
608  const std::vector<btllib::hashing_internals::SpacedSeed>& get_parsed_seeds()
609  const
610  {
611  return parsed_seeds;
612  }
614  unsigned get_hash_num_per_seed() const
615  {
616  return kmer_bloom_filter.get_hash_num();
617  }
619  unsigned get_hash_num() const { return get_hash_num_per_seed(); }
621  const std::string& get_hash_fn() const
622  {
623  return kmer_bloom_filter.get_hash_fn();
624  }
626  KmerBloomFilter& get_kmer_bloom_filter() { return kmer_bloom_filter; }
627 
633  void save(const std::string& path);
634 
640  static bool is_bloom_file(const std::string& path)
641  {
642  return btllib::BloomFilter::check_file_signature(
643  path, SEED_BLOOM_FILTER_SIGNATURE);
644  }
645 
646 private:
647  SeedBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
648 
649  std::vector<std::string> seeds;
650  std::vector<btllib::hashing_internals::SpacedSeed> parsed_seeds;
651  KmerBloomFilter kmer_bloom_filter;
652 };
653 
654 } // namespace btllib
655 
656 #endif
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:590
unsigned contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:338
Definition: bloom_filter.hpp:415
unsigned get_hash_num() const
Definition: bloom_filter.hpp:375
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:569
double get_occupancy() const
std::vector< std::vector< unsigned > > contains(const std::string &seq) const
Definition: bloom_filter.hpp:503
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:469
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:155
size_t get_bytes() const
Definition: bloom_filter.hpp:161
uint64_t get_pop_cnt() const
void insert(const char *seq, size_t seq_len)
void insert(const uint64_t *hashes)
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:582
void insert(const char *seq, size_t seq_len)
double get_occupancy() const
Definition: bloom_filter.hpp:373
BloomFilter & get_bloom_filter()
Definition: bloom_filter.hpp:383
BloomFilter()
Definition: bloom_filter.hpp:77
unsigned contains(const std::string &seq) const
Definition: bloom_filter.hpp:295
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:397
const std::vector< btllib::hashing_internals::SpacedSeed > & get_parsed_seeds() const
Definition: bloom_filter.hpp:608
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:640
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:526
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:515
unsigned get_hash_num() const
Definition: bloom_filter.hpp:619
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:266
size_t get_bytes() const
Definition: bloom_filter.hpp:369
bool contains_insert(const uint64_t *hashes)
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:371
std::vector< std::vector< unsigned > > contains(const char *seq, size_t seq_len) const
void insert(const std::string &seq)
Definition: bloom_filter.hpp:461
double get_fpr() const
unsigned get_total_hash_num() const
Definition: bloom_filter.hpp:595
KmerBloomFilter & get_kmer_bloom_filter()
Definition: bloom_filter.hpp:626
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:273
bool contains(const uint64_t *hashes) const
void save(const std::string &path)
std::vector< std::vector< unsigned > > contains_insert(const char *seq, size_t seq_len)
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:171
unsigned get_hash_num() const
Definition: bloom_filter.hpp:167
Definition: bloom_filter.hpp:216
Definition: bloom_filter.hpp:72
double get_fpr() const
Definition: bloom_filter.hpp:377
size_t get_bytes() const
Definition: bloom_filter.hpp:588
void insert(const std::string &seq)
Definition: bloom_filter.hpp:258
unsigned get_k() const
Definition: bloom_filter.hpp:379
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:114
unsigned get_k() const
Definition: bloom_filter.hpp:603
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:316
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:621
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:363
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:476
void save(const std::string &path)
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:306
double get_fpr() const
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:133
const std::vector< std::string > & get_seeds() const
Definition: bloom_filter.hpp:605
std::vector< std::vector< unsigned > > contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:555
double get_occupancy() const
Definition: bloom_filter.hpp:592
void save(const std::string &path)
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:381
SeedBloomFilter()
Definition: bloom_filter.hpp:420
unsigned get_hash_num_per_seed() const
Definition: bloom_filter.hpp:614
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:190
KmerBloomFilter()
Definition: bloom_filter.hpp:221
unsigned contains_insert(const char *seq, size_t seq_len)
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:351
unsigned contains(const char *seq, size_t seq_len) const