btllib
 All Classes Namespaces Functions Variables
nthash_seed.hpp
1 #pragma once
2 
3 #include <array>
4 #include <cstdint>
5 #include <cstring>
6 #include <deque>
7 #include <memory>
8 #include <string>
9 #include <string_view>
10 #include <vector>
11 
12 #include <btllib/hashing_internals.hpp>
13 #include <btllib/status.hpp>
14 
15 namespace btllib::hashing_internals {
16 
17 using SpacedSeed = std::vector<unsigned>;
18 using SpacedSeedBlocks = std::vector<std::array<unsigned, 2>>;
19 using SpacedSeedMonomers = std::vector<unsigned>;
20 
21 inline void
22 get_blocks(const std::vector<std::string>& seed_strings,
23  std::vector<SpacedSeedBlocks>& blocks,
24  std::vector<SpacedSeedMonomers>& monomers)
25 {
26  for (const auto& seed_string : seed_strings) {
27  const char pad = seed_string[seed_string.length() - 1] == '1' ? '0' : '1';
28  const std::string padded_string = seed_string + pad;
29  SpacedSeedBlocks care_blocks, ignore_blocks;
30  std::vector<unsigned> care_monos, ignore_monos;
31  unsigned i_start = 0;
32  bool is_care_block = padded_string[0] == '1';
33  for (unsigned pos = 0; pos < padded_string.length(); pos++) {
34  if (is_care_block && padded_string[pos] == '0') {
35  if (pos - i_start == 1) {
36  care_monos.push_back(i_start);
37  } else {
38  const std::array<unsigned, 2> block{ { i_start, pos } };
39  care_blocks.push_back(block);
40  }
41  i_start = pos;
42  is_care_block = false;
43  } else if (!is_care_block && padded_string[pos] == '1') {
44  if (pos - i_start == 1) {
45  ignore_monos.push_back(i_start);
46  } else {
47  const std::array<unsigned, 2> block{ { i_start, pos } };
48  ignore_blocks.push_back(block);
49  }
50  i_start = pos;
51  is_care_block = true;
52  }
53  }
54  const unsigned num_cares = care_blocks.size() * 2 + care_monos.size();
55  const unsigned num_ignores =
56  ignore_blocks.size() * 2 + ignore_monos.size() + 2;
57  if (num_ignores < num_cares) {
58  const unsigned string_end = seed_string.length();
59  const std::array<unsigned, 2> block{ { 0, string_end } };
60  ignore_blocks.push_back(block);
61  blocks.push_back(ignore_blocks);
62  monomers.push_back(ignore_monos);
63  } else {
64  blocks.push_back(care_blocks);
65  monomers.push_back(care_monos);
66  }
67  }
68 }
69 
70 inline void
71 parsed_seeds_to_blocks(const std::vector<std::vector<unsigned>>& seeds,
72  unsigned k,
73  std::vector<SpacedSeedBlocks>& blocks,
74  std::vector<SpacedSeedMonomers>& monomers)
75 {
76  std::vector<std::string> seed_strings;
77  for (const auto& seed : seeds) {
78  std::string seed_string(k, '1');
79  for (const auto& i : seed) {
80  seed_string[i] = '0';
81  }
82  seed_strings.push_back(seed_string);
83  }
84  get_blocks(seed_strings, blocks, monomers);
85 }
86 
87 inline void
88 check_seeds(const std::vector<std::string>& seeds, unsigned k)
89 {
90  for (const auto& seed : seeds) {
91  btllib::check_error(seed.length() != k,
92  "SeedNtHash: Spaced seed string length (" +
93  std::to_string(seed.length()) + ") not equal to k=" +
94  std::to_string(k) + " in " + seed);
95  const std::string reversed(seed.rbegin(), seed.rend());
97  seed != reversed,
98  "SeedNtHash: Seed " + seed +
99  " is not symmetric, reverse-complement hashing will be inconsistent");
100  }
101 }
102 
127 inline bool
128 ntmsm64(const char* kmer_seq,
129  const std::vector<SpacedSeedBlocks>& seeds_blocks,
130  const std::vector<SpacedSeedMonomers>& seeds_monomers,
131  unsigned k,
132  unsigned m,
133  unsigned m2,
134  uint64_t* fh_nomonos,
135  uint64_t* rh_nomonos,
136  uint64_t* fh_val,
137  uint64_t* rh_val,
138  unsigned& loc_n,
139  uint64_t* h_val)
140 {
141  unsigned i_base;
142  uint64_t fh_seed, rh_seed;
143  for (unsigned i_seed = 0; i_seed < m; i_seed++) {
144  fh_seed = 0;
145  rh_seed = 0;
146  for (const auto& block : seeds_blocks[i_seed]) {
147  uint8_t fh_loc, rh_loc, d;
148  uint64_t x;
149  unsigned i = block[0];
150  unsigned j = block[1];
151  switch (j - i) {
152  case 2: {
153  fh_loc = (CONVERT_TAB[(unsigned char)kmer_seq[i]] << 2) | // NOLINT
154  (CONVERT_TAB[(unsigned char)kmer_seq[i + 1]]); // NOLINT
155  rh_loc =
156  (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 2) | // NOLINT
157  (RC_CONVERT_TAB[(unsigned char)kmer_seq[i]]); // NOLINT
158  x = DIMER_TAB[fh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
159  d = k - i - 2; // NOLINT
160  fh_seed ^= d > 0 ? srol(x, d) : x;
161  x = DIMER_TAB[rh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
162  d = i;
163  rh_seed ^= d > 0 ? srol(x, d) : x;
164  } break;
165  case 3: {
166  fh_loc =
167  (CONVERT_TAB[(unsigned char)kmer_seq[i]] << 4) | // NOLINT
168  (CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 2) | // NOLINT
169  (CONVERT_TAB[(unsigned char)kmer_seq[i + 2]]); // NOLINT
170  rh_loc =
171  (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 2]] << 4) | // NOLINT
172  (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 2) | // NOLINT
173  (RC_CONVERT_TAB[(unsigned char)kmer_seq[i]]);
174  x = TRIMER_TAB[fh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
175  d = k - i - 3; // NOLINT
176  fh_seed ^= d > 0 ? srol(x, d) : x;
177  x = TRIMER_TAB[rh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
178  d = i;
179  rh_seed ^= d > 0 ? srol(x, d) : x;
180  } break;
181  case 4: {
182  fh_loc =
183  (CONVERT_TAB[(unsigned char)kmer_seq[i]] << 6) | // NOLINT
184  (CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 4) | // NOLINT
185  (CONVERT_TAB[(unsigned char)kmer_seq[i + 2]] << 2) | // NOLINT
186  (CONVERT_TAB[(unsigned char)kmer_seq[i + 3]]); // NOLINT
187  rh_loc =
188  (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 3]] << 6) | // NOLINT
189  (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 2]] << 4) | // NOLINT
190  (RC_CONVERT_TAB[(unsigned char)kmer_seq[i + 1]] << 2) | // NOLINT
191  (RC_CONVERT_TAB[(unsigned char)kmer_seq[i]]);
192  x = TETRAMER_TAB[fh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
193  d = k - i - 4; // NOLINT
194  fh_seed ^= d > 0 ? srol(x, d) : x;
195  x = TETRAMER_TAB[rh_loc]; // cppcheck-suppress arrayIndexOutOfBounds
196  d = i;
197  rh_seed ^= d > 0 ? srol(x, d) : x;
198  } break;
199  default: {
200  for (unsigned pos = block[0]; pos < block[1]; pos++) {
201  if (kmer_seq[pos] == SEED_N) {
202  loc_n = pos;
203  return false;
204  }
205  fh_seed ^= srol_table((unsigned char)kmer_seq[pos], k - 1 - pos);
206  rh_seed ^= srol_table((unsigned char)kmer_seq[pos] & CP_OFF, pos);
207  }
208  }
209  }
210  }
211  fh_nomonos[i_seed] = fh_seed;
212  rh_nomonos[i_seed] = rh_seed;
213  for (const auto& pos : seeds_monomers[i_seed]) {
214  fh_seed ^= srol_table((unsigned char)kmer_seq[pos], k - 1 - pos);
215  rh_seed ^= srol_table((unsigned char)kmer_seq[pos] & CP_OFF, pos);
216  }
217  fh_val[i_seed] = fh_seed;
218  rh_val[i_seed] = rh_seed;
219  i_base = i_seed * m2;
220  h_val[i_base] = canonical(fh_seed, rh_seed);
221  for (unsigned i_hash = 1; i_hash < m2; i_hash++) {
222  h_val[i_base + i_hash] = h_val[i_base] * (i_hash ^ k * MULTISEED);
223  h_val[i_base + i_hash] ^= h_val[i_base + i_hash] >> MULTISHIFT;
224  }
225  }
226  return true;
227 }
228 
229 #define NTMSM64(ROL_HANDLING, IN_HANDLING, OUT_HANDLING, ROR_HANDLING) \
230  unsigned char char_out, char_in; \
231  uint64_t fh_seed, rh_seed; \
232  unsigned i_out, i_in, i_base; \
233  for (unsigned i_seed = 0; i_seed < m; i_seed++) { \
234  ROL_HANDLING /* NOLINT(bugprone-macro-parentheses) */ \
235  for (const auto& block : seeds_blocks[i_seed]) \
236  { \
237  IN_HANDLING \
238  OUT_HANDLING \
239  fh_seed ^= srol_table(char_out, k - i_out); \
240  fh_seed ^= srol_table(char_in, k - i_in); \
241  rh_seed ^= srol_table(char_out & CP_OFF, i_out); \
242  rh_seed ^= srol_table(char_in & CP_OFF, i_in); \
243  } \
244  ROR_HANDLING /* NOLINT(bugprone-macro-parentheses) */ \
245  fh_nomonos[i_seed] = fh_seed; \
246  rh_nomonos[i_seed] = rh_seed; \
247  for (const auto& pos : seeds_monomers[i_seed]) { \
248  fh_seed ^= srol_table((unsigned char)kmer_seq[pos + 1], k - 1 - pos); \
249  rh_seed ^= srol_table((unsigned char)kmer_seq[pos + 1] & CP_OFF, pos); \
250  } \
251  fh_val[i_seed] = fh_seed; \
252  rh_val[i_seed] = rh_seed; \
253  i_base = i_seed * m2; \
254  h_val[i_base] = canonical(fh_seed, rh_seed); \
255  for (unsigned i_hash = 1; i_hash < m2; i_hash++) { \
256  h_val[i_base + i_hash] = h_val[i_base] * (i_hash ^ k * MULTISEED); \
257  h_val[i_base + i_hash] ^= h_val[i_base + i_hash] >> MULTISHIFT; \
258  } \
259  }
260 
282 inline void
283 ntmsm64(const char* kmer_seq,
284  const std::vector<SpacedSeedBlocks>& seeds_blocks,
285  const std::vector<SpacedSeedMonomers>& seeds_monomers,
286  unsigned k,
287  unsigned m,
288  unsigned m2,
289  uint64_t* fh_nomonos,
290  uint64_t* rh_nomonos,
291  uint64_t* fh_val,
292  uint64_t* rh_val,
293  uint64_t* h_val)
294 {
295  NTMSM64(fh_seed = srol(fh_nomonos[i_seed]); rh_seed = rh_nomonos[i_seed];
296  , i_in = block[1];
297  char_in = (unsigned char)kmer_seq[i_in];
298  , i_out = block[0];
299  char_out = (unsigned char)kmer_seq[i_out];
300  , rh_seed = sror(rh_seed);)
301 }
302 
303 inline void
304 ntmsm64(const std::deque<char>& kmer_seq,
305  const std::vector<SpacedSeedBlocks>& seeds_blocks,
306  const std::vector<SpacedSeedMonomers>& seeds_monomers,
307  unsigned k,
308  unsigned m,
309  unsigned m2,
310  uint64_t* fh_nomonos,
311  uint64_t* rh_nomonos,
312  uint64_t* fh_val,
313  uint64_t* rh_val,
314  uint64_t* h_val)
315 {
316  NTMSM64(fh_seed = srol(fh_nomonos[i_seed]); rh_seed = rh_nomonos[i_seed];
317  , i_in = block[1];
318  char_in = (unsigned char)kmer_seq[i_in];
319  , i_out = block[0];
320  char_out = (unsigned char)kmer_seq[i_out];
321  , rh_seed = sror(rh_seed);)
322 }
323 
345 inline void
346 ntmsm64l(const char* kmer_seq,
347  const std::vector<SpacedSeedBlocks>& seeds_blocks,
348  const std::vector<SpacedSeedMonomers>& seeds_monomers,
349  unsigned k,
350  unsigned m,
351  unsigned m2,
352  uint64_t* fh_nomonos,
353  uint64_t* rh_nomonos,
354  uint64_t* fh_val,
355  uint64_t* rh_val,
356  uint64_t* h_val)
357 {
358  NTMSM64(fh_seed = fh_nomonos[i_seed]; rh_seed = srol(rh_nomonos[i_seed]);
359  , i_in = block[0];
360  char_in = (unsigned char)kmer_seq[i_in];
361  , i_out = block[1];
362  char_out = (unsigned char)kmer_seq[i_out];
363  , fh_seed = sror(fh_seed);)
364 }
365 
366 inline void
367 ntmsm64l(const std::deque<char>& kmer_seq,
368  const std::vector<SpacedSeedBlocks>& seeds_blocks,
369  const std::vector<SpacedSeedMonomers>& seeds_monomers,
370  unsigned k,
371  unsigned m,
372  unsigned m2,
373  uint64_t* fh_nomonos,
374  uint64_t* rh_nomonos,
375  uint64_t* fh_val,
376  uint64_t* rh_val,
377  uint64_t* h_val)
378 {
379  NTMSM64(fh_seed = fh_nomonos[i_seed]; rh_seed = srol(rh_nomonos[i_seed]);
380  , i_in = block[0];
381  char_in = (unsigned char)kmer_seq[i_in];
382  , i_out = block[1];
383  char_out = (unsigned char)kmer_seq[i_out];
384  , fh_seed = sror(fh_seed);)
385 }
386 
408 inline void
409 ntmsm64(const char* kmer_seq,
410  char in,
411  const std::vector<SpacedSeedBlocks>& seeds_blocks,
412  const std::vector<SpacedSeedMonomers>& seeds_monomers,
413  unsigned k,
414  unsigned m,
415  unsigned m2,
416  uint64_t* fh_nomonos,
417  uint64_t* rh_nomonos,
418  uint64_t* fh_val,
419  uint64_t* rh_val,
420  uint64_t* h_val)
421 {
422  NTMSM64(
423  fh_seed = srol(fh_nomonos[i_seed]); rh_seed = rh_nomonos[i_seed];
424  , i_in = block[1];
425  if (i_in > k - 1) { char_in = in; } else {
426  char_in = (unsigned char)kmer_seq[i_in];
427  },
428  i_out = block[0];
429  char_out = (unsigned char)kmer_seq[i_out];
430  , rh_seed = sror(rh_seed);)
431 }
432 
454 inline void
455 ntmsm64l(const char* kmer_seq,
456  char in,
457  const std::vector<SpacedSeedBlocks>& seeds_blocks,
458  const std::vector<SpacedSeedMonomers>& seeds_monomers,
459  unsigned k,
460  unsigned m,
461  unsigned m2,
462  uint64_t* fh_nomonos,
463  uint64_t* rh_nomonos,
464  uint64_t* fh_val,
465  uint64_t* rh_val,
466  uint64_t* h_val)
467 {
468  NTMSM64(
469  fh_seed = fh_nomonos[i_seed]; rh_seed = srol(rh_nomonos[i_seed]);
470  , i_in = block[0];
471  if (i_in > k - 1) { char_in = in; } else {
472  char_in = (unsigned char)kmer_seq[i_in];
473  },
474  i_out = block[1];
475  char_out = (unsigned char)kmer_seq[i_out];
476  , fh_seed = sror(fh_seed);)
477 }
478 
479 } // namespace btllib::hashing_internals
480 
481 namespace btllib {
482 
483 using hashing_internals::check_seeds;
484 using hashing_internals::get_blocks;
485 using hashing_internals::ntmsm64;
486 using hashing_internals::ntmsm64l;
487 using hashing_internals::parsed_seeds_to_blocks;
488 using hashing_internals::SEED_N;
489 using hashing_internals::SEED_TAB;
490 
495 inline std::vector<std::vector<unsigned>>
496 parse_seeds(const std::vector<std::string>& seed_strings)
497 {
498  std::vector<std::vector<unsigned>> seed_set;
499  for (const auto& seed_string : seed_strings) {
500  std::vector<unsigned> seed;
501  size_t pos = 0;
502  for (const auto& c : seed_string) {
503  if (c != '1') {
504  seed.push_back(pos);
505  }
506  ++pos;
507  }
508  seed_set.push_back(seed);
509  }
510  return seed_set;
511 }
512 
517 {
518 
519 public:
530  SeedNtHash(const char* seq,
531  size_t seq_len,
532  const std::vector<std::string>& seeds,
533  hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
534  hashing_internals::K_TYPE k,
535  size_t pos = 0)
536  : seq(seq)
537  , seq_len(seq_len)
538  , num_hashes_per_seed(num_hashes_per_seed)
539  , k(k)
540  , pos(pos)
541  , initialized(false)
542  , fwd_hash_nomonos(new uint64_t[seeds.size()])
543  , rev_hash_nomonos(new uint64_t[seeds.size()])
544  , fwd_hash(new uint64_t[seeds.size()])
545  , rev_hash(new uint64_t[seeds.size()])
546  , hash_arr(new uint64_t[num_hashes_per_seed * seeds.size()])
547  {
548  check_seeds(seeds, k);
549  check_error(seeds[0].size() != k,
550  "SeedNtHash: k should be equal to seed string lengths");
551  get_blocks(seeds, blocks, monomers);
552  }
553 
563  SeedNtHash(const std::string& seq,
564  const std::vector<std::string>& seeds,
565  hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
566  hashing_internals::K_TYPE k,
567  size_t pos = 0)
568  : SeedNtHash(seq.data(), seq.size(), seeds, num_hashes_per_seed, k, pos)
569  {
570  }
571 
582  SeedNtHash(const char* seq,
583  size_t seq_len,
584  const std::vector<std::vector<unsigned>>& seeds,
585  hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
586  hashing_internals::K_TYPE k,
587  size_t pos = 0)
588  : seq(seq)
589  , seq_len(seq_len)
590  , num_hashes_per_seed(num_hashes_per_seed)
591  , k(k)
592  , pos(pos)
593  , initialized(false)
594  , fwd_hash_nomonos(new uint64_t[seeds.size()])
595  , rev_hash_nomonos(new uint64_t[seeds.size()])
596  , fwd_hash(new uint64_t[seeds.size()])
597  , rev_hash(new uint64_t[seeds.size()])
598  , hash_arr(new uint64_t[num_hashes_per_seed * seeds.size()])
599  {
600  parsed_seeds_to_blocks(seeds, k, blocks, monomers);
601  }
602 
612  SeedNtHash(const std::string& seq,
613  const std::vector<std::vector<unsigned>>& seeds,
614  hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
615  hashing_internals::K_TYPE k,
616  size_t pos = 0)
617  : SeedNtHash(seq.data(), seq.size(), seeds, num_hashes_per_seed, k, pos)
618  {
619  }
620 
621  SeedNtHash(const SeedNtHash& obj)
622  : seq(obj.seq)
623  , seq_len(obj.seq_len)
624  , num_hashes_per_seed(obj.num_hashes_per_seed)
625  , k(obj.k)
626  , pos(obj.pos)
627  , initialized(obj.initialized)
628  , blocks(obj.blocks)
629  , monomers(obj.monomers)
630  , fwd_hash_nomonos(new uint64_t[obj.blocks.size()])
631  , rev_hash_nomonos(new uint64_t[obj.blocks.size()])
632  , fwd_hash(new uint64_t[obj.blocks.size()])
633  , rev_hash(new uint64_t[obj.blocks.size()])
634  , hash_arr(new uint64_t[obj.num_hashes_per_seed * obj.blocks.size()])
635  {
636  std::memcpy(fwd_hash_nomonos.get(),
637  obj.fwd_hash_nomonos.get(),
638  obj.blocks.size() * sizeof(uint64_t));
639  std::memcpy(rev_hash_nomonos.get(),
640  obj.rev_hash_nomonos.get(),
641  obj.blocks.size() * sizeof(uint64_t));
642  std::memcpy(
643  fwd_hash.get(), obj.fwd_hash.get(), obj.blocks.size() * sizeof(uint64_t));
644  std::memcpy(
645  rev_hash.get(), obj.rev_hash.get(), obj.blocks.size() * sizeof(uint64_t));
646  std::memcpy(hash_arr.get(),
647  obj.hash_arr.get(),
648  obj.num_hashes_per_seed * obj.blocks.size() * sizeof(uint64_t));
649  }
650 
651  SeedNtHash(SeedNtHash&&) = default;
652 
658  bool roll()
659  {
660  if (!initialized) {
661  return init();
662  }
663  if (pos >= seq_len - k) {
664  return false;
665  }
666  if (SEED_TAB[(unsigned char)seq[pos + k]] == SEED_N) {
667  pos += k;
668  return init();
669  }
670  ntmsm64(seq + pos,
671  blocks,
672  monomers,
673  k,
674  blocks.size(),
675  num_hashes_per_seed,
676  fwd_hash_nomonos.get(),
677  rev_hash_nomonos.get(),
678  fwd_hash.get(),
679  rev_hash.get(),
680  hash_arr.get());
681  ++pos;
682  return true;
683  }
684 
689  bool roll_back()
690  {
691  if (!initialized) {
692  return init();
693  }
694  if (pos == 0) {
695  return false;
696  }
697  if (SEED_TAB[(unsigned char)seq[pos - 1]] == SEED_N && pos >= k) {
698  pos -= k;
699  return init();
700  }
701  if (SEED_TAB[(unsigned char)seq[pos - 1]] == SEED_N) {
702  return false;
703  }
704  ntmsm64l(seq + pos - 1,
705  blocks,
706  monomers,
707  k,
708  blocks.size(),
709  num_hashes_per_seed,
710  fwd_hash_nomonos.get(),
711  rev_hash_nomonos.get(),
712  fwd_hash.get(),
713  rev_hash.get(),
714  hash_arr.get());
715  --pos;
716  return true;
717  }
718 
724  bool peek()
725  {
726  if (pos >= seq_len - k) {
727  return false;
728  }
729  return peek(seq[pos + k]);
730  }
731 
736  bool peek_back()
737  {
738  if (pos == 0) {
739  return false;
740  }
741  return peek_back(seq[pos - 1]);
742  }
743 
748  bool peek(char char_in)
749  {
750  if (!initialized) {
751  return init();
752  }
753  const std::unique_ptr<uint64_t[]> fwd_hash_nomonos_cpy(
754  new uint64_t[blocks.size()]);
755  const std::unique_ptr<uint64_t[]> rev_hash_nomonos_cpy(
756  new uint64_t[blocks.size()]);
757  const std::unique_ptr<uint64_t[]> fwd_hash_cpy(new uint64_t[blocks.size()]);
758  const std::unique_ptr<uint64_t[]> rev_hash_cpy(new uint64_t[blocks.size()]);
759  std::memcpy(fwd_hash_nomonos_cpy.get(),
760  fwd_hash_nomonos.get(),
761  blocks.size() * sizeof(uint64_t));
762  std::memcpy(rev_hash_nomonos_cpy.get(),
763  rev_hash_nomonos.get(),
764  blocks.size() * sizeof(uint64_t));
765  std::memcpy(
766  fwd_hash_cpy.get(), fwd_hash.get(), blocks.size() * sizeof(uint64_t));
767  std::memcpy(
768  rev_hash_cpy.get(), rev_hash.get(), blocks.size() * sizeof(uint64_t));
769  ntmsm64(seq + pos,
770  char_in,
771  blocks,
772  monomers,
773  k,
774  blocks.size(),
775  num_hashes_per_seed,
776  fwd_hash_nomonos_cpy.get(),
777  rev_hash_nomonos_cpy.get(),
778  fwd_hash_cpy.get(),
779  rev_hash_cpy.get(),
780  hash_arr.get());
781  return true;
782  }
783 
788  bool peek_back(char char_in)
789  {
790  if (!initialized) {
791  return init();
792  }
793  const std::unique_ptr<uint64_t[]> fwd_hash_nomonos_cpy(
794  new uint64_t[blocks.size()]);
795  const std::unique_ptr<uint64_t[]> rev_hash_nomonos_cpy(
796  new uint64_t[blocks.size()]);
797  const std::unique_ptr<uint64_t[]> fwd_hash_cpy(new uint64_t[blocks.size()]);
798  const std::unique_ptr<uint64_t[]> rev_hash_cpy(new uint64_t[blocks.size()]);
799  std::memcpy(fwd_hash_nomonos_cpy.get(),
800  fwd_hash_nomonos.get(),
801  blocks.size() * sizeof(uint64_t));
802  std::memcpy(rev_hash_nomonos_cpy.get(),
803  rev_hash_nomonos.get(),
804  blocks.size() * sizeof(uint64_t));
805  std::memcpy(
806  fwd_hash_cpy.get(), fwd_hash.get(), blocks.size() * sizeof(uint64_t));
807  std::memcpy(
808  rev_hash_cpy.get(), rev_hash.get(), blocks.size() * sizeof(uint64_t));
809  ntmsm64l(seq + pos - 1,
810  char_in,
811  blocks,
812  monomers,
813  k,
814  blocks.size(),
815  num_hashes_per_seed,
816  fwd_hash_nomonos_cpy.get(),
817  rev_hash_nomonos_cpy.get(),
818  fwd_hash_cpy.get(),
819  rev_hash_cpy.get(),
820  hash_arr.get());
821  return true;
822  }
823 
828  const uint64_t* hashes() const { return hash_arr.get(); }
829 
835  size_t get_pos() const { return pos; }
836 
841  unsigned get_hash_num() const { return num_hashes_per_seed * blocks.size(); }
842 
847  hashing_internals::NUM_HASHES_TYPE get_hash_num_per_seed() const
848  {
849  return num_hashes_per_seed;
850  }
851 
856  hashing_internals::K_TYPE get_k() const { return k; }
857 
862  uint64_t* get_forward_hash() const { return fwd_hash.get(); }
863 
868  uint64_t* get_reverse_hash() const { return rev_hash.get(); }
869 
870 private:
871  const char* seq;
872  const size_t seq_len;
873  hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed;
874  hashing_internals::K_TYPE k;
875  size_t pos;
876  bool initialized;
877  std::vector<hashing_internals::SpacedSeedBlocks> blocks;
878  std::vector<hashing_internals::SpacedSeedMonomers> monomers;
879  std::unique_ptr<uint64_t[]> fwd_hash_nomonos;
880  std::unique_ptr<uint64_t[]> rev_hash_nomonos;
881  std::unique_ptr<uint64_t[]> fwd_hash;
882  std::unique_ptr<uint64_t[]> rev_hash;
883  std::unique_ptr<uint64_t[]> hash_arr;
884 
889  bool init()
890  {
891  unsigned pos_n = 0;
892  while (pos < seq_len - k + 1 && !ntmsm64(seq + pos,
893  blocks,
894  monomers,
895  k,
896  blocks.size(),
897  num_hashes_per_seed,
898  fwd_hash_nomonos.get(),
899  rev_hash_nomonos.get(),
900  fwd_hash.get(),
901  rev_hash.get(),
902  pos_n,
903  hash_arr.get())) {
904  pos += pos_n + 1;
905  }
906  if (pos > seq_len - k) {
907  return false;
908  }
909  initialized = true;
910  return true;
911  }
912 };
913 
919 {
920 
921 public:
932  BlindSeedNtHash(const char* seq,
933  const std::vector<std::string>& seeds,
934  hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed,
935  hashing_internals::K_TYPE k,
936  long pos = 0)
937  : seq(seq + pos, seq + pos + k)
938  , num_hashes_per_seed(num_hashes_per_seed)
939  , k(k)
940  , pos(pos)
941  , fwd_hash_nomonos(new uint64_t[seeds.size()])
942  , rev_hash_nomonos(new uint64_t[seeds.size()])
943  , fwd_hash(new uint64_t[seeds.size()])
944  , rev_hash(new uint64_t[seeds.size()])
945  , hash_arr(new uint64_t[num_hashes_per_seed * seeds.size()])
946  {
947  check_seeds(seeds, k);
948  get_blocks(seeds, blocks, monomers);
949  unsigned pos_n = 0;
950  ntmsm64(seq + pos,
951  blocks,
952  monomers,
953  k,
954  blocks.size(),
955  num_hashes_per_seed,
956  fwd_hash_nomonos.get(),
957  rev_hash_nomonos.get(),
958  fwd_hash.get(),
959  rev_hash.get(),
960  pos_n,
961  hash_arr.get());
962  }
963 
964  BlindSeedNtHash(const BlindSeedNtHash& seed_nthash)
965  : seq(seed_nthash.seq)
966  , num_hashes_per_seed(seed_nthash.num_hashes_per_seed)
967  , k(seed_nthash.k)
968  , pos(seed_nthash.pos)
969  , blocks(seed_nthash.blocks)
970  , monomers(seed_nthash.monomers)
971  , fwd_hash_nomonos(new uint64_t[seed_nthash.blocks.size()])
972  , rev_hash_nomonos(new uint64_t[seed_nthash.blocks.size()])
973  , fwd_hash(new uint64_t[seed_nthash.blocks.size()])
974  , rev_hash(new uint64_t[seed_nthash.blocks.size()])
975  , hash_arr(new uint64_t[num_hashes_per_seed * seed_nthash.blocks.size()])
976  {
977  std::memcpy(fwd_hash_nomonos.get(),
978  seed_nthash.fwd_hash_nomonos.get(),
979  seed_nthash.blocks.size() * sizeof(uint64_t));
980  std::memcpy(rev_hash_nomonos.get(),
981  seed_nthash.rev_hash_nomonos.get(),
982  seed_nthash.blocks.size() * sizeof(uint64_t));
983  std::memcpy(fwd_hash.get(),
984  seed_nthash.fwd_hash.get(),
985  seed_nthash.blocks.size() * sizeof(uint64_t));
986  std::memcpy(rev_hash.get(),
987  seed_nthash.rev_hash.get(),
988  seed_nthash.blocks.size() * sizeof(uint64_t));
989  std::memcpy(hash_arr.get(),
990  seed_nthash.hash_arr.get(),
991  num_hashes_per_seed * seed_nthash.blocks.size() *
992  sizeof(uint64_t));
993  }
994 
995  BlindSeedNtHash(BlindSeedNtHash&&) = default;
996 
1002  void roll(char char_in)
1003  {
1004  seq.push_back(char_in);
1005  ntmsm64(seq,
1006  blocks,
1007  monomers,
1008  k,
1009  blocks.size(),
1010  num_hashes_per_seed,
1011  fwd_hash_nomonos.get(),
1012  rev_hash_nomonos.get(),
1013  fwd_hash.get(),
1014  rev_hash.get(),
1015  hash_arr.get());
1016  seq.pop_front();
1017  ++pos;
1018  }
1019 
1023  void roll_back(char char_in)
1024  {
1025  seq.push_front(char_in);
1026  ntmsm64l(seq,
1027  blocks,
1028  monomers,
1029  k,
1030  blocks.size(),
1031  num_hashes_per_seed,
1032  fwd_hash_nomonos.get(),
1033  rev_hash_nomonos.get(),
1034  fwd_hash.get(),
1035  rev_hash.get(),
1036  hash_arr.get());
1037  seq.pop_back();
1038  --pos;
1039  }
1040 
1045  const uint64_t* hashes() const { return hash_arr.get(); }
1046 
1052  long get_pos() const { return pos; }
1053 
1058  unsigned get_hash_num() const { return num_hashes_per_seed * blocks.size(); }
1059 
1064  hashing_internals::NUM_HASHES_TYPE get_hash_num_per_seed() const
1065  {
1066  return num_hashes_per_seed;
1067  }
1068 
1073  hashing_internals::K_TYPE get_k() const { return k; }
1074 
1079  uint64_t* get_forward_hash() const { return fwd_hash.get(); }
1080 
1085  uint64_t* get_reverse_hash() const { return rev_hash.get(); }
1086 
1087 private:
1088  std::deque<char> seq;
1089  hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed;
1090  hashing_internals::K_TYPE k;
1091  long pos;
1092  std::vector<hashing_internals::SpacedSeedBlocks> blocks;
1093  std::vector<hashing_internals::SpacedSeedMonomers> monomers;
1094  std::unique_ptr<uint64_t[]> fwd_hash_nomonos;
1095  std::unique_ptr<uint64_t[]> rev_hash_nomonos;
1096  std::unique_ptr<uint64_t[]> fwd_hash;
1097  std::unique_ptr<uint64_t[]> rev_hash;
1098  std::unique_ptr<uint64_t[]> hash_arr;
1099 };
1100 
1101 } // namespace btllib
unsigned get_hash_num() const
Definition: nthash_seed.hpp:1058
uint64_t * get_reverse_hash() const
Definition: nthash_seed.hpp:1085
bool peek(char char_in)
Definition: nthash_seed.hpp:748
uint64_t * get_forward_hash() const
Definition: nthash_seed.hpp:1079
SeedNtHash(const std::string &seq, const std::vector< std::string > &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, size_t pos=0)
Definition: nthash_seed.hpp:563
std::vector< std::vector< unsigned > > parse_seeds(const std::vector< std::string > &seed_strings)
Definition: nthash_seed.hpp:496
uint64_t * get_forward_hash() const
Definition: nthash_seed.hpp:862
BlindSeedNtHash(const char *seq, const std::vector< std::string > &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, long pos=0)
Definition: nthash_seed.hpp:932
long get_pos() const
Definition: nthash_seed.hpp:1052
hashing_internals::K_TYPE get_k() const
Definition: nthash_seed.hpp:1073
void check_warning(bool condition, const std::string &msg)
SeedNtHash(const std::string &seq, const std::vector< std::vector< unsigned >> &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, size_t pos=0)
Definition: nthash_seed.hpp:612
const uint64_t * hashes() const
Definition: nthash_seed.hpp:1045
bool roll_back()
Definition: nthash_seed.hpp:689
void roll(char char_in)
Definition: nthash_seed.hpp:1002
uint64_t * get_reverse_hash() const
Definition: nthash_seed.hpp:868
void check_error(bool condition, const std::string &msg)
bool peek_back()
Definition: nthash_seed.hpp:736
const uint64_t * hashes() const
Definition: nthash_seed.hpp:828
hashing_internals::NUM_HASHES_TYPE get_hash_num_per_seed() const
Definition: nthash_seed.hpp:847
hashing_internals::NUM_HASHES_TYPE get_hash_num_per_seed() const
Definition: nthash_seed.hpp:1064
Definition: nthash_seed.hpp:918
size_t get_pos() const
Definition: nthash_seed.hpp:835
Definition: nthash_seed.hpp:516
unsigned get_hash_num() const
Definition: nthash_seed.hpp:841
bool roll()
Definition: nthash_seed.hpp:658
bool peek()
Definition: nthash_seed.hpp:724
void roll_back(char char_in)
Definition: nthash_seed.hpp:1023
SeedNtHash(const char *seq, size_t seq_len, const std::vector< std::vector< unsigned >> &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, size_t pos=0)
Definition: nthash_seed.hpp:582
SeedNtHash(const char *seq, size_t seq_len, const std::vector< std::string > &seeds, hashing_internals::NUM_HASHES_TYPE num_hashes_per_seed, hashing_internals::K_TYPE k, size_t pos=0)
Definition: nthash_seed.hpp:530
bool peek_back(char char_in)
Definition: nthash_seed.hpp:788
hashing_internals::K_TYPE get_k() const
Definition: nthash_seed.hpp:856