btllib
 All Classes Namespaces Functions Variables
nthash_kmer.hpp
1 #pragma once
2 
3 #include <array>
4 #include <cstdint>
5 #include <cstring>
6 #include <deque>
7 #include <memory>
8 #include <string>
9 #include <vector>
10 
11 #include <btllib/hashing_internals.hpp>
12 #include <btllib/status.hpp>
13 
14 namespace btllib::hashing_internals {
15 
22 inline uint64_t
23 base_forward_hash(const char* seq, unsigned k)
24 {
25  uint64_t h_val = 0;
26  for (unsigned i = 0; i < k - 3; i += 4) {
27  h_val = srol(h_val, 4);
28  uint8_t loc = 0;
29  loc += 64 * CONVERT_TAB[(unsigned char)seq[i]]; // NOLINT
30  loc += 16 * CONVERT_TAB[(unsigned char)seq[i + 1]]; // NOLINT
31  loc += 4 * CONVERT_TAB[(unsigned char)seq[i + 2]];
32  loc += CONVERT_TAB[(unsigned char)seq[i + 3]];
33  h_val ^= TETRAMER_TAB[loc];
34  }
35  const unsigned remainder = k % 4;
36  if (remainder > 0) {
37  h_val = srol(h_val, remainder);
38  }
39  if (remainder == 3) {
40  uint8_t trimer_loc = 0;
41  trimer_loc += 16 * CONVERT_TAB[(unsigned char)seq[k - 3]]; // NOLINT
42  trimer_loc += 4 * CONVERT_TAB[(unsigned char)seq[k - 2]];
43  trimer_loc += CONVERT_TAB[(unsigned char)seq[k - 1]];
44  h_val ^= TRIMER_TAB[trimer_loc];
45  } else if (remainder == 2) {
46  uint8_t dimer_loc = 0;
47  dimer_loc += 4 * CONVERT_TAB[(unsigned char)seq[k - 2]];
48  dimer_loc += CONVERT_TAB[(unsigned char)seq[k - 1]];
49  h_val ^= DIMER_TAB[dimer_loc];
50  } else if (remainder == 1) {
51  h_val ^= SEED_TAB[(unsigned char)seq[k - 1]];
52  }
53  return h_val;
54 }
55 
65 inline uint64_t
66 next_forward_hash(uint64_t fh_val,
67  unsigned k,
68  unsigned char char_out,
69  unsigned char char_in)
70 {
71  uint64_t h_val = srol(fh_val);
72  h_val ^= SEED_TAB[char_in];
73  h_val ^= srol_table(char_out, k);
74  return h_val;
75 }
76 
85 inline uint64_t
86 prev_forward_hash(uint64_t fh_val,
87  unsigned k,
88  unsigned char char_out,
89  unsigned char char_in)
90 {
91  uint64_t h_val = fh_val ^ srol_table(char_in, k);
92  h_val ^= SEED_TAB[char_out];
93  h_val = sror(h_val);
94  return h_val;
95 }
96 
104 inline uint64_t
105 base_reverse_hash(const char* seq, unsigned k)
106 {
107  uint64_t h_val = 0;
108  const unsigned remainder = k % 4;
109  if (remainder == 3) {
110  uint8_t trimer_loc = 0;
111  trimer_loc += 16 * RC_CONVERT_TAB[(unsigned char)seq[k - 1]]; // NOLINT
112  trimer_loc += 4 * RC_CONVERT_TAB[(unsigned char)seq[k - 2]];
113  trimer_loc += RC_CONVERT_TAB[(unsigned char)seq[k - 3]];
114  h_val ^= TRIMER_TAB[trimer_loc];
115  } else if (remainder == 2) {
116  uint8_t dimer_loc = 0;
117  dimer_loc += 4 * RC_CONVERT_TAB[(unsigned char)seq[k - 1]];
118  dimer_loc += RC_CONVERT_TAB[(unsigned char)seq[k - 2]];
119  h_val ^= DIMER_TAB[dimer_loc];
120  } else if (remainder == 1) {
121  h_val ^= SEED_TAB[(unsigned char)seq[k - 1] & CP_OFF];
122  }
123  for (int i = (int)(k - remainder) - 1; i >= 3; i -= 4) {
124  h_val = srol(h_val, 4);
125  uint8_t loc = 0;
126  loc += 64 * RC_CONVERT_TAB[(unsigned char)seq[i]]; // NOLINT
127  loc += 16 * RC_CONVERT_TAB[(unsigned char)seq[i - 1]]; // NOLINT
128  loc += 4 * RC_CONVERT_TAB[(unsigned char)seq[i - 2]];
129  loc += RC_CONVERT_TAB[(unsigned char)seq[i - 3]];
130  h_val ^= TETRAMER_TAB[loc];
131  }
132  return h_val;
133 }
134 
145 inline uint64_t
146 next_reverse_hash(uint64_t rh_val,
147  unsigned k,
148  unsigned char char_out,
149  unsigned char char_in)
150 {
151  uint64_t h_val = rh_val ^ srol_table(char_in & CP_OFF, k);
152  h_val ^= SEED_TAB[char_out & CP_OFF];
153  h_val = sror(h_val);
154  return h_val;
155 }
156 
165 inline uint64_t
166 prev_reverse_hash(uint64_t rh_val,
167  unsigned k,
168  unsigned char char_out,
169  unsigned char char_in)
170 {
171  uint64_t h_val = srol(rh_val);
172  h_val ^= SEED_TAB[char_in & CP_OFF];
173  h_val ^= srol_table(char_out & CP_OFF, k);
174  return h_val;
175 }
176 
191 inline void
192 sub_hash(uint64_t fh_val,
193  uint64_t rh_val,
194  const char* kmer_seq,
195  const std::vector<unsigned>& positions,
196  const std::vector<unsigned char>& new_bases,
197  unsigned k,
198  unsigned m,
199  uint64_t* h_val)
200 {
201  uint64_t b_val = 0;
202 
203  for (size_t i = 0; i < positions.size(); i++) {
204  const auto pos = positions[i];
205  const auto new_base = new_bases[i];
206 
207  fh_val ^= srol_table((unsigned char)kmer_seq[pos], k - 1 - pos);
208  fh_val ^= srol_table(new_base, k - 1 - pos);
209 
210  rh_val ^= srol_table((unsigned char)kmer_seq[pos] & CP_OFF, pos);
211  rh_val ^= srol_table(new_base & CP_OFF, pos);
212  }
213 
214  b_val = canonical(fh_val, rh_val);
215  extend_hashes(b_val, k, m, h_val);
216 }
217 
218 } // namespace btllib::hashing_internals
219 
220 namespace btllib {
221 
222 using hashing_internals::base_forward_hash;
223 using hashing_internals::base_reverse_hash;
224 using hashing_internals::extend_hashes;
225 using hashing_internals::next_forward_hash;
226 using hashing_internals::next_reverse_hash;
227 using hashing_internals::prev_forward_hash;
228 using hashing_internals::prev_reverse_hash;
229 using hashing_internals::SEED_N;
230 using hashing_internals::SEED_TAB;
231 using hashing_internals::sub_hash;
232 
236 class NtHash
237 {
238 
239 public:
248  NtHash(const char* seq,
249  size_t seq_len,
250  hashing_internals::NUM_HASHES_TYPE num_hashes,
251  hashing_internals::K_TYPE k,
252  size_t pos = 0)
253  : seq(seq)
254  , seq_len(seq_len)
255  , num_hashes(num_hashes)
256  , k(k)
257  , pos(pos)
258  , initialized(false)
259  , hash_arr(new uint64_t[num_hashes])
260  {
261  check_error(k == 0, "NtHash: k must be greater than 0");
262  check_error(this->seq_len < k,
263  "NtHash: sequence length (" + std::to_string(this->seq_len) +
264  ") is smaller than k (" + std::to_string(k) + ")");
265  check_error(pos > this->seq_len - k,
266  "NtHash: passed position (" + std::to_string(pos) +
267  ") is larger than sequence length (" +
268  std::to_string(this->seq_len) + ")");
269  }
270 
278  NtHash(const std::string& seq,
279  hashing_internals::NUM_HASHES_TYPE num_hashes,
280  hashing_internals::K_TYPE k,
281  size_t pos = 0)
282  : NtHash(seq.data(), seq.size(), num_hashes, k, pos)
283  {
284  }
285 
286  NtHash(const NtHash& obj)
287  : seq(obj.seq)
288  , seq_len(obj.seq_len)
289  , num_hashes(obj.num_hashes)
290  , k(obj.k)
291  , pos(obj.pos)
292  , initialized(obj.initialized)
293  , fwd_hash(obj.fwd_hash)
294  , rev_hash(obj.rev_hash)
295  , hash_arr(new uint64_t[obj.num_hashes])
296  {
297  std::memcpy(
298  hash_arr.get(), obj.hash_arr.get(), num_hashes * sizeof(uint64_t));
299  }
300 
301  NtHash(NtHash&&) = default;
302 
315  bool roll()
316  {
317  if (!initialized) {
318  return init();
319  }
320  if (pos >= seq_len - k) {
321  return false;
322  }
323  if (hashing_internals::SEED_TAB[(unsigned char)seq[pos + k]] ==
324  hashing_internals::SEED_N) {
325  pos += k;
326  return init();
327  }
328  fwd_hash = next_forward_hash(fwd_hash, k, seq[pos], seq[pos + k]);
329  rev_hash = next_reverse_hash(rev_hash, k, seq[pos], seq[pos + k]);
330  extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
331  ++pos;
332  return true;
333  }
334 
339  bool roll_back()
340  {
341  if (!initialized) {
342  return init();
343  }
344  if (pos == 0) {
345  return false;
346  }
347  if (SEED_TAB[(unsigned char)seq[pos - 1]] == SEED_N && pos >= k) {
348  pos -= k;
349  return init();
350  }
351  if (SEED_TAB[(unsigned char)seq[pos - 1]] == SEED_N) {
352  return false;
353  }
354  fwd_hash = prev_forward_hash(fwd_hash, k, seq[pos + k - 1], seq[pos - 1]);
355  rev_hash = prev_reverse_hash(rev_hash, k, seq[pos + k - 1], seq[pos - 1]);
356  extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
357  --pos;
358  return true;
359  }
360 
367  bool peek()
368  {
369  if (pos >= seq_len - k) {
370  return false;
371  }
372  return peek(seq[pos + k]);
373  }
374 
379  bool peek_back()
380  {
381  if (pos == 0) {
382  return false;
383  }
384  return peek_back(seq[pos - 1]);
385  }
386 
393  bool peek(char char_in)
394  {
395  if (!initialized) {
396  return init();
397  }
398  if (SEED_TAB[(unsigned char)char_in] == SEED_N) {
399  return false;
400  }
401  const uint64_t fwd = next_forward_hash(fwd_hash, k, seq[pos], char_in);
402  const uint64_t rev = next_reverse_hash(rev_hash, k, seq[pos], char_in);
403  extend_hashes(fwd, rev, k, num_hashes, hash_arr.get());
404  return true;
405  }
406 
411  bool peek_back(char char_in)
412  {
413  if (!initialized) {
414  return init();
415  }
416  if (SEED_TAB[(unsigned char)char_in] == SEED_N) {
417  return false;
418  }
419  const unsigned char char_out = seq[pos + k - 1];
420  const uint64_t fwd = prev_forward_hash(fwd_hash, k, char_out, char_in);
421  const uint64_t rev = prev_reverse_hash(rev_hash, k, char_out, char_in);
422  extend_hashes(fwd, rev, k, num_hashes, hash_arr.get());
423  return true;
424  }
425 
426  void sub(const std::vector<unsigned>& positions,
427  const std::vector<unsigned char>& new_bases)
428  {
429  sub_hash(fwd_hash,
430  rev_hash,
431  seq + pos,
432  positions,
433  new_bases,
434  get_k(),
435  get_hash_num(),
436  hash_arr.get());
437  }
438 
443  const uint64_t* hashes() const { return hash_arr.get(); }
444 
450  size_t get_pos() const { return pos; }
451 
456  hashing_internals::NUM_HASHES_TYPE get_hash_num() const { return num_hashes; }
457 
462  hashing_internals::K_TYPE get_k() const { return k; }
463 
468  uint64_t get_forward_hash() const { return fwd_hash; }
469 
474  uint64_t get_reverse_hash() const { return rev_hash; }
475 
476 private:
477  const char* seq;
478  const size_t seq_len;
479  hashing_internals::NUM_HASHES_TYPE num_hashes;
480  hashing_internals::K_TYPE k;
481  size_t pos;
482  bool initialized;
483  uint64_t fwd_hash = 0;
484  uint64_t rev_hash = 0;
485  std::unique_ptr<uint64_t[]> hash_arr;
486 
491  bool init()
492  {
493  bool has_n = true;
494  while (pos <= seq_len - k + 1 && has_n) {
495  has_n = false;
496  for (unsigned i = 0; i < k && pos <= seq_len - k + 1; i++) {
497  if (SEED_TAB[(unsigned char)seq[pos + k - i - 1]] == SEED_N) {
498  pos += k - i;
499  has_n = true;
500  }
501  }
502  }
503  if (pos > seq_len - k) {
504  return false;
505  }
506  fwd_hash = base_forward_hash(seq + pos, k);
507  rev_hash = base_reverse_hash(seq + pos, k);
508  extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
509  initialized = true;
510  return true;
511  }
512 };
513 
521 {
522 
523 public:
532  BlindNtHash(const std::string& seq,
533  hashing_internals::NUM_HASHES_TYPE num_hashes,
534  hashing_internals::K_TYPE k,
535  long pos = 0)
536  : seq(seq.data() + pos, seq.data() + pos + k)
537  , num_hashes(num_hashes)
538  , pos(pos)
539  , fwd_hash(base_forward_hash(seq.data(), k))
540  , rev_hash(base_reverse_hash(seq.data(), k))
541  , hash_arr(new uint64_t[num_hashes])
542  {
543  check_error(k == 0, "BlindNtHash: k must be greater than 0");
544  extend_hashes(fwd_hash, rev_hash, k, num_hashes, hash_arr.get());
545  }
546 
547  BlindNtHash(const BlindNtHash& obj)
548  : seq(obj.seq)
549  , num_hashes(obj.num_hashes)
550  , pos(obj.pos)
551  , fwd_hash(obj.fwd_hash)
552  , rev_hash(obj.rev_hash)
553  , hash_arr(new uint64_t[obj.num_hashes])
554  {
555  std::memcpy(
556  hash_arr.get(), obj.hash_arr.get(), num_hashes * sizeof(uint64_t));
557  }
558 
559  BlindNtHash(BlindNtHash&&) = default;
560 
567  void roll(char char_in)
568  {
569  fwd_hash = next_forward_hash(fwd_hash, seq.size(), seq.front(), char_in);
570  rev_hash = next_reverse_hash(rev_hash, seq.size(), seq.front(), char_in);
571  extend_hashes(fwd_hash, rev_hash, seq.size(), num_hashes, hash_arr.get());
572  seq.pop_front();
573  seq.push_back(char_in);
574  ++pos;
575  }
576 
580  void roll_back(char char_in)
581  {
582  fwd_hash = prev_forward_hash(fwd_hash, seq.size(), seq.back(), char_in);
583  rev_hash = prev_reverse_hash(rev_hash, seq.size(), seq.back(), char_in);
584  extend_hashes(fwd_hash, rev_hash, seq.size(), num_hashes, hash_arr.get());
585  seq.pop_back();
586  seq.push_front(char_in);
587  --pos;
588  }
589 
593  void peek(char char_in)
594  {
595  const hashing_internals::K_TYPE k = seq.size();
596  const uint64_t fwd = next_forward_hash(fwd_hash, k, seq.front(), char_in);
597  const uint64_t rev = next_reverse_hash(rev_hash, k, seq.front(), char_in);
598  extend_hashes(fwd, rev, seq.size(), num_hashes, hash_arr.get());
599  }
600 
604  void peek_back(char char_in)
605  {
606  const hashing_internals::K_TYPE k = seq.size();
607  const uint64_t fwd = prev_forward_hash(fwd_hash, k, seq.back(), char_in);
608  const uint64_t rev = prev_reverse_hash(rev_hash, k, seq.back(), char_in);
609  extend_hashes(fwd, rev, seq.size(), num_hashes, hash_arr.get());
610  }
611 
616  const uint64_t* hashes() const { return hash_arr.get(); }
617 
623  long get_pos() const { return pos; }
624 
629  hashing_internals::NUM_HASHES_TYPE get_hash_num() const { return num_hashes; }
630 
635  hashing_internals::K_TYPE get_k() const { return seq.size(); }
636 
641  uint64_t get_forward_hash() const { return fwd_hash; }
642 
647  uint64_t get_reverse_hash() const { return rev_hash; }
648 
649 private:
650  std::deque<char> seq;
651  hashing_internals::NUM_HASHES_TYPE num_hashes;
652  long pos;
653  uint64_t fwd_hash;
654  uint64_t rev_hash;
655  std::unique_ptr<uint64_t[]> hash_arr;
656 };
657 
658 } // namespace btllib
void peek_back(char char_in)
Definition: nthash_kmer.hpp:604
bool peek()
Definition: nthash_kmer.hpp:367
uint64_t get_reverse_hash() const
Definition: nthash_kmer.hpp:474
bool peek(char char_in)
Definition: nthash_kmer.hpp:393
long get_pos() const
Definition: nthash_kmer.hpp:623
bool peek_back(char char_in)
Definition: nthash_kmer.hpp:411
NtHash(const char *seq, size_t seq_len, hashing_internals::NUM_HASHES_TYPE num_hashes, hashing_internals::K_TYPE k, size_t pos=0)
Definition: nthash_kmer.hpp:248
const uint64_t * hashes() const
Definition: nthash_kmer.hpp:443
uint64_t get_forward_hash() const
Definition: nthash_kmer.hpp:468
Definition: nthash_kmer.hpp:236
size_t get_pos() const
Definition: nthash_kmer.hpp:450
hashing_internals::K_TYPE get_k() const
Definition: nthash_kmer.hpp:462
BlindNtHash(const std::string &seq, hashing_internals::NUM_HASHES_TYPE num_hashes, hashing_internals::K_TYPE k, long pos=0)
Definition: nthash_kmer.hpp:532
uint64_t get_forward_hash() const
Definition: nthash_kmer.hpp:641
Definition: nthash_kmer.hpp:520
void check_error(bool condition, const std::string &msg)
bool peek_back()
Definition: nthash_kmer.hpp:379
hashing_internals::K_TYPE get_k() const
Definition: nthash_kmer.hpp:635
bool roll()
Definition: nthash_kmer.hpp:315
NtHash(const std::string &seq, hashing_internals::NUM_HASHES_TYPE num_hashes, hashing_internals::K_TYPE k, size_t pos=0)
Definition: nthash_kmer.hpp:278
bool roll_back()
Definition: nthash_kmer.hpp:339
void roll(char char_in)
Definition: nthash_kmer.hpp:567
const uint64_t * hashes() const
Definition: nthash_kmer.hpp:616
void peek(char char_in)
Definition: nthash_kmer.hpp:593
void roll_back(char char_in)
Definition: nthash_kmer.hpp:580
hashing_internals::NUM_HASHES_TYPE get_hash_num() const
Definition: nthash_kmer.hpp:629
hashing_internals::NUM_HASHES_TYPE get_hash_num() const
Definition: nthash_kmer.hpp:456
uint64_t get_reverse_hash() const
Definition: nthash_kmer.hpp:647