8#ifndef BTLLIB_NTHASH_LOWLEVEL_HPP
9#define BTLLIB_NTHASH_LOWLEVEL_HPP
11#include "btllib/nthash_consts.hpp"
25canonical(
const T fwd,
const T rev)
30static_assert(std::numeric_limits<unsigned>::max() + 1 == 0,
31 "Integers don't overflow on this platform which is necessary for "
32 "ntHash canonical hash computation.");
37using SpacedSeed = std::vector<unsigned>;
40using SpacedSeedBlocks = std::vector<std::array<unsigned, 2>>;
43using SpacedSeedMonomers = std::vector<unsigned>;
56 uint64_t m = ((x & 0x8000000000000000ULL) >> 30) |
57 ((x & 0x100000000ULL) >> 32);
58 return ((x << 1) & 0xFFFFFFFDFFFFFFFFULL) | m;
71srol(
const uint64_t x,
const unsigned d)
73 uint64_t v = (x << d) | (x >> (64 - d));
74 uint64_t y = (v ^ (v >> 33)) &
75 (std::numeric_limits<uint64_t>::max() >> (64 - d));
76 return v ^ (y | (y << 33));
90 uint64_t m = ((x & 0x200000000ULL) << 30) | ((x & 1ULL) << 32);
91 return ((x >> 1) & 0xFFFFFFFEFFFFFFFFULL) | m;
103ntf64(
const char* kmer_seq,
unsigned k)
106 for (
unsigned i = 0; i < k / 4; i++) {
107 h_val =
srol(h_val, 4);
108 const uint8_t curr_offset = 4 * i;
109 const uint8_t tetramer_loc =
110 64 * CONVERT_TAB[(
unsigned char)kmer_seq[curr_offset]] +
111 16 * CONVERT_TAB[(
unsigned char)kmer_seq[curr_offset + 1]] +
112 4 * CONVERT_TAB[(
unsigned char)kmer_seq[curr_offset + 2]] +
113 CONVERT_TAB[(
unsigned char)kmer_seq[curr_offset + 3]];
114 h_val ^= TETRAMER_TAB[tetramer_loc];
116 const unsigned remainder = k % 4;
117 h_val =
srol(h_val, remainder);
118 if (remainder == 3) {
119 const uint8_t trimer_loc =
120 16 * CONVERT_TAB[(
unsigned char)kmer_seq[k - 3]] +
121 4 * CONVERT_TAB[(
unsigned char)kmer_seq[k - 2]] +
122 CONVERT_TAB[(
unsigned char)kmer_seq[k - 1]];
123 h_val ^= TRIMER_TAB[trimer_loc];
124 }
else if (remainder == 2) {
125 const uint8_t dimer_loc = 4 * CONVERT_TAB[(
unsigned char)kmer_seq[k - 2]] +
126 CONVERT_TAB[(
unsigned char)kmer_seq[k - 1]];
127 h_val ^= DIMER_TAB[dimer_loc];
128 }
else if (remainder == 1) {
129 h_val ^= SEED_TAB[(
unsigned char)kmer_seq[k - 1]];
144ntr64(
const char* kmer_seq,
unsigned k)
147 const unsigned remainder = k % 4;
148 if (remainder == 3) {
149 const uint8_t trimer_loc =
150 16 * RC_CONVERT_TAB[(
unsigned char)kmer_seq[k - 1]] +
151 4 * RC_CONVERT_TAB[(
unsigned char)kmer_seq[k - 2]] +
152 RC_CONVERT_TAB[(
unsigned char)kmer_seq[k - 3]];
153 h_val ^= TRIMER_TAB[trimer_loc];
154 }
else if (remainder == 2) {
155 const uint8_t dimer_loc =
156 4 * RC_CONVERT_TAB[(
unsigned char)kmer_seq[k - 1]] +
157 RC_CONVERT_TAB[(
unsigned char)kmer_seq[k - 2]];
158 h_val ^= DIMER_TAB[dimer_loc];
159 }
else if (remainder == 1) {
160 h_val ^= SEED_TAB[(
unsigned char)kmer_seq[k - 1] & CP_OFF];
162 for (
unsigned i = 0; i < k / 4; i++) {
163 h_val =
srol(h_val, 4);
164 const uint8_t curr_offset = 4 * (k / 4 - i) - 1;
165 const uint8_t tetramer_loc =
166 64 * RC_CONVERT_TAB[(
unsigned char)kmer_seq[curr_offset]] +
167 16 * RC_CONVERT_TAB[(
unsigned char)kmer_seq[curr_offset - 1]] +
168 4 * RC_CONVERT_TAB[(
unsigned char)kmer_seq[curr_offset - 2]] +
169 RC_CONVERT_TAB[(
unsigned char)kmer_seq[curr_offset - 3]];
170 h_val ^= TETRAMER_TAB[tetramer_loc];
189 unsigned char char_out,
190 unsigned char char_in)
192 uint64_t h_val =
srol(fh_val);
193 h_val ^= SEED_TAB[char_in];
194 h_val ^= MS_TAB(char_out, k);
213 unsigned char char_out,
214 unsigned char char_in)
216 uint64_t h_val = rh_val ^ MS_TAB(char_in & CP_OFF, k);
217 h_val ^= SEED_TAB[char_out & CP_OFF];
231ntc64(
const char* kmer_seq,
unsigned k)
233 uint64_t fh_val = 0, rh_val = 0;
234 fh_val =
ntf64(kmer_seq, k);
235 rh_val =
ntr64(kmer_seq, k);
236 return canonical(fh_val, rh_val);
251ntc64(
const char* kmer_seq,
unsigned k, uint64_t& fh_val, uint64_t& rh_val)
253 fh_val =
ntf64(kmer_seq, k);
254 rh_val =
ntr64(kmer_seq, k);
255 return canonical(fh_val, rh_val);
271 unsigned char char_in,
276 fh_val =
ntf64(fh_val, k, char_out, char_in);
277 rh_val =
ntr64(rh_val, k, char_out, char_in);
278 return canonical(fh_val, rh_val);
294 unsigned char char_out,
295 unsigned char char_in)
297 uint64_t h_val = rh_val ^ MS_TAB(char_in, k);
298 h_val ^= SEED_TAB[char_out];
316 unsigned char char_out,
317 unsigned char char_in)
319 uint64_t h_val =
srol(fh_val);
320 h_val ^= SEED_TAB[char_in & CP_OFF];
321 h_val ^= MS_TAB(char_out & CP_OFF, k);
339 unsigned char char_in,
344 fh_val =
ntf64l(fh_val, k, char_out, char_in);
345 rh_val =
ntr64l(rh_val, k, char_out, char_in);
346 return canonical(fh_val, rh_val);
358nte64(uint64_t bh_val,
unsigned k,
unsigned h, uint64_t* h_val)
362 for (
unsigned i = 1; i < h; i++) {
363 t_val = bh_val * (i ^ k * MULTISEED);
364 t_val ^= t_val >> MULTISHIFT;
378ntmc64(
const char* kmer_seq,
unsigned k,
unsigned m, uint64_t* h_val)
380 const uint64_t b_val =
ntc64(kmer_seq, k);
381 nte64(b_val, k, m, h_val);
396ntmc64(
const char* kmer_seq,
403 const uint64_t b_val =
ntc64(kmer_seq, k, fh_val, rh_val);
404 nte64(b_val, k, m, h_val);
419ntmc64(
unsigned char char_out,
420 unsigned char char_in,
427 const uint64_t b_val =
ntc64(char_out, char_in, k, fh_val, rh_val);
428 nte64(b_val, k, m, h_val);
443ntmc64l(
unsigned char char_out,
444 unsigned char char_in,
451 const uint64_t b_val =
ntc64l(char_out, char_in, k, fh_val, rh_val);
452 nte64(b_val, k, m, h_val);
468ntc64(
const char* kmer_seq,
unsigned k, uint64_t& h_val,
unsigned& loc_n)
472 uint64_t fh_val = 0, rh_val = 0;
473 for (
int i =
int(k - 1); i >= 0; i--) {
474 if (SEED_TAB[(
unsigned char)kmer_seq[i]] == SEED_N) {
478 fh_val =
srol(fh_val);
479 fh_val ^= SEED_TAB[(
unsigned char)kmer_seq[k - 1 - i]];
481 rh_val =
srol(rh_val);
482 rh_val ^= SEED_TAB[(
unsigned char)kmer_seq[i] & CP_OFF];
484 h_val = canonical(fh_val, rh_val);
502ntmc64(
const char* kmer_seq,
508 uint64_t b_val = 0, fh_val = 0, rh_val = 0;
510 for (
int i =
int(k - 1); i >= 0; i--) {
511 if (SEED_TAB[(
unsigned char)kmer_seq[i]] == SEED_N) {
515 fh_val =
srol(fh_val);
516 fh_val ^= SEED_TAB[(
unsigned char)kmer_seq[k - 1 - i]];
518 rh_val =
srol(rh_val);
519 rh_val ^= SEED_TAB[(
unsigned char)kmer_seq[i] & CP_OFF];
521 b_val = canonical(fh_val, rh_val);
522 nte64(b_val, k, m, h_val);
548 h_val = fh_val = rh_val = 0;
550 for (
int i =
int(k - 1); i >= 0; i--) {
551 if (SEED_TAB[(
unsigned char)kmer_seq[i]] == SEED_N) {
555 fh_val =
srol(fh_val);
556 fh_val ^= SEED_TAB[(
unsigned char)kmer_seq[k - 1 - i]];
558 rh_val =
srol(rh_val);
559 rh_val ^= SEED_TAB[(
unsigned char)kmer_seq[i] & CP_OFF];
561 h_val = canonical(fh_val, rh_val);
581ntmc64(
const char* kmer_seq,
592 for (
int i =
int(k - 1); i >= 0; i--) {
593 if (SEED_TAB[(
unsigned char)kmer_seq[i]] == SEED_N) {
597 fh_val =
srol(fh_val);
598 fh_val ^= SEED_TAB[(
unsigned char)kmer_seq[k - 1 - i]];
600 rh_val =
srol(rh_val);
601 rh_val ^= SEED_TAB[(
unsigned char)kmer_seq[i] & CP_OFF];
603 b_val = canonical(fh_val, rh_val);
604 nte64(b_val, k, m, h_val);
626ntmc64(
const char* kmer_seq,
638 for (
int i =
int(k - 1); i >= 0; i--) {
639 if (SEED_TAB[(
unsigned char)kmer_seq[i]] == SEED_N) {
643 fh_val =
srol(fh_val);
644 fh_val ^= SEED_TAB[(
unsigned char)kmer_seq[k - 1 - i]];
646 rh_val =
srol(rh_val);
647 rh_val ^= SEED_TAB[(
unsigned char)kmer_seq[i] & CP_OFF];
649 h_stn = rh_val < fh_val;
650 b_val = canonical(fh_val, rh_val);
651 nte64(b_val, k, m, h_val);
669ntmc64(
unsigned char char_out,
670 unsigned char char_in,
678 const uint64_t b_val =
ntc64(char_out, char_in, k, fh_val, rh_val);
679 h_stn = rh_val < fh_val;
680 nte64(b_val, k, m, h_val);
699 const char* seed_seq,
700 const char* kmer_seq,
703 uint64_t fs_val = fk_val, rs_val = rk_val;
704 for (
unsigned i = 0; i < k; i++) {
705 if (seed_seq[i] !=
'1') {
706 fs_val ^= MS_TAB((
unsigned char)kmer_seq[i], k - 1 - i);
707 rs_val ^= MS_TAB((
unsigned char)kmer_seq[i] & CP_OFF, i);
710 return canonical(fs_val, rs_val);
730 const char* kmer_seq,
731 const std::vector<unsigned>& positions,
732 const std::vector<unsigned char>& new_bases,
739 for (
size_t i = 0; i < positions.size(); i++) {
740 const auto pos = positions[i];
741 const auto new_base = new_bases[i];
743 fh_val ^= MS_TAB((
unsigned char)kmer_seq[pos], k - 1 - pos);
744 fh_val ^= MS_TAB(new_base, k - 1 - pos);
746 rh_val ^= MS_TAB((
unsigned char)kmer_seq[pos] & CP_OFF, pos);
747 rh_val ^= MS_TAB(new_base & CP_OFF, pos);
750 b_val = canonical(fh_val, rh_val);
751 nte64(b_val, k, m, h_val);
779ntmsm64(
const char* kmer_seq,
780 const std::vector<SpacedSeedBlocks>& seeds_blocks,
781 const std::vector<SpacedSeedMonomers>& seeds_monomers,
785 uint64_t* fh_nomonos,
786 uint64_t* rh_nomonos,
793 uint64_t fh_seed, rh_seed;
794 for (
unsigned i_seed = 0; i_seed < m; i_seed++) {
798 for (
const auto& block : seeds_blocks[i_seed]) {
799 for (
unsigned pos = block[0]; pos < block[1]; pos++) {
800 if (kmer_seq[pos] == SEED_N) {
804 fh_seed ^= MS_TAB((
unsigned char)kmer_seq[pos], k - 1 - pos);
805 rh_seed ^= MS_TAB((
unsigned char)kmer_seq[pos] & CP_OFF, pos);
808 fh_nomonos[i_seed] = fh_seed;
809 rh_nomonos[i_seed] = rh_seed;
810 for (
const unsigned pos : seeds_monomers[i_seed]) {
811 fh_seed ^= MS_TAB((
unsigned char)kmer_seq[pos], k - 1 - pos);
812 rh_seed ^= MS_TAB((
unsigned char)kmer_seq[pos] & CP_OFF, pos);
814 fh_val[i_seed] = fh_seed;
815 rh_val[i_seed] = rh_seed;
816 i_base = i_seed * m2;
817 h_val[i_base] = canonical(fh_seed, rh_seed);
818 for (
unsigned i_hash = 1; i_hash < m2; i_hash++) {
819 h_val[i_base + i_hash] = h_val[i_base] * (i_hash ^ k * MULTISEED);
820 h_val[i_base + i_hash] ^= h_val[i_base + i_hash] >> MULTISHIFT;
826#define NTMSM64(ROL_HANDLING, IN_HANDLING, OUT_HANDLING, ROR_HANDLING) \
827 unsigned char char_out, char_in; \
828 uint64_t fh_seed, rh_seed; \
829 unsigned i_out, i_in, i_base; \
830 for (unsigned i_seed = 0; i_seed < m; i_seed++) { \
832 for (const auto& block : seeds_blocks[i_seed]) \
836 fh_seed ^= MS_TAB(char_out, k - i_out); \
837 fh_seed ^= MS_TAB(char_in, k - i_in); \
838 rh_seed ^= MS_TAB(char_out & CP_OFF, i_out); \
839 rh_seed ^= MS_TAB(char_in & CP_OFF, i_in); \
842 fh_nomonos[i_seed] = fh_seed; \
843 rh_nomonos[i_seed] = rh_seed; \
844 for (const auto& pos : seeds_monomers[i_seed]) { \
845 fh_seed ^= MS_TAB((unsigned char)kmer_seq[pos + 1], k - 1 - pos); \
846 rh_seed ^= MS_TAB((unsigned char)kmer_seq[pos + 1] & CP_OFF, pos); \
848 fh_val[i_seed] = fh_seed; \
849 rh_val[i_seed] = rh_seed; \
850 i_base = i_seed * m2; \
851 h_val[i_base] = canonical(fh_seed, rh_seed); \
852 for (unsigned i_hash = 1; i_hash < m2; i_hash++) { \
853 h_val[i_base + i_hash] = h_val[i_base] * (i_hash ^ k * MULTISEED); \
854 h_val[i_base + i_hash] ^= h_val[i_base + i_hash] >> MULTISHIFT; \
880ntmsm64(
const char* kmer_seq,
881 const std::vector<SpacedSeedBlocks>& seeds_blocks,
882 const std::vector<SpacedSeedMonomers>& seeds_monomers,
886 uint64_t* fh_nomonos,
887 uint64_t* rh_nomonos,
892 NTMSM64(fh_seed =
srol(fh_nomonos[i_seed]); rh_seed = rh_nomonos[i_seed];
894 char_in = (
unsigned char)kmer_seq[i_in];
896 char_out = (
unsigned char)kmer_seq[i_out];
897 , rh_seed =
sror(rh_seed);)
922ntmsm64l(
const char* kmer_seq,
923 const std::vector<SpacedSeedBlocks>& seeds_blocks,
924 const std::vector<SpacedSeedMonomers>& seeds_monomers,
928 uint64_t* fh_nomonos,
929 uint64_t* rh_nomonos,
934 NTMSM64(fh_seed = fh_nomonos[i_seed]; rh_seed =
srol(rh_nomonos[i_seed]);
936 char_in = (
unsigned char)kmer_seq[i_in];
938 char_out = (
unsigned char)kmer_seq[i_out];
939 , fh_seed =
sror(fh_seed);)
964ntmsm64(
const char* kmer_seq,
966 const std::vector<SpacedSeedBlocks>& seeds_blocks,
967 const std::vector<SpacedSeedMonomers>& seeds_monomers,
971 uint64_t* fh_nomonos,
972 uint64_t* rh_nomonos,
978 fh_seed =
srol(fh_nomonos[i_seed]); rh_seed = rh_nomonos[i_seed];
980 if (i_in > k - 1) { char_in = in; }
else {
981 char_in = (
unsigned char)kmer_seq[i_in];
984 char_out = (
unsigned char)kmer_seq[i_out];
985 , rh_seed =
sror(rh_seed);)
1010ntmsm64l(
const char* kmer_seq,
1012 const std::vector<SpacedSeedBlocks>& seeds_blocks,
1013 const std::vector<SpacedSeedMonomers>& seeds_monomers,
1017 uint64_t* fh_nomonos,
1018 uint64_t* rh_nomonos,
1024 fh_seed = fh_nomonos[i_seed]; rh_seed =
srol(rh_nomonos[i_seed]);
1026 if (i_in > k - 1) { char_in = in; }
else {
1027 char_in = (
unsigned char)kmer_seq[i_in];
1030 char_out = (
unsigned char)kmer_seq[i_out];
1031 , fh_seed =
sror(fh_seed);)
uint64_t ntf64l(uint64_t rh_val, unsigned k, unsigned char char_out, unsigned char char_in)
Definition nthash_lowlevel.hpp:292
void sub_hash(uint64_t fh_val, uint64_t rh_val, const char *kmer_seq, const std::vector< unsigned > &positions, const std::vector< unsigned char > &new_bases, unsigned k, unsigned m, uint64_t *h_val)
Definition nthash_lowlevel.hpp:728
uint64_t ntr64l(uint64_t fh_val, unsigned k, unsigned char char_out, unsigned char char_in)
Definition nthash_lowlevel.hpp:314
uint64_t srol(const uint64_t x)
Definition nthash_lowlevel.hpp:54
uint64_t mask_hash(uint64_t &fk_val, uint64_t &rk_val, const char *seed_seq, const char *kmer_seq, unsigned k)
Definition nthash_lowlevel.hpp:697
uint64_t ntc64l(unsigned char char_out, unsigned char char_in, unsigned k, uint64_t &fh_val, uint64_t &rh_val)
Definition nthash_lowlevel.hpp:338
uint64_t sror(const uint64_t x)
Definition nthash_lowlevel.hpp:88
uint64_t ntc64(const char *kmer_seq, unsigned k)
Definition nthash_lowlevel.hpp:231
uint64_t ntr64(const char *kmer_seq, unsigned k)
Definition nthash_lowlevel.hpp:144
void nte64(uint64_t bh_val, unsigned k, unsigned h, uint64_t *h_val)
Definition nthash_lowlevel.hpp:358
uint64_t ntf64(const char *kmer_seq, unsigned k)
Definition nthash_lowlevel.hpp:103