Skip to content

Commit

Permalink
Enable SSSE3 intrinsics for nibble parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
rhpvorderman committed Sep 26, 2023
1 parent 7994291 commit d739ecd
Showing 1 changed file with 66 additions and 8 deletions.
74 changes: 66 additions & 8 deletions sam_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ DEALINGS IN THE SOFTWARE. */
#include <errno.h>
#include <stdint.h>
#include "htslib/sam.h"

#ifdef __SSSE3__
#include "tmmintrin.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
Expand Down Expand Up @@ -87,15 +89,71 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
"B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN"
"N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN";

int i, len2 = len/2;
seq[0] = 0;

for (i = 0; i < len2; i++)
const char *seq_end_ptr = seq + len;
char *seq_cursor = seq;
const uint8_t *nibble_cursor = nib;
const char *seq_end_ptr_twoatatime = seq + (len & (~1ULL));
#ifdef __SSSE3__
const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i));
__m128i first_upper_shuffle = _mm_setr_epi8(
0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
__m128i first_lower_shuffle = _mm_setr_epi8(
0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
__m128i second_upper_shuffle = _mm_setr_epi8(
8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
__m128i second_lower_shuffle = _mm_setr_epi8(
0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
/* Work on 16 encoded characters at the time resulting in 32 decoded characters
Examples are given for 8 encoded characters A until H to keep it readable.
Encoded stored as |AB|CD|EF|GH|
Shuffle into |AB|00|CD|00|EF|00|GH|00| and
|00|AB|00|CD|00|EF|00|GH|
Shift upper to the right resulting into
|0A|B0|0C|D0|0E|F0|0G|H0| and
|00|AB|00|CD|00|EF|00|GH|
Merge with or resulting into (X stands for garbage)
|0A|XB|0C|XD|0E|XF|0G|XH|
Bitwise and with 0b1111 leads to:
|0A|0B|0C|0D|0E|0F|0G|0H|
We can use the resulting 4-bit integers as indexes for the shuffle of
the nucleotide lookup. */
while (seq_cursor < seq_vec_end_ptr) {
__m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);

__m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
__m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
__m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
__m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
__m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
__m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
_mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);

__m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
__m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
__m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
__m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
__m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
__m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
_mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);

nibble_cursor += sizeof(__m128i);
seq_cursor += 2 * sizeof(__m128i);
}
#endif
while (seq_cursor < seq_end_ptr_twoatatime) {
// Note size_t cast helps gcc optimiser.
memcpy(&seq[i*2], &code2base[(size_t)nib[i]*2], 2);

if ((i *= 2) < len)
seq[i] = seq_nt16_str[bam_seqi(nib, i)];
memcpy(seq_cursor, code2base + ((size_t)*nibble_cursor * 2), 2);
seq_cursor += 2;
nibble_cursor += 1;
}
if (seq_cursor != seq_end_ptr) {
/* There is a single encoded nuc left */
uint8_t nibble_c = *nibble_cursor;
uint8_t upper_nuc_index = nibble_c >> 4;
seq_cursor[0] = seq_nt16_str[upper_nuc_index];
}
}

#ifdef __cplusplus
Expand Down

0 comments on commit d739ecd

Please sign in to comment.