Skip to content

Commit

Permalink
Make a dynamic dispatcher function for nibble2base
Browse files Browse the repository at this point in the history
  • Loading branch information
rhpvorderman committed Apr 2, 2024
1 parent 0840b8d commit 5d0a78e
Showing 1 changed file with 50 additions and 24 deletions.
74 changes: 50 additions & 24 deletions sam_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,9 @@ DEALINGS IN THE SOFTWARE. */

#include <errno.h>
#include <stdint.h>
#include "htslib/hts_defs.h"
#include "htslib/sam.h"
#ifdef __SSSE3__
#include "tmmintrin.h"
#endif

#ifdef __cplusplus
extern "C" {
#endif
Expand Down Expand Up @@ -70,7 +69,7 @@ static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) {
* for (i = 0; i < len; i++)
* seq[i] = seq_nt16_str[bam_seqi(nib, i)];
*/
static inline void nibble2base(uint8_t *nib, char *seq, int len) {
static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
static const char code2base[512] =
"===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N"
"A=AAACAMAGARASAVATAWAYAHAKADABAN"
Expand All @@ -89,21 +88,42 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
"B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN"
"N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN";

int i, len2 = len/2;
seq[0] = 0;

for (i = 0; i < len2; i++)
// Note size_t cast helps gcc optimiser.
memcpy(&seq[i*2], &code2base[(size_t)nib[i]*2], 2);

if ((i *= 2) < len)
seq[i] = seq_nt16_str[bam_seqi(nib, i)];
}

#if HTS_GCC_AT_LEAST(4,8)
/*
* Convert a nibble encoded BAM sequence to a string of bases.
*
* Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be
* unpacked into 32 indexes from 0-15. Using the pshufb instruction these can
* be converted to the IUPAC characters.
* It falls back on the nibble2base_default function for the remainder.
*/
#include "tmmintrin.h"
__attribute__((target("ssse3")))
static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
seq[0] = 0;
const char *seq_end_ptr = seq + len;
char *seq_cursor = seq;
const uint8_t *nibble_cursor = nib;
const char *seq_end_ptr_twoatatime = seq + (len & (~1ULL));
#ifdef __SSSE3__
uint8_t *nibble_cursor = nib;
const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i));
__m128i first_upper_shuffle = _mm_setr_epi8(
0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
__m128i first_lower_shuffle = _mm_setr_epi8(
0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7);
__m128i second_upper_shuffle = _mm_setr_epi8(
8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1);
__m128i second_lower_shuffle = _mm_setr_epi8(
0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15);
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
/* Work on 16 encoded characters at the time resulting in 32 decoded characters
Examples are given for 8 encoded characters A until H to keep it readable.
Expand All @@ -126,36 +146,42 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
__m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
__m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
__m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
__m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
__m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(15));
__m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
_mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);

__m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
__m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
__m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
__m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
__m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
__m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(15));
__m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
_mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);

nibble_cursor += sizeof(__m128i);
seq_cursor += 2 * sizeof(__m128i);
}
#endif
while (seq_cursor < seq_end_ptr_twoatatime) {
// Note size_t cast helps gcc optimiser.
memcpy(seq_cursor, code2base + ((size_t)*nibble_cursor * 2), 2);
seq_cursor += 2;
nibble_cursor += 1;
nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
}
static void (*nibble2base)(uint8_t *nib, char *seq, int len);

static void nibble2base_dispatch(uint8_t *nib, char *seq, int len) {
if (__builtin_cpu_supports("ssse3")) {
nibble2base = nibble2base_ssse3;
}
if (seq_cursor != seq_end_ptr) {
/* There is a single encoded nuc left */
uint8_t nibble_c = *nibble_cursor;
uint8_t upper_nuc_index = nibble_c >> 4;
seq_cursor[0] = seq_nt16_str[upper_nuc_index];
else {
nibble2base = nibble2base_default;
}
nibble2base(nib, seq, len);
}

static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_dispatch;

#else
static inline void nibble2base(uint8_t *nib, char *seq, int len) {
nibble2base_default(nib, seq, len);
}
#endif
#ifdef __cplusplus
}
#endif
Expand Down

0 comments on commit 5d0a78e

Please sign in to comment.