From 19a27e959e3110ca32d7c4f3106775ccf2cf11cc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 9 Jul 2024 09:45:31 +0100 Subject: [PATCH] Speed up kputll. The kputuw function is considerably faster as it encodes 2 digits at a time and also utilises __builtin_clz. This changes kputll to use the same 2 digits at a time trick. I have a __builtin_clzll variant too, but with longer numbers it's not the main bottleneck and we fall back to kputuw for small numbers. This avoids complicating the code with builtin checks and alternate versions. An alternative, purely for sam_format1_append would be something like: static inline int kputll_fast(long long c, kstring_t *s) { return c <= INT_MAX && c >= INT_MIN ? kputw(c, s) : kputll(c, s); } #define kputll kputll_fast This works as BAM/CRAM only support 32-bit numbers for POS, PNEXT and TLEN anyway, so ll vs w is an irrelevant distinction. However I chose to modify the header file so it fixes other callers. Overall compressed BAM to uncompressed SAM conversion is about 5% quicker (tested on 10 million short-read seqs; it'll be minimal on long seqs). This includes decode time and other functions too. The sam_format1_append only component of that is about 15-25% quicker depending on compiler and version. --- htslib/kstring.h | 68 +++++++++++++++++++++++++++++++------ test/test_kstring.c | 81 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 11 deletions(-) diff --git a/htslib/kstring.h b/htslib/kstring.h index 53a19806d..0a3efb7d2 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -375,17 +375,63 @@ static inline int kputw(int c, kstring_t *s) static inline int kputll(long long c, kstring_t *s) { - char buf[32]; - int i, l = 0; - unsigned long long x = c; - if (c < 0) x = -x; - do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); - if (c < 0) buf[l++] = '-'; - if (ks_resize(s, s->l + l + 2) < 0) - return EOF; - for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; - s->s[s->l] = 0; - return 0; + // Worst case expansion. One check reduces function size + // and aids inlining chance. Memory overhead is minimal. + if (ks_resize(s, s->l + 23) < 0) + return EOF; + + unsigned long long x = c; + if (c < 0) { + x = -x; + s->s[s->l++] = '-'; + } + + if (x <= UINT32_MAX) + return kputuw(x, s); + + static const char kputull_dig2r[] = + "00010203040506070809" + "10111213141516171819" + "20212223242526272829" + "30313233343536373839" + "40414243444546474849" + "50515253545556575859" + "60616263646566676869" + "70717273747576777879" + "80818283848586878889" + "90919293949596979899"; + unsigned int l, j; + char *cp; + + // Find out how long the number is (could consider clzll) + uint64_t m = 1; + l = 0; + if (sizeof(long long)==sizeof(uint64_t) && x >= 10000000000000000000ULL) { + // avoids overflow below + l = 20; + } else { + do { + l++; + m *= 10; + } while (x >= m); + } + + // Add digits two at a time + j = l; + cp = s->s + s->l; + while (x >= 10) { + const char *d = &kputull_dig2r[2*(x%100)]; + x /= 100; + memcpy(&cp[j-=2], d, 2); + } + + // Last one (if necessary). We know that x < 10 by now. + if (j == 1) + cp[0] = x + '0'; + + s->l += l; + s->s[s->l] = 0; + return 0; } static inline int kputl(long c, kstring_t *s) { diff --git a/test/test_kstring.c b/test/test_kstring.c index feb8243df..f942656f1 100644 --- a/test/test_kstring.c +++ b/test/test_kstring.c @@ -261,6 +261,84 @@ static int test_kputw(int64_t start, int64_t end) { return 0; } +static int test_kputll_from_to(kstring_t *str, long long s, long long e) { + long long i = s; + + for (;;) { + str->l = 0; + memset(str->s, 0xff, str->m); + if (kputll(i, str) < 0 || !str->s) { + perror("kputll"); + return -1; + } + if (str->l >= str->m || str->s[str->l] != '\0') { + fprintf(stderr, "No NUL termination on string from kputll\n"); + return -1; + } + if (i != strtoll(str->s, NULL, 10)) { + fprintf(stderr, + "kputll wrote the wrong value, expected %lld, got %s\n", + i, str->s); + return -1; + } + if (i >= e) break; + i++; + } + return 0; +} + +static int test_kputll(long long start, long long end) { + kstring_t str = { 0, 0, NULL }; + unsigned long long val; + + str.s = malloc(2); + if (!str.s) { + perror("malloc"); + return -1; + } + str.m = 2; + + for (val = 1; val < INT64_MAX-5; val *= 10) { + if (test_kputll_from_to(&str, val >= 5 ? val - 5 : val, val) < 0) { + free(ks_release(&str)); + return -1; + } + } + + for (val = 1; val < INT64_MAX-5; val *= 10) { + long long valm = -val; + if (test_kputll_from_to(&str, valm >= 5 ? valm - 5 : valm, valm) < 0) { + free(ks_release(&str)); + return -1; + } + } + + if (test_kputll_from_to(&str, INT64_MAX - 5, INT64_MAX) < 0) { + free(ks_release(&str)); + return -1; + } + + if (test_kputll_from_to(&str, INT64_MIN, INT64_MIN + 5) < 0) { + free(ks_release(&str)); + return -1; + } + + str.m = 1; // Force a resize + int64_t start2 = (int64_t)start; // no larger on our platforms + int64_t end2 = (int64_t)end; + clamp(&start2, INT64_MIN, INT64_MAX); + clamp(&end2, INT64_MIN, INT64_MAX); + + if (test_kputll_from_to(&str, start, end) < 0) { + free(ks_release(&str)); + return -1; + } + + free(ks_release(&str)); + + return 0; +} + // callback used by test_kgetline static char *mock_fgets(char *str, int num, void *p) { int *mock_state = (int*)p; @@ -413,6 +491,9 @@ int main(int argc, char **argv) { if (!test || strcmp(test, "kputw") == 0) if (test_kputw(start, end) != 0) res = EXIT_FAILURE; + if (!test || strcmp(test, "kputll") == 0) + if (test_kputll(start, end) != 0) res = EXIT_FAILURE; + if (!test || strcmp(test, "kgetline") == 0) if (test_kgetline() != 0) res = EXIT_FAILURE;