From f32bfe95c6212bd1c9212da8778299a92e603714 Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Mon, 1 Apr 2024 22:22:43 +1100 Subject: [PATCH] std/crc64: optimize for x86+SSE4.2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit name old speed new speed delta wuffs_crc64_ecma_10k/clang14 1.78GB/s ± 0% 5.79GB/s ± 0% +225.48% (p=0.008 n=5+5) wuffs_crc64_ecma_100k/clang14 1.78GB/s ± 0% 6.09GB/s ± 0% +241.67% (p=0.008 n=5+5) wuffs_crc64_ecma_10k/gcc12 2.05GB/s ± 1% 5.79GB/s ± 1% +182.80% (p=0.008 n=5+5) wuffs_crc64_ecma_100k/gcc12 2.01GB/s ± 0% 6.09GB/s ± 0% +202.64% (p=0.008 n=5+5) wuffs_xz_decode_100k/clang14 57.9MB/s ± 1% 60.2MB/s ± 0% +4.11% (p=0.008 n=5+5) wuffs_xz_decode_100k/gcc12 57.0MB/s ± 0% 57.8MB/s ± 1% +1.39% (p=0.008 n=5+5) $ time example-mzcat < linux-6.8.2.tar.xz > /dev/null Before: user 0m8.122s After: user 0m8.115s --- internal/cgen/builtin.go | 2 +- release/c/wuffs-unsupported-snapshot.c | 213 ++++++++++++++++-- script/print-crc64-x86-sse42-magic-numbers.go | 110 +++++++++ std/crc64/common_crc64.wuffs | 20 +- std/crc64/common_up_x86_sse42.wuffs | 145 ++++++++++++ 5 files changed, 466 insertions(+), 24 deletions(-) create mode 100644 script/print-crc64-x86-sse42-magic-numbers.go create mode 100644 std/crc64/common_up_x86_sse42.wuffs diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go index b7aec7915..cf3317130 100644 --- a/internal/cgen/builtin.go +++ b/internal/cgen/builtin.go @@ -648,7 +648,7 @@ func (g *gen) writeBuiltinCPUArchX86(b *buffer, recv *a.Expr, method t.ID, retur case "make_m128i_single_u32": fName, tName = "_mm_cvtsi32_si128", "int32_t" case "make_m128i_single_u64": - fName, tName = "_mm_cvtsi64x_si128", "int64_t" + fName, tName = "_mm_cvtsi64_si128", "int64_t" case "make_m128i_slice128", "make_m128i_slice_u16lex8": fName, tName, ptr = "_mm_lddqu_si128", "const __m128i*)(const void*", true case "make_m128i_zeroes": diff --git a/release/c/wuffs-unsupported-snapshot.c 
b/release/c/wuffs-unsupported-snapshot.c index ba174dbca..d61de3aa1 100644 --- a/release/c/wuffs-unsupported-snapshot.c +++ b/release/c/wuffs-unsupported-snapshot.c @@ -8468,6 +8468,10 @@ struct wuffs_crc64__ecma_hasher__struct { wuffs_base__vtable null_vtable; uint64_t f_state; + + wuffs_base__empty_struct (*choosy_up)( + wuffs_crc64__ecma_hasher* self, + wuffs_base__slice_u8 a_x); } private_impl; #ifdef __cplusplus @@ -36626,10 +36630,54 @@ WUFFS_CRC64__ECMA_TABLE[8][256] WUFFS_BASE__POTENTIALLY_UNUSED = { }, }; +static const uint8_t +WUFFS_CRC64__SHUFFLE_707F[16] WUFFS_BASE__POTENTIALLY_UNUSED = { + 112u, 113u, 114u, 115u, 116u, 117u, 118u, 119u, + 120u, 121u, 122u, 123u, 124u, 125u, 126u, 127u, +}; + +static const uint8_t +WUFFS_CRC64__SHUFFLE_8F80[16] WUFFS_BASE__POTENTIALLY_UNUSED = { + 143u, 142u, 141u, 140u, 139u, 138u, 137u, 136u, + 135u, 134u, 133u, 132u, 131u, 130u, 129u, 128u, +}; + +static const uint8_t +WUFFS_CRC64__ECMA_X86_SSE42_K1K2[16] WUFFS_BASE__POTENTIALLY_UNUSED = { + 228u, 58u, 57u, 202u, 151u, 212u, 93u, 224u, + 64u, 95u, 135u, 199u, 175u, 149u, 190u, 218u, +}; + +static const uint8_t +WUFFS_CRC64__ECMA_X86_SSE42_PXMU[16] WUFFS_BASE__POTENTIALLY_UNUSED = { + 133u, 30u, 14u, 175u, 43u, 175u, 216u, 146u, + 213u, 99u, 41u, 23u, 108u, 70u, 62u, 156u, +}; + // ---------------- Private Initializer Prototypes // ---------------- Private Function Prototypes +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_crc64__ecma_hasher__up( + wuffs_crc64__ecma_hasher* self, + wuffs_base__slice_u8 a_x); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_crc64__ecma_hasher__up__choosy_default( + wuffs_crc64__ecma_hasher* self, + wuffs_base__slice_u8 a_x); + +#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_crc64__ecma_hasher__up_x86_sse42( + wuffs_crc64__ecma_hasher* self, + wuffs_base__slice_u8 a_x); +#endif // defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY) + 
// ---------------- VTables const wuffs_base__hasher_u64__func_ptrs @@ -36687,6 +36735,8 @@ wuffs_crc64__ecma_hasher__initialize( } } + self->private_impl.choosy_up = &wuffs_crc64__ecma_hasher__up__choosy_default; + self->private_impl.magic = WUFFS_BASE__MAGIC; self->private_impl.vtable_for__wuffs_base__hasher_u64.vtable_name = wuffs_base__hasher_u64__vtable_name; @@ -36770,6 +36820,50 @@ wuffs_crc64__ecma_hasher__update( return wuffs_base__make_empty_struct(); } + if (self->private_impl.f_state == 0u) { + self->private_impl.choosy_up = ( +#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_crc64__ecma_hasher__up_x86_sse42 : +#endif + self->private_impl.choosy_up); + } + wuffs_crc64__ecma_hasher__up(self, a_x); + return wuffs_base__make_empty_struct(); +} + +// -------- func crc64.ecma_hasher.update_u64 + +WUFFS_BASE__GENERATED_C_CODE +WUFFS_BASE__MAYBE_STATIC uint64_t +wuffs_crc64__ecma_hasher__update_u64( + wuffs_crc64__ecma_hasher* self, + wuffs_base__slice_u8 a_x) { + if (!self) { + return 0; + } + if (self->private_impl.magic != WUFFS_BASE__MAGIC) { + return 0; + } + + wuffs_crc64__ecma_hasher__update(self, a_x); + return wuffs_crc64__ecma_hasher__checksum_u64(self); +} + +// -------- func crc64.ecma_hasher.up + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_crc64__ecma_hasher__up( + wuffs_crc64__ecma_hasher* self, + wuffs_base__slice_u8 a_x) { + return (*self->private_impl.choosy_up)(self, a_x); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_crc64__ecma_hasher__up__choosy_default( + wuffs_crc64__ecma_hasher* self, + wuffs_base__slice_u8 a_x) { uint64_t v_s = 0; wuffs_base__slice_u8 v_p = {0}; @@ -36810,24 +36904,6 @@ wuffs_crc64__ecma_hasher__update( return wuffs_base__make_empty_struct(); } -// -------- func crc64.ecma_hasher.update_u64 - -WUFFS_BASE__GENERATED_C_CODE -WUFFS_BASE__MAYBE_STATIC uint64_t -wuffs_crc64__ecma_hasher__update_u64( - 
wuffs_crc64__ecma_hasher* self, - wuffs_base__slice_u8 a_x) { - if (!self) { - return 0; - } - if (self->private_impl.magic != WUFFS_BASE__MAGIC) { - return 0; - } - - wuffs_crc64__ecma_hasher__update(self, a_x); - return wuffs_crc64__ecma_hasher__checksum_u64(self); -} - // -------- func crc64.ecma_hasher.checksum_u64 WUFFS_BASE__GENERATED_C_CODE @@ -36845,6 +36921,107 @@ wuffs_crc64__ecma_hasher__checksum_u64( return self->private_impl.f_state; } +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func crc64.ecma_hasher.up_x86_sse42 + +#if defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_crc64__ecma_hasher__up_x86_sse42( + wuffs_crc64__ecma_hasher* self, + wuffs_base__slice_u8 a_x) { + uint64_t v_s = 0; + wuffs_base__slice_u8 v_p = {0}; + __m128i v_s0 = {0}; + __m128i v_s0_707F = {0}; + __m128i v_s0_8F80 = {0}; + __m128i v_x0 = {0}; + __m128i v_aa = {0}; + __m128i v_k1k2 = {0}; + __m128i v_t0 = {0}; + __m128i v_t1 = {0}; + __m128i v_t2 = {0}; + __m128i v_u0 = {0}; + __m128i v_u1 = {0}; + __m128i v_u2 = {0}; + __m128i v_v0 = {0}; + __m128i v_v1 = {0}; + __m128i v_pxmu = {0}; + __m128i v_w1 = {0}; + __m128i v_w2 = {0}; + uint64_t v_tail_index = 0; + + v_s = (18446744073709551615u ^ self->private_impl.f_state); + while ((((uint64_t)(a_x.len)) > 0u) && ((15u & ((uint32_t)(0xFFFu & (uintptr_t)(a_x.ptr)))) != 0u)) { + v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u)); + a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u); + } + if (((uint64_t)(a_x.len)) < 32u) { + { + wuffs_base__slice_u8 i_slice_p = a_x; + v_p.ptr = i_slice_p.ptr; + v_p.len = 1; + uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len; + while (v_p.ptr < i_end0_p) { + v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u)); + v_p.ptr += 1; + } + v_p.len = 0; + } + self->private_impl.f_state = 
(18446744073709551615u ^ v_s); + return wuffs_base__make_empty_struct(); + } + v_s0 = _mm_cvtsi64_si128((int64_t)(v_s)); + v_s0_707F = _mm_shuffle_epi8(v_s0, _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__SHUFFLE_707F))); + v_s0_8F80 = _mm_shuffle_epi8(v_s0, _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__SHUFFLE_8F80))); + v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)); + a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u); + v_k1k2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_K1K2)); + v_t0 = _mm_xor_si128(v_s0_707F, v_x0); + v_t1 = _mm_clmulepi64_si128(v_t0, v_k1k2, (int32_t)(0u)); + v_t2 = _mm_clmulepi64_si128(v_t0, v_k1k2, (int32_t)(17u)); + v_aa = _mm_xor_si128(_mm_xor_si128(v_t1, v_t2), v_s0_8F80); + while (((uint64_t)(a_x.len)) >= 32u) { + v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)); + a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u); + v_u0 = _mm_xor_si128(v_aa, v_x0); + v_u1 = _mm_clmulepi64_si128(v_u0, v_k1k2, (int32_t)(0u)); + v_u2 = _mm_clmulepi64_si128(v_u0, v_k1k2, (int32_t)(17u)); + v_aa = _mm_xor_si128(v_u1, v_u2); + } + if (((uint64_t)(a_x.len)) < 16u) { + return wuffs_base__make_empty_struct(); + } + v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)); + a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u); + v_v0 = _mm_xor_si128(v_aa, v_x0); + v_v1 = _mm_clmulepi64_si128(v_v0, v_k1k2, (int32_t)(16u)); + v_aa = _mm_xor_si128(v_v1, _mm_srli_si128(v_v0, (int32_t)(8u))); + v_pxmu = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_PXMU)); + v_w1 = _mm_clmulepi64_si128(v_aa, v_pxmu, (int32_t)(16u)); + v_w2 = _mm_clmulepi64_si128(v_w1, v_pxmu, (int32_t)(0u)); + v_s = ((uint64_t)(_mm_extract_epi64(_mm_xor_si128(v_aa, _mm_xor_si128(v_w2, _mm_slli_si128(v_w1, (int32_t)(8u)))), (int32_t)(1u)))); + v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551600u); + if (v_tail_index < ((uint64_t)(a_x.len))) { + { + 
wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index); + v_p.ptr = i_slice_p.ptr; + v_p.len = 1; + uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len; + while (v_p.ptr < i_end0_p) { + v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u)); + v_p.ptr += 1; + } + v_p.len = 0; + } + } + self->private_impl.f_state = (18446744073709551615u ^ v_s); + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_BASE__CPU_ARCH__X86_FAMILY) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + #endif // !defined(WUFFS_CONFIG__MODULES) || defined(WUFFS_CONFIG__MODULE__CRC64) #if !defined(WUFFS_CONFIG__MODULES) || defined(WUFFS_CONFIG__MODULE__DEFLATE) diff --git a/script/print-crc64-x86-sse42-magic-numbers.go b/script/print-crc64-x86-sse42-magic-numbers.go new file mode 100644 index 000000000..adc0695a7 --- /dev/null +++ b/script/print-crc64-x86-sse42-magic-numbers.go @@ -0,0 +1,110 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//go:build ignore +// +build ignore + +package main + +// print-crc64-x86-sse42-magic-numbers.go prints the std/crc64 +// ECMA_X86_SSE42_ETC magic number tables. +// +// It is like print-crc32-x86-sse42-magic-numbers.go but for CRC-64/ECMA, not +// CRC-32/IEEE. +// +// Usage: go run print-crc64-x86-sse42-magic-numbers.go +// +// Output: +// Px' = 0x92D8_AF2B_AF0E_1E85 +// k1' = 0xE05D_D497_CA39_3AE4 +// k2' = 0xDABE_95AF_C787_5F40 +// μ' = 0x9C3E_466C_1729_63D5 + +import ( + "fmt" + "strings" +) + +// px is the P(x) polynomial, a bit-reversal (with explicit high bit) of the +// CRC-64/ECMA polynomial sometimes written as 0xC96C_5795_D787_0F42. 
+// P(x) = 0x1_42F0_E1EB_A9EA_3693 +// P(x) = 0b1_01000010_11110000_11100001_11101011_10101001_11101010_00110110_10010011 +const px = "10100001011110000111000011110101110101001111010100011011010010011" + +var spaces = strings.Repeat(" ", 1024) + +func debugf(format string, a ...interface{}) { + if false { // Change false to true to show the long divisions. + fmt.Printf(format, a...) + } +} + +func show(name string, value string) { + v := uint64(0) + if strings.Contains(name, "'") { + for i := len(value) - 1; i >= 0; i-- { + v = (v << 1) | uint64(value[i]&1) + } + } else { + for i := 0; i < len(value); i++ { + v = (v << 1) | uint64(value[i]&1) + } + } + fmt.Printf("%s = 0x%04X_%04X_%04X_%04X\n", name, + 0xFFFF&(v>>48), 0xFFFF&(v>>32), 0xFFFF&(v>>16), 0xFFFF&(v>>0)) +} + +func calcKn(name string, power int) { + numerator := "1" + strings.Repeat("0", power) + b := []byte(numerator) + i := 0 + debugf(" %s\n", numerator) + for i+len(px) <= len(numerator) { + for j := 0; j < len(px); j++ { + b[i+j] ^= 1 & px[j] + } + debugf(" %s%s\n", spaces[:i], px) + for ; (i < len(b)) && (b[i] == '0'); i++ { + b[i] = ' ' + } + debugf(" %s\n", b) + } + show(name, string(b[len(b)-len(px):])) +} + +func calcMu(name string) { + numerator := "1" + strings.Repeat("0", 128) + mu := make([]byte, 129) + b := []byte(numerator) + i := 0 + debugf(" %s\n", numerator) + for i+len(px) <= len(numerator) { + for j := 0; j < len(px); j++ { + b[i+j] ^= 1 & px[j] + } + debugf(" %s%s\n", spaces[:i], px) + b[i] = ':' + mu[i] = '1' + i++ + for ; (i < len(b)) && (b[i] == '0'); i++ { + b[i] = ' ' + mu[i] = '0' + } + debugf(" %s\n", b) + } + show(name, string(mu[:65])) +} + +func main() { + show("Px'", px) + calcKn("k1'", 128+64) + calcKn("k2'", 128) + calcMu("μ' ") +} diff --git a/std/crc64/common_crc64.wuffs b/std/crc64/common_crc64.wuffs index e084ed6e3..20e0a11b8 100644 --- a/std/crc64/common_crc64.wuffs +++ b/std/crc64/common_crc64.wuffs @@ -21,6 +21,21 @@ pub func ecma_hasher.set_quirk!(key: base.u32, 
value: base.u64) base.status { } pub func ecma_hasher.update!(x: roslice base.u8) { + if this.state == 0 { + choose up = [ + up_x86_sse42] + } + this.up!(x: args.x) +} + +pub func ecma_hasher.update_u64!(x: roslice base.u8) base.u64 { + this.update!(x: args.x) + return this.checksum_u64() +} + +pri func ecma_hasher.up!(x: roslice base.u8), + choosy, +{ var s : base.u64 var p : roslice base.u8 @@ -51,11 +66,6 @@ pub func ecma_hasher.update!(x: roslice base.u8) { this.state = 0xFFFF_FFFF_FFFF_FFFF ^ s } -pub func ecma_hasher.update_u64!(x: roslice base.u8) base.u64 { - this.update!(x: args.x) - return this.checksum_u64() -} - pub func ecma_hasher.checksum_u64() base.u64 { return this.state } diff --git a/std/crc64/common_up_x86_sse42.wuffs b/std/crc64/common_up_x86_sse42.wuffs new file mode 100644 index 000000000..189a175fd --- /dev/null +++ b/std/crc64/common_up_x86_sse42.wuffs @@ -0,0 +1,145 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// -------- + +// Like std/crc32's x86 SIMD implementation, this is based on Gopal et al. +// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction". 
+ +pri func ecma_hasher.up_x86_sse42!(x: roslice base.u8), + choose cpu_arch >= x86_sse42, +{ + var s : base.u64 + var p : roslice base.u8 + + var util : base.x86_sse42_utility + var s0 : base.x86_m128i + var s0_707F : base.x86_m128i + var s0_8F80 : base.x86_m128i + var x0 : base.x86_m128i + var aa : base.x86_m128i + var k1k2 : base.x86_m128i + var t0 : base.x86_m128i + var t1 : base.x86_m128i + var t2 : base.x86_m128i + var u0 : base.x86_m128i + var u1 : base.x86_m128i + var u2 : base.x86_m128i + var v0 : base.x86_m128i + var v1 : base.x86_m128i + var pxmu : base.x86_m128i + var w1 : base.x86_m128i + var w2 : base.x86_m128i + + var tail_index : base.u64 + + s = 0xFFFF_FFFF_FFFF_FFFF ^ this.state + + // Align to a 16-byte boundary. + while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) { + s = ECMA_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8) + args.x = args.x[1 ..] + } endwhile + + // For short inputs, just do a simple loop. + if args.x.length() < 0x20 { + iterate (p = args.x)(length: 1, advance: 1, unroll: 1) { + s = ECMA_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8) + } + this.state = 0xFFFF_FFFF_FFFF_FFFF ^ s + return nothing + } + + // Process N 16-byte chunks (1 on ramp, N-2 main loop, 1 off ramp). The + // algorithm is loosely based on https://github.com/rawrunprotected/crc but + // modified to always start and end on 16-byte alignment boundaries. It + // also doesn't crash on zero-length input. + + // On ramp. + + s0 = util.make_m128i_single_u64(a: s) + s0_707F = s0._mm_shuffle_epi8(b: util.make_m128i_slice128(a: SHUFFLE_707F[.. 16])) + s0_8F80 = s0._mm_shuffle_epi8(b: util.make_m128i_slice128(a: SHUFFLE_8F80[.. 16])) + + x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10]) + args.x = args.x[0x10 ..] + + k1k2 = util.make_m128i_slice128(a: ECMA_X86_SSE42_K1K2[.. 
16]) + t0 = s0_707F._mm_xor_si128(b: x0) + t1 = t0._mm_clmulepi64_si128(b: k1k2, imm8: 0x00) + t2 = t0._mm_clmulepi64_si128(b: k1k2, imm8: 0x11) + aa = t1._mm_xor_si128(b: t2)._mm_xor_si128(b: s0_8F80) + + // Main loop. + + while args.x.length() >= 0x20 { + x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10]) + args.x = args.x[0x10 ..] + + u0 = aa._mm_xor_si128(b: x0) + u1 = u0._mm_clmulepi64_si128(b: k1k2, imm8: 0x00) + u2 = u0._mm_clmulepi64_si128(b: k1k2, imm8: 0x11) + aa = u1._mm_xor_si128(b: u2) + } endwhile + + // Off ramp. + + if args.x.length() < 0x10 { + // Unreachable, but the "if" is needed by the bounds checker. + return nothing + } + x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10]) + args.x = args.x[0x10 ..] + + v0 = aa._mm_xor_si128(b: x0) + v1 = v0._mm_clmulepi64_si128(b: k1k2, imm8: 0x10) + aa = v1._mm_xor_si128(b: v0._mm_srli_si128(imm8: 8)) + + // Barrett reduction. + pxmu = util.make_m128i_slice128(a: ECMA_X86_SSE42_PXMU[.. 16]) + w1 = aa._mm_clmulepi64_si128(b: pxmu, imm8: 0x10) + w2 = w1._mm_clmulepi64_si128(b: pxmu, imm8: 0x00) + s = aa._mm_xor_si128(b: + w2._mm_xor_si128(b: + w1._mm_slli_si128(imm8: 8)))._mm_extract_epi64(imm8: 1) + + // Handle the tail of args.x that wasn't a complete 16-byte chunk. + tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFF0 // And-not 16. 
+ if tail_index < args.x.length() { + iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) { + s = ECMA_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8) + } + } + + this.state = 0xFFFF_FFFF_FFFF_FFFF ^ s +} + +pri const SHUFFLE_707F : roarray[16] base.u8 = [ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, +] + +pri const SHUFFLE_8F80 : roarray[16] base.u8 = [ + 0x8F, 0x8E, 0x8D, 0x8C, 0x8B, 0x8A, 0x89, 0x88, + 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80, +] + +// These constants are reproduced by +// script/print-crc64-x86-sse42-magic-numbers.go + +pri const ECMA_X86_SSE42_K1K2 : roarray[16] base.u8 = [ + 0xE4, 0x3A, 0x39, 0xCA, 0x97, 0xD4, 0x5D, 0xE0, // k1' = 0xE05D_D497_CA39_3AE4 + 0x40, 0x5F, 0x87, 0xC7, 0xAF, 0x95, 0xBE, 0xDA, // k2' = 0xDABE_95AF_C787_5F40 +] + +pri const ECMA_X86_SSE42_PXMU : roarray[16] base.u8 = [ + 0x85, 0x1E, 0x0E, 0xAF, 0x2B, 0xAF, 0xD8, 0x92, // Px' = 0x92D8_AF2B_AF0E_1E85 + 0xD5, 0x63, 0x29, 0x17, 0x6C, 0x46, 0x3E, 0x9C, // μ' = 0x9C3E_466C_1729_63D5 +]