From 2f0762fa8fd30b457bc5dcf53403123212091df5 Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Thu, 23 Nov 2023 22:20:02 +0100 Subject: [PATCH] field: Remove x86_64 asm Widely available versions of GCC and Clang beat our field asm on -O2. In particular, GCC 10.5.0, which is Bitcoin Core's current compiler for official x86_64 builds, produces code that is > 20% faster for fe_mul and > 10% faster for signature verification (see #726). These are the alternatives to this PR: We could replace our current asm with the fastest compiler output that we can find. This is potentially faster, but it has multiple drawbacks: - It's more coding work because it needs detailed benchmarks (e.g., with many compiler/options). - It's more review work because we need to deal with inline asm (including clobbers etc.) and there's a lack of experts reviewers in this area. - It's not unlikely that we'll fall behind again in a few compiler versions, and then we have to deal with this again, i.e., redo the benchmarks. Given our history here, I doubt that we'll revolve this timely. We could change the default of the asm build option to off. But this will also disable the scalar asm, which is still faster. We could split the build option into two separate options for field and scalar asm and only disable the field asm by default. But this adds complexity to the build and to the test matrix. My conclusion is that this PR gets the low-hanging fruit in terms of performance. It simplifies our code significantly. It's clearly an improvement, and it's very easy to review. Whether re-introducing better asm (whether from a compiler or from CryptOpt) is worth the hassle can be evaluated separately, and should not hold up this improvement. Solves #726. --- Makefile.am | 1 - README.md | 2 +- src/field_5x52_asm_impl.h | 504 -------------------------------------- src/field_5x52_impl.h | 4 - 4 files changed, 1 insertion(+), 510 deletions(-) delete mode 100644 src/field_5x52_asm_impl.h diff --git a/Makefile.am b/Makefile.am index 9d0dc3dffc299..549861791569d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -37,7 +37,6 @@ noinst_HEADERS += src/field_10x26_impl.h noinst_HEADERS += src/field_5x52.h noinst_HEADERS += src/field_5x52_impl.h noinst_HEADERS += src/field_5x52_int128_impl.h -noinst_HEADERS += src/field_5x52_asm_impl.h noinst_HEADERS += src/modinv32.h noinst_HEADERS += src/modinv32_impl.h noinst_HEADERS += src/modinv64.h diff --git a/README.md b/README.md index 8c325076228e2..1b6eedd32a4d7 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Implementation details * Expose only higher level interfaces to minimize the API surface and improve application security. ("Be difficult to use insecurely.") * Field operations * Optimized implementation of arithmetic modulo the curve's field size (2^256 - 0x1000003D1). - * Using 5 52-bit limbs (including hand-optimized assembly for x86_64, by Diederik Huys). + * Using 5 52-bit limbs * Using 10 26-bit limbs (including hand-optimized assembly for 32-bit ARM, by Wladimir J. van der Laan). * This is an experimental feature that has not received enough scrutiny to satisfy the standard of quality of this library but is made available for testing and review by the community. * Scalar operations diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h deleted file mode 100644 index 04a9af2105788..0000000000000 --- a/src/field_5x52_asm_impl.h +++ /dev/null @@ -1,504 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -/** - * Changelog: - * - March 2013, Diederik Huys: original version - * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm - * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly - */ - -#ifndef SECP256K1_FIELD_INNER5X52_IMPL_H -#define SECP256K1_FIELD_INNER5X52_IMPL_H - -#include "util.h" - -SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * r15:rcx = d - * r10-r14 = a0-a4 - * rbx = b - * rdi = r - * rsi = a / t? - */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" - - /* d += a3 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rcx\n" - "movq %%rdx,%%r15\n" - /* d += a2 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d = a0 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c = a4 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += (c & M) * R */ - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* d += a4 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a0 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* t4 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a4 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* u0 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a1 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a4 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,8(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a2 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a1 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b2 (last use of %%r10 = a0) */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */ - "movq %q2,%%rsi\n" - "movq %q1,%%r10\n" - /* d += a4 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rcx only) */ - "shrdq $52,%%r15,%%rcx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rcx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3) -: "b"(b), "D"(r) -: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); -} - -SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * rcx:rbx = d - * r10-r14 = a0-a4 - * r15 = M (0xfffffffffffff) - * rdi = r - * rsi = a / t? - */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" - "movq $0xfffffffffffff,%%r15\n" - - /* d = (a0*2) * a3 */ - "leaq (%%r10,%%r10,1),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rbx\n" - "movq %%rdx,%%rcx\n" - /* d += (a1*2) * a2 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c = a4 * a4 */ - "movq %%r14,%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += (c & M) * R */ - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* a4 *= 2 */ - "addq %%r14,%%r14\n" - /* d += a0 * a4 */ - "movq %%r10,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d+= (a1*2) * a3 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a2 * a2 */ - "movq %%r12,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* t4 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * a0 */ - "movq %%r10,%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a1 * a4 */ - "movq %%r11,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += (a2*2) * a3 */ - "leaq (%%r12,%%r12,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* u0 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* a0 *= 2 */ - "addq %%r10,%%r10\n" - /* c += a0 * a1 */ - "movq %%r10,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a2 * a4 */ - "movq %%r12,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a3 * a3 */ - "movq %%r13,%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,8(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a0 * a2 (last use of %%r10) */ - "movq %%r10,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */ - "movq %q2,%%rsi\n" - "movq %q1,%%r10\n" - /* c += a1 * a1 */ - "movq %%r11,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a3 * a4 */ - "movq %%r13,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rbx only) */ - "shrdq $52,%%rcx,%%rbx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rbx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3) -: "D"(r) -: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); -} - -#endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index ecb70502c2be3..76031f755e2da 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -12,11 +12,7 @@ #include "field.h" #include "modinv64_impl.h" -#if defined(USE_ASM_X86_64) -#include "field_5x52_asm_impl.h" -#else #include "field_5x52_int128_impl.h" -#endif #ifdef VERIFY static void secp256k1_fe_impl_verify(const secp256k1_fe *a) {