From 11b105ae5fd256b5ed625f9780910f8a7bfb3e76 Mon Sep 17 00:00:00 2001
From: Juneyoung Lee
Date: Mon, 28 Aug 2023 20:01:48 +0000
Subject: [PATCH 01/24] Add bignum_copy_row_from_table and its Neon variants
 for AArch64

This patch adds constant-time table-lookup functions
(`bignum_copy_row_from_table*`) and their proofs. It contains only the
AArch64 versions; the x86 version will follow later. The difficulty in
proving the x86 version appears to be related to handling negative offsets,
and (if that is right) it can be avoided by proving only positive offsets.
I will record this as a GitHub issue; the x86 scalar version will be
provided after the RSA-related work is finished.

This patch contains four table-lookup functions:

1. `bignum_copy_row_from_table`: a lookup for a generic table size
2. `bignum_copy_row_from_table_8n_neon`: a Neon version for a table whose
   width is a multiple of 8
3. `bignum_copy_row_from_table_16_neon`: a Neon implementation for a table
   whose width is 16 words (16*64 = 1024 bits)
4. `bignum_copy_row_from_table_32_neon`: a Neon implementation for a table
   whose width is 32 words (32*64 = 2048 bits)

The last two versions were originally written by Hanno Becker.

So that `test` and `benchmark` still compile and run on x86, the scalar
`bignum_copy_row_from_table` function is handled in a way similar to the
Neon functions.

s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/f1ad23c1776b35309e9a309d7d1b5a102e33735f
---
 arm/generic/bignum_copy_row_from_table.S | 81 ++++++++++++++
 .../bignum_copy_row_from_table_8n_neon.S | 102 ++++++++++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 arm/generic/bignum_copy_row_from_table.S
 create mode 100644 arm/generic/bignum_copy_row_from_table_8n_neon.S

diff --git a/arm/generic/bignum_copy_row_from_table.S b/arm/generic/bignum_copy_row_from_table.S
new file mode 100644
index 0000000000..ba3e48d061
--- /dev/null
+++ b/arm/generic/bignum_copy_row_from_table.S
@@ -0,0 +1,81 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
+// into z[0..width-1].
+// This function is constant-time with respect to the value of `idx`. This is
+// achieved by reading the whole table and using bit-masking to select the
+// `idx`-th row.
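For orientation, the same scan-and-mask idea can be written as a few lines of C. This is only an illustrative sketch (the function name `bignum_copy_row_from_table_c` is invented here, and a C compiler gives no constant-time guarantee); the hand-written assembly below is the actual implementation being added.

    #include <stdint.h>

    /* Zero z, then OR in every row of the table under a mask that is all-ones
       only for row idx, so the sequence of memory accesses is independent of idx. */
    void bignum_copy_row_from_table_c(uint64_t *z, const uint64_t *table,
                                      uint64_t height, uint64_t width,
                                      uint64_t idx)
    {
      if (height == 0 || width == 0) return;          /* mirrors the early exits */
      for (uint64_t j = 0; j < width; j++) z[j] = 0;
      for (uint64_t i = 0; i < height; i++)
      {
        uint64_t mask = -(uint64_t)(i == idx);        /* all-ones iff i == idx */
        for (uint64_t j = 0; j < width; j++)
          z[j] |= table[i * width + j] & mask;
      }
    }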
+// +// extern void bignum_copy_from_table +// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t width, +// uint64_t idx); +// +// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = width, X4 = idx +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table) + .text + .balign 4 + +#define z x0 +#define table x1 +#define height x2 +#define width x3 +#define idx x4 + +#define i x5 +#define mask x6 +#define j x7 + +S2N_BN_SYMBOL(bignum_copy_row_from_table): + + cbz height, bignum_copy_row_from_table_end + cbz width, bignum_copy_row_from_table_end + mov i, width + mov x6, z + +bignum_copy_row_from_table_initzero: + str xzr, [x6] + add x6, x6, #8 + subs i, i, #1 + bne bignum_copy_row_from_table_initzero + + mov i, xzr + mov x8, table + +bignum_copy_row_from_table_outerloop: + + cmp i, idx + csetm mask, eq + + mov j, width + mov x9, z + +bignum_copy_row_from_table_innerloop: + + ldr x10, [x8] + ldr x11, [x9] + and x10, x10, mask + orr x11, x11, x10 + str x11, [x9] + + add x8, x8, #8 + add x9, x9, #8 + subs j, j, #1 + bne bignum_copy_row_from_table_innerloop + +bignum_copy_row_from_table_innerloop_done: + add i, i, #1 + cmp i, height + bne bignum_copy_row_from_table_outerloop + +bignum_copy_row_from_table_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/generic/bignum_copy_row_from_table_8n_neon.S b/arm/generic/bignum_copy_row_from_table_8n_neon.S new file mode 100644 index 0000000000..80db20d6b6 --- /dev/null +++ b/arm/generic/bignum_copy_row_from_table_8n_neon.S @@ -0,0 +1,102 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] +// into z[0..width-1]. width must be a mutiple of 8. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. 
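The vectorized variant below processes eight 64-bit words (64 bytes) per inner iteration, broadcasting the row mask into a vector register and merging the masked table words into the accumulator with BIT. A rough arm_neon.h rendering of one such step (the helper name is invented; the assembly below is the real implementation):

    #include <arm_neon.h>
    #include <stdint.h>

    /* One 8-word step of the masked merge: in each 128-bit lane pair, take the
       table bits where the mask is all-ones and keep the accumulator bits where
       it is zero, which is what the BIT instruction does in the loop below. */
    static inline void masked_merge_8_words(uint64_t *z, const uint64_t *row,
                                            uint64_t mask)   /* 0 or all-ones */
    {
      uint64x2_t vmask = vdupq_n_u64(mask);
      for (int k = 0; k < 8; k += 2) {
        uint64x2_t acc = vld1q_u64(z + k);
        uint64x2_t src = vld1q_u64(row + k);
        acc = vbslq_u64(vmask, src, acc);   /* (mask & src) | (~mask & acc) */
        vst1q_u64(z + k, acc);
      }
    }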
+// +// extern void bignum_copy_from_table_8_neon +// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t width, uint64_t idx); +// +// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = width, X4 = idx +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_8n_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_8n_neon) + .text + .balign 4 + + +#define z x0 +#define table x1 +#define height x2 +#define width x3 +#define idx x4 + +#define i x5 +#define mask x6 +#define j x7 + +#define vmask v16 + +S2N_BN_SYMBOL(bignum_copy_row_from_table_8n_neon): + + cbz height, bignum_copy_row_from_table_8n_neon_end + cbz width, bignum_copy_row_from_table_8n_neon_end + mov i, width + mov x6, z + dup v16.2d, xzr + +bignum_copy_row_from_table_8n_neon_initzero: + str q16, [x6] + str q16, [x6, #16] + str q16, [x6, #32] + str q16, [x6, #48] + add x6, x6, #64 + subs i, i, #8 + bne bignum_copy_row_from_table_8n_neon_initzero + + mov i, xzr + mov x8, table + +bignum_copy_row_from_table_8n_neon_outerloop: + + cmp i, idx + csetm mask, eq + dup vmask.2d, mask + + mov j, width + mov x9, z + +bignum_copy_row_from_table_8n_neon_innerloop: + + ldr q17, [x8] + ldr q18, [x9] + bit v18.16b, v17.16b, vmask.16b + str q18, [x9] + + ldr q17, [x8, #16] + ldr q18, [x9, #16] + bit v18.16b, v17.16b, vmask.16b + str q18, [x9, #16] + + ldr q17, [x8, #32] + ldr q18, [x9, #32] + bit v18.16b, v17.16b, vmask.16b + str q18, [x9, #32] + + ldr q17, [x8, #48] + ldr q18, [x9, #48] + bit v18.16b, v17.16b, vmask.16b + str q18, [x9, #48] + + add x8, x8, #64 + add x9, x9, #64 + subs j, j, #8 + bne bignum_copy_row_from_table_8n_neon_innerloop + +bignum_copy_row_from_table_8n_neon_innerloop_done: + add i, i, #1 + cmp i, height + bne bignum_copy_row_from_table_8n_neon_outerloop + +bignum_copy_row_from_table_8n_neon_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif From bb3baa5dcb8da3ee878ac41fb415332d71db4bd6 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 4 Oct 2023 17:22:14 -0700 Subject: [PATCH 02/24] Add Ed25519 point compression encoding This implements the point compression encoding to a byte array from https://datatracker.ietf.org/doc/html/rfc8032#section-5.1.2 as function "edwards25519_encode". It assumes the input is a point (x,y) on the edwards25519 curve, with coordinates reduced mod p_25519 = 2^255 - 19, and does not check any of that. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/67430bea9f3cc95ce40074ec632d732676d9d63a --- arm/curve25519/edwards25519_encode.S | 131 +++++++++++++++++++++++ x86_att/curve25519/edwards25519_encode.S | 81 ++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 arm/curve25519/edwards25519_encode.S create mode 100644 x86_att/curve25519/edwards25519_encode.S diff --git a/arm/curve25519/edwards25519_encode.S b/arm/curve25519/edwards25519_encode.S new file mode 100644 index 0000000000..4cf301a227 --- /dev/null +++ b/arm/curve25519/edwards25519_encode.S @@ -0,0 +1,131 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
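To make the encoding concrete before the assembly listings, here is a small C sketch of RFC 8032 section 5.1.2 as described in the commit message above (the helper name is invented; it assumes p[0..3] holds x and p[4..7] holds y as little-endian 64-bit limbs, both already reduced mod p_25519):

    #include <stdint.h>

    /* Serialize y as 32 little-endian bytes, then overwrite bit 255 with the
       least significant bit of x, giving the compressed form 2^255*x_0 + y. */
    static void edwards25519_encode_sketch(uint8_t z[32], const uint64_t p[8])
    {
      const uint64_t *x = p, *y = p + 4;
      for (int i = 0; i < 4; i++)
        for (int j = 0; j < 8; j++)
          z[8 * i + j] = (uint8_t)(y[i] >> (8 * j));
      z[31] = (uint8_t)((z[31] & 0x7f) | ((x[0] & 1) << 7));
    }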
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Encode edwards25519 point into compressed form as 256-bit number +// Input p[8]; output z[32] (bytes) +// +// extern void edwards25519_encode +// (uint8_t z[static 32], uint64_t p[static 8]); +// +// This assumes that the input buffer p points to a pair of 256-bit +// numbers x (at p) and y (at p+4) representing a point (x,y) on the +// edwards25519 curve. It is assumed that both x and y are < p_25519 +// but there is no checking of this, nor of the fact that (x,y) is +// in fact on the curve. +// +// The output in z is a little-endian array of bytes corresponding to +// the standard compressed encoding of a point as 2^255 * x_0 + y +// where x_0 is the least significant bit of x. +// See "https://datatracker.ietf.org/doc/html/rfc8032#section-5.1.2" +// In this implementation, y is simply truncated to 255 bits, but if +// it is reduced mod p_25519 as expected this does not affect values. +// +// Standard ARM ABI: X0 = z, X1 = p +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_encode) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_encode) + .text + .balign 4 + +#define z x0 +#define p x1 + +#define y0 x2 +#define y1 x3 +#define y2 x4 +#define y3 x5 +#define y0short w2 +#define y1short w3 +#define y2short w4 +#define y3short w5 +#define xb x6 + +S2N_BN_SYMBOL(edwards25519_encode): + +// Load lowest word of x coordinate in xb and full y as [y3;y2;y1;y0]. + + ldr xb, [p] + ldp y0, y1, [p, #32] + ldp y2, y3, [p, #48] + +// Compute the encoded form, making the LSB of x the MSB of the encoding + + and y3, y3, #0x7FFFFFFFFFFFFFFF + orr y3, y3, xb, lsl #63 + +// Write back in a byte-oriented fashion to be independent of endianness + + strb y0short, [z] + lsr y0, y0, #8 + strb y0short, [z, #1] + lsr y0, y0, #8 + strb y0short, [z, #2] + lsr y0, y0, #8 + strb y0short, [z, #3] + lsr y0, y0, #8 + strb y0short, [z, #4] + lsr y0, y0, #8 + strb y0short, [z, #5] + lsr y0, y0, #8 + strb y0short, [z, #6] + lsr y0, y0, #8 + strb y0short, [z, #7] + + strb y1short, [z, #8] + lsr y1, y1, #8 + strb y1short, [z, #9] + lsr y1, y1, #8 + strb y1short, [z, #10] + lsr y1, y1, #8 + strb y1short, [z, #11] + lsr y1, y1, #8 + strb y1short, [z, #12] + lsr y1, y1, #8 + strb y1short, [z, #13] + lsr y1, y1, #8 + strb y1short, [z, #14] + lsr y1, y1, #8 + strb y1short, [z, #15] + + strb y2short, [z, #16] + lsr y2, y2, #8 + strb y2short, [z, #17] + lsr y2, y2, #8 + strb y2short, [z, #18] + lsr y2, y2, #8 + strb y2short, [z, #19] + lsr y2, y2, #8 + strb y2short, [z, #20] + lsr y2, y2, #8 + strb y2short, [z, #21] + lsr y2, y2, #8 + strb y2short, [z, #22] + lsr y2, y2, #8 + strb y2short, [z, #23] + + strb y3short, [z, #24] + lsr y3, y3, #8 + strb y3short, [z, #25] + lsr y3, y3, #8 + strb y3short, [z, #26] + lsr y3, y3, #8 + strb y3short, [z, #27] + lsr y3, y3, #8 + strb y3short, [z, #28] + lsr y3, y3, #8 + strb y3short, [z, #29] + lsr y3, y3, #8 + strb y3short, [z, #30] + lsr y3, y3, #8 + strb y3short, [z, #31] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/x86_att/curve25519/edwards25519_encode.S b/x86_att/curve25519/edwards25519_encode.S new file mode 100644 index 0000000000..bdbaa47232 --- /dev/null +++ b/x86_att/curve25519/edwards25519_encode.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Encode edwards25519 point into compressed form as 256-bit number +// Input p[8]; output z[32] (bytes) +// +// extern void edwards25519_encode +// (uint8_t z[static 32], uint64_t p[static 8]); +// +// This assumes that the input buffer p points to a pair of 256-bit +// numbers x (at p) and y (at p+4) representing a point (x,y) on the +// edwards25519 curve. It is assumed that both x and y are < p_25519 +// but there is no checking of this, nor of the fact that (x,y) is +// in fact on the curve. +// +// The output in z is a little-endian array of bytes corresponding to +// the standard compressed encoding of a point as 2^255 * x_0 + y +// where x_0 is the least significant bit of x. +// See "https://datatracker.ietf.org/doc/html/rfc8032#section-5.1.2" +// In this implementation, y is simply truncated to 255 bits, but if +// it is reduced mod p_25519 as expected this does not affect values. +// +// Standard x86-64 ABI: RDI = z, RSI = p +// Microsoft x64 ABI: RCX = z, RDX = p +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_encode) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_encode) + .text + +#define z %rdi +#define p %rsi +#define y0 %rax +#define y1 %rcx +#define y2 %rdx +#define y3 %r8 +#define xb %r9 + +S2N_BN_SYMBOL(edwards25519_encode): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load lowest word of x coordinate in xb and full y as [y3;y2;y1;y0]. + + movq (p), xb + movq 32(p), y0 + movq 40(p), y1 + movq 48(p), y2 + movq 56(p), y3 + +// Compute the encoded form, making the LSB of x the MSB of the encoding + + btr $63, y3 + shlq $63, xb + orq xb, y3 + +// Store back (by the word, since x86 is little-endian anyway) + + movq y0, (z) + movq y1, 8(z) + movq y2, 16(z) + movq y3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif From 755bc0b45e7e0a136f076ecf82fae9b9013cfe9c Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 13 Oct 2023 20:45:51 -0700 Subject: [PATCH 03/24] Add Ed25519 point decoding function This implements point decoding from a 256-bit little-endian byte sequence to a point (x,y) on the edwards25519 curve as specified in https://datatracker.ietf.org/doc/html/rfc8032#section-5.1.3 The function returns 0 for success and 1 for failure, the latter meaning that the input is not the encoding of any edwards25519 point. 
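A minimal caller-side sketch of that contract (the wrapper name is invented; the prototype is the one declared in the files below):

    #include <stdint.h>

    extern uint64_t edwards25519_decode(uint64_t z[static 8], uint8_t c[static 32]);

    /* Returns 0 and fills xy[0..3] = x, xy[4..7] = y on success, -1 otherwise. */
    int decode_point_or_reject(uint64_t xy[8], uint8_t enc[32])
    {
      if (edwards25519_decode(xy, enc) != 0)
        return -1;    /* enc is not the encoding of any edwards25519 point */
      return 0;
    }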
s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/97f74932f775695ecd4fe897343067a2592f7c4b --- arm/curve25519/edwards25519_decode.S | 700 +++++++++++++++++ arm/curve25519/edwards25519_decode_alt.S | 563 ++++++++++++++ x86_att/curve25519/edwards25519_decode.S | 670 +++++++++++++++++ x86_att/curve25519/edwards25519_decode_alt.S | 751 +++++++++++++++++++ 4 files changed, 2684 insertions(+) create mode 100644 arm/curve25519/edwards25519_decode.S create mode 100644 arm/curve25519/edwards25519_decode_alt.S create mode 100644 x86_att/curve25519/edwards25519_decode.S create mode 100644 x86_att/curve25519/edwards25519_decode_alt.S diff --git a/arm/curve25519/edwards25519_decode.S b/arm/curve25519/edwards25519_decode.S new file mode 100644 index 0000000000..9161768db7 --- /dev/null +++ b/arm/curve25519/edwards25519_decode.S @@ -0,0 +1,700 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +// +// extern uint64_t edwards25519_decode(uint64_t z[static 8],uint8_t c[static 32]); +// +// This interprets the input byte string as a little-endian number +// representing a point (x,y) on the edwards25519 curve, encoded as +// 2^255 * x_0 + y where x_0 is the least significant bit of x. It +// returns the full pair of coordinates x (at z) and y (at z+4). The +// return code is 0 for success and 1 for failure, which means that +// the input does not correspond to the encoding of any edwards25519 +// point. This can happen for three reasons, where y = the lowest +// 255 bits of the input: +// +// * y >= p_25519 +// Input y coordinate is not reduced +// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root +// There is no x such that (x,y) is on the curve +// * y^2 = 1 and top bit of input is set +// Cannot be the canonical encoding of (0,1) or (0,-1) +// +// Standard ARM ABI: X0 = z, X1 = c +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_decode) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_decode) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define y sp, #0 +#define s sp, #(4*N) +#define t sp, #(8*N) +#define u sp, #(12*N) +#define v sp, #(16*N) +#define w sp, #(20*N) + +// Other temporary variables in register + +#define res x19 +#define sgnbit x20 +#define badun x21 + +// Total size to reserve on the stack + +#define NSPACE #(24*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest; \ + add x1, src1; \ + add x2, src2; \ + bl edwards25519_decode_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest; \ + mov x1, n; \ + add x2, src; \ + bl edwards25519_decode_nsqr_p25519 + +S2N_BN_SYMBOL(edwards25519_decode): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x30, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Load the inputs, using byte operations in case of big-endian setting. 
+// Let y be the lowest 255 bits of the input and sgnbit the desired parity. +// If y >= p_25519 then already flag the input as invalid (badun = 1). + + ldrb w0, [x1] + lsl x4, x0, #56 + ldrb w0, [x1, #1] + extr x4, x0, x4, #8 + ldrb w0, [x1, #2] + extr x4, x0, x4, #8 + ldrb w0, [x1, #3] + extr x4, x0, x4, #8 + ldrb w0, [x1, #4] + extr x4, x0, x4, #8 + ldrb w0, [x1, #5] + extr x4, x0, x4, #8 + ldrb w0, [x1, #6] + extr x4, x0, x4, #8 + ldrb w0, [x1, #7] + extr x4, x0, x4, #8 + + ldrb w0, [x1, #8] + lsl x5, x0, #56 + ldrb w0, [x1, #9] + extr x5, x0, x5, #8 + ldrb w0, [x1, #10] + extr x5, x0, x5, #8 + ldrb w0, [x1, #11] + extr x5, x0, x5, #8 + ldrb w0, [x1, #12] + extr x5, x0, x5, #8 + ldrb w0, [x1, #13] + extr x5, x0, x5, #8 + ldrb w0, [x1, #14] + extr x5, x0, x5, #8 + ldrb w0, [x1, #15] + extr x5, x0, x5, #8 + + ldrb w0, [x1, #16] + lsl x6, x0, #56 + ldrb w0, [x1, #17] + extr x6, x0, x6, #8 + ldrb w0, [x1, #18] + extr x6, x0, x6, #8 + ldrb w0, [x1, #19] + extr x6, x0, x6, #8 + ldrb w0, [x1, #20] + extr x6, x0, x6, #8 + ldrb w0, [x1, #21] + extr x6, x0, x6, #8 + ldrb w0, [x1, #22] + extr x6, x0, x6, #8 + ldrb w0, [x1, #23] + extr x6, x0, x6, #8 + + ldrb w0, [x1, #24] + lsl x7, x0, #56 + ldrb w0, [x1, #25] + extr x7, x0, x7, #8 + ldrb w0, [x1, #26] + extr x7, x0, x7, #8 + ldrb w0, [x1, #27] + extr x7, x0, x7, #8 + ldrb w0, [x1, #28] + extr x7, x0, x7, #8 + ldrb w0, [x1, #29] + extr x7, x0, x7, #8 + ldrb w0, [x1, #30] + extr x7, x0, x7, #8 + ldrb w0, [x1, #31] + extr x7, x0, x7, #8 + + stp x4, x5, [y] + lsr sgnbit, x7, #63 + and x7, x7, #0x7FFFFFFFFFFFFFFF + stp x6, x7, [y+16] + + adds xzr, x4, #19 + adcs xzr, x5, xzr + adcs xzr, x6, xzr + adcs xzr, x7, xzr + cset badun, mi + +// u = y^2 - 1 (actually y + 2^255-20, not reduced modulo) +// v = 1 + d * y^2 (not reduced modulo from the +1) +// w = u * v + + nsqr(v,1,y) + ldp x0, x1, [v] + ldp x2, x3, [v+16] + mov x4, #0x8000000000000000 + subs x0, x0, #20 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x3, x3, x4 + stp x0, x1, [u] + stp x2, x3, [u+16] + + movbig(x0,#0x75eb,#0x4dca,#0x1359,#0x78a3) + movbig(x1,#0x0070,#0x0a4d,#0x4141,#0xd8ab) + movbig(x2,#0x8cc7,#0x4079,#0x7779,#0xe898) + movbig(x3,#0x5203,#0x6cee,#0x2b6f,#0xfe73) + stp x0, x1, [w] + stp x2, x3, [w+16] + mulp(v,w,v) + ldp x0, x1, [v] + ldp x2, x3, [v+16] + adds x0, x0, #1 + adcs x1, x1, xzr + adcs x2, x2, xzr + adcs x3, x3, xzr + stp x0, x1, [v] + stp x2, x3, [v+16] + + mulp(w,u,v) + +// Get s = w^{252-3} as a candidate inverse square root 1/sqrt(w). +// This power tower computation is the same as bignum_invsqrt_p25519 + + nsqr(t,1,w) + mulp(t,t,w) + nsqr(s,2,t) + mulp(t,s,t) + nsqr(s,1,t) + mulp(v,s,w) + nsqr(s,5,v) + mulp(t,s,v) + nsqr(s,10,t) + mulp(t,s,t) + nsqr(s,5,t) + mulp(v,s,v) + nsqr(s,25,v) + mulp(t,s,v) + nsqr(s,50,t) + mulp(t,s,t) + nsqr(s,25,t) + mulp(v,s,v) + nsqr(s,125,v) + mulp(v,s,v) + nsqr(s,2,v) + mulp(s,s,w) + +// Compute v' = s^2 * w to discriminate whether the square root sqrt(u/v) +// exists, in which case we should get 0, 1 or -1. + + nsqr(v,1,s) + mulp(v,v,w) + +// Get the two candidates for sqrt(u / v), one being s = u * w^{252-3} +// and the other being t = s * j_25519 where j_25519 = sqrt(-1). 
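In outline, this candidate selection is the standard square-root computation modulo p = 2^255 - 19, where p mod 8 = 5: here (p-5)/8 = 2^252 - 3 and (p-1)/4 = 2^253 - 5, so writing e = w^(2^252-3) for the output of the power tower above, the discriminant just computed is

    e^2 * w = w^(2^253-5) = w^((p-1)/4),

which for nonzero w is a fourth root of unity, while the candidate formed next satisfies

    (u * e)^2 = u^2 * w^(2^253-6) = (u/v) * w^((p-1)/4).

So w^((p-1)/4) = 1 (or w = 0) means u * e is already a square root of u/v, w^((p-1)/4) = -1 means u * e * sqrt(-1) is, and the remaining values +/-sqrt(-1) mean u/v is a nonresidue and decoding must fail.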
+ + mulp(s,u,s) + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// x4 = 0 <=> s^2 * w = 0 or 1 + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + bic x4, x0, #1 + orr x4, x4, x1 + orr x5, x2, x3 + orr x4, x4, x5 + +// x0 = 0 <=> s^2 * w = -1 (mod p_25519, i.e. s^2 * w = 2^255 - 20) + + add x0, x0, #20 + add x1, x1, #1 + orr x0, x0, x1 + add x2, x2, #1 + eor x3, x3, #0x7FFFFFFFFFFFFFFF + orr x2, x2, x3 + orr x0, x0, x2 + +// If s^2 * w is not 0 or 1 then replace s by t + + cmp x4, xzr + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + stp x10, x11, [s] + stp x12, x13, [s+16] + +// Check invalidity, occurring if s^2 * w is not in {0,1,-1} + + ccmp x0, xzr, 4, ne + cset x0, ne + orr badun, badun, x0 + +// Let [x3;x2;x1;x0] = s and [x7;x6;x5;x4] = p_25519 - s + + ldp x0, x1, [s] + ldp x2, x3, [s+16] + mov x4, #-19 + subs x4, x4, x0 + mov x6, #-1 + sbcs x5, x6, x1 + sbcs x6, x6, x2 + mov x7, #0x7FFFFFFFFFFFFFFF + sbc x7, x7, x3 + +// Decide whether a flip is apparently indicated, s_0 <=> sgnbit +// Decide also if s = 0 by OR-ing its digits. Now if a flip is indicated: +// - if s = 0 then mark as invalid +// - if s <> 0 then indeed flip + + and x9, x0, #1 + eor sgnbit, x9, sgnbit + orr x8, x0, x1 + orr x9, x2, x3 + orr x8, x8, x9 + orr x10, badun, sgnbit + cmp x8, xzr + csel badun, x10, badun, eq + ccmp sgnbit, xzr, #4, ne + +// Actual selection of x as s or -s, copying of y and return of validity + + csel x0, x0, x4, eq + csel x1, x1, x5, eq + csel x2, x2, x6, eq + csel x3, x3, x7, eq + ldp x8, x9, [y] + ldp x10, x11, [y+16] + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x8, x9, [res, #32] + stp x10, x11, [res, #48] + + mov x0, badun + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +edwards25519_decode_mul_p25519: + ldp x3, x4, [x1] + ldp x5, x6, [x2] + umull x7, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x8, w16, w17 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [x1, #16] + ldp x5, x6, [x2, #16] + umull x11, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x12, w16, w17 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + 
adc x14, x14, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [x1, #16] + ldp x15, x16, [x1] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, lo + ldp x15, x17, [x2] + subs x5, x15, x5 + sbcs x6, x17, x6 + csetm x17, lo + eor x3, x3, x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x17 + subs x5, x5, x17 + eor x6, x6, x17 + sbc x6, x6, x17 + eor x16, x17, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x17, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x9, lo + adds x15, x15, x17 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, lo + cinv x9, x9, lo + mul x5, x4, x6 + umulh x6, x4, x6 + adds x17, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #1 + eor x5, x5, x9 + adcs x17, x5, x17 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x17, x17, x16 + adcs x10, x17, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, x16 + adc x14, x14, x16 + mov x3, #38 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov x10, x4 + lsr x17, x14, #31 + mov x5, #19 + umaddl x5, w5, w17, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x17, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #19 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [x0] + stp x9, x10, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +edwards25519_decode_nsqr_p25519: + +// Copy input argument into [x13;x12;x11;x10] + + ldp x10, x11, [x2] + ldp x12, x13, [x2, #16] + +// Main squaring loop, accumulating in [x13;x12;x11;x10] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +edwards25519_decode_loop: + umull x2, w10, w10 + lsr x14, x10, #32 + umull x3, w14, w14 + umull x14, w10, w14 + adds x2, x2, x14, lsl #33 + lsr x14, x14, #31 + adc x3, x3, x14 + umull x4, w11, w11 + lsr x14, x11, #32 + umull x5, w14, w14 + umull x14, w11, w14 + mul x15, x10, x11 + umulh x16, x10, x11 + adds x4, x4, x14, lsl #33 + lsr x14, x14, #31 + adc x5, x5, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x5, x5, xzr + adds x3, x3, x15 + adcs x4, x4, x16 + adc x5, x5, xzr + umull x6, w12, w12 + lsr x14, x12, #32 + umull x7, w14, w14 + umull x14, w12, w14 + adds x6, x6, x14, lsl #33 + lsr x14, x14, #31 + adc x7, x7, x14 + umull x8, w13, w13 + lsr x14, x13, #32 + umull x9, w14, w14 + umull x14, w13, w14 + mul x15, x12, x13 + umulh x16, x12, x13 + adds x8, x8, x14, lsl #33 + lsr x14, x14, #31 + adc 
x9, x9, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x9, x9, xzr + adds x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, xzr + subs x10, x10, x12 + sbcs x11, x11, x13 + csetm x16, lo + eor x10, x10, x16 + subs x10, x10, x16 + eor x11, x11, x16 + sbc x11, x11, x16 + adds x6, x6, x4 + adcs x7, x7, x5 + adcs x8, x8, xzr + adc x9, x9, xzr + umull x12, w10, w10 + lsr x5, x10, #32 + umull x13, w5, w5 + umull x5, w10, w5 + adds x12, x12, x5, lsl #33 + lsr x5, x5, #31 + adc x13, x13, x5 + umull x15, w11, w11 + lsr x5, x11, #32 + umull x14, w5, w5 + umull x5, w11, w5 + mul x4, x10, x11 + umulh x16, x10, x11 + adds x15, x15, x5, lsl #33 + lsr x5, x5, #31 + adc x14, x14, x5 + adds x4, x4, x4 + adcs x16, x16, x16 + adc x14, x14, xzr + adds x13, x13, x4 + adcs x15, x15, x16 + adc x14, x14, xzr + adds x4, x2, x6 + adcs x5, x3, x7 + adcs x6, x6, x8 + adcs x7, x7, x9 + csetm x16, lo + subs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x15 + sbcs x7, x7, x14 + adcs x8, x8, x16 + adc x9, x9, x16 + mov x10, #38 + umull x12, w6, w10 + add x12, x12, w2, uxtw + lsr x2, x2, #32 + lsr x6, x6, #32 + umaddl x6, w6, w10, x2 + mov x2, x12 + umull x12, w7, w10 + add x12, x12, w3, uxtw + lsr x3, x3, #32 + lsr x7, x7, #32 + umaddl x7, w7, w10, x3 + mov x3, x12 + umull x12, w8, w10 + add x12, x12, w4, uxtw + lsr x4, x4, #32 + lsr x8, x8, #32 + umaddl x8, w8, w10, x4 + mov x4, x12 + umull x12, w9, w10 + add x12, x12, w5, uxtw + lsr x5, x5, #32 + lsr x9, x9, #32 + umaddl x9, w9, w10, x5 + mov x5, x12 + lsr x13, x9, #31 + mov x11, #19 + umull x11, w11, w13 + add x2, x2, x11 + adds x10, x2, x6, lsl #32 + extr x12, x7, x6, #32 + adcs x11, x3, x12 + extr x12, x8, x7, #32 + adcs x12, x4, x12 + extr x14, x9, x8, #32 + lsl x15, x13, #63 + eor x5, x5, x15 + adc x13, x5, x14 + +// Loop as applicable + + subs x1, x1, #1 + bne edwards25519_decode_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x10, #19 + adcs x7, x11, xzr + adcs x8, x12, xzr + adcs x9, x13, xzr + + csel x10, x10, x6, pl + csel x11, x11, x7, pl + csel x12, x12, x8, pl + csel x13, x13, x9, pl + bic x13, x13, #0x8000000000000000 + +// Copy result back into destination and return + + stp x10, x11, [x0] + stp x12, x13, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/curve25519/edwards25519_decode_alt.S b/arm/curve25519/edwards25519_decode_alt.S new file mode 100644 index 0000000000..c77a191744 --- /dev/null +++ b/arm/curve25519/edwards25519_decode_alt.S @@ -0,0 +1,563 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +// +// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8],uint8_t c[static 32]); +// +// This interprets the input byte string as a little-endian number +// representing a point (x,y) on the edwards25519 curve, encoded as +// 2^255 * x_0 + y where x_0 is the least significant bit of x. It +// returns the full pair of coordinates x (at z) and y (at z+4). The +// return code is 0 for success and 1 for failure, which means that +// the input does not correspond to the encoding of any edwards25519 +// point. 
This can happen for three reasons, where y = the lowest +// 255 bits of the input: +// +// * y >= p_25519 +// Input y coordinate is not reduced +// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root +// There is no x such that (x,y) is on the curve +// * y^2 = 1 and top bit of input is set +// Cannot be the canonical encoding of (0,1) or (0,-1) +// +// Standard ARM ABI: X0 = z, X1 = c +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_decode_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_decode_alt) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define y sp, #0 +#define s sp, #(4*N) +#define t sp, #(8*N) +#define u sp, #(12*N) +#define v sp, #(16*N) +#define w sp, #(20*N) + +// Other temporary variables in register + +#define res x19 +#define sgnbit x20 +#define badun x21 + +// Total size to reserve on the stack + +#define NSPACE #(24*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest; \ + add x1, src1; \ + add x2, src2; \ + bl edwards25519_decode_alt_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest; \ + mov x1, n; \ + add x2, src; \ + bl edwards25519_decode_alt_nsqr_p25519 + +S2N_BN_SYMBOL(edwards25519_decode_alt): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x30, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Load the inputs, using byte operations in case of big-endian setting. +// Let y be the lowest 255 bits of the input and sgnbit the desired parity. +// If y >= p_25519 then already flag the input as invalid (badun = 1). 
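A C-level model of this prelude may help (the helper names are invented; as in the assembly, the top limb is assumed to have bit 63 already cleared before the range check):

    #include <stdint.h>

    /* Assemble a 64-bit limb byte-by-byte so the load is independent of the
       host endianness (the ldrb/extr sequence below has the same effect). */
    static uint64_t load_le64(const uint8_t *b)
    {
      uint64_t w = 0;
      for (int i = 0; i < 8; i++)
        w |= (uint64_t)b[i] << (8 * i);
      return w;
    }

    /* 1 iff y >= p_25519 = 2^255 - 19, detected by whether y + 19 carries into
       bit 255; y[3] must already have bit 63 cleared, so that y < 2^255. */
    static int y_not_reduced(const uint64_t y[4])
    {
      uint64_t c = (y[0] + 19 < 19);   /* carry out of limb 0 of y + 19 */
      c = (y[1] + c < c);              /* propagate through limbs 1 and 2 */
      c = (y[2] + c < c);
      return (int)((y[3] + c) >> 63);  /* bit 255 of y + 19 */
    }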
+ + ldrb w0, [x1] + lsl x4, x0, #56 + ldrb w0, [x1, #1] + extr x4, x0, x4, #8 + ldrb w0, [x1, #2] + extr x4, x0, x4, #8 + ldrb w0, [x1, #3] + extr x4, x0, x4, #8 + ldrb w0, [x1, #4] + extr x4, x0, x4, #8 + ldrb w0, [x1, #5] + extr x4, x0, x4, #8 + ldrb w0, [x1, #6] + extr x4, x0, x4, #8 + ldrb w0, [x1, #7] + extr x4, x0, x4, #8 + + ldrb w0, [x1, #8] + lsl x5, x0, #56 + ldrb w0, [x1, #9] + extr x5, x0, x5, #8 + ldrb w0, [x1, #10] + extr x5, x0, x5, #8 + ldrb w0, [x1, #11] + extr x5, x0, x5, #8 + ldrb w0, [x1, #12] + extr x5, x0, x5, #8 + ldrb w0, [x1, #13] + extr x5, x0, x5, #8 + ldrb w0, [x1, #14] + extr x5, x0, x5, #8 + ldrb w0, [x1, #15] + extr x5, x0, x5, #8 + + ldrb w0, [x1, #16] + lsl x6, x0, #56 + ldrb w0, [x1, #17] + extr x6, x0, x6, #8 + ldrb w0, [x1, #18] + extr x6, x0, x6, #8 + ldrb w0, [x1, #19] + extr x6, x0, x6, #8 + ldrb w0, [x1, #20] + extr x6, x0, x6, #8 + ldrb w0, [x1, #21] + extr x6, x0, x6, #8 + ldrb w0, [x1, #22] + extr x6, x0, x6, #8 + ldrb w0, [x1, #23] + extr x6, x0, x6, #8 + + ldrb w0, [x1, #24] + lsl x7, x0, #56 + ldrb w0, [x1, #25] + extr x7, x0, x7, #8 + ldrb w0, [x1, #26] + extr x7, x0, x7, #8 + ldrb w0, [x1, #27] + extr x7, x0, x7, #8 + ldrb w0, [x1, #28] + extr x7, x0, x7, #8 + ldrb w0, [x1, #29] + extr x7, x0, x7, #8 + ldrb w0, [x1, #30] + extr x7, x0, x7, #8 + ldrb w0, [x1, #31] + extr x7, x0, x7, #8 + + stp x4, x5, [y] + lsr sgnbit, x7, #63 + and x7, x7, #0x7FFFFFFFFFFFFFFF + stp x6, x7, [y+16] + + adds xzr, x4, #19 + adcs xzr, x5, xzr + adcs xzr, x6, xzr + adcs xzr, x7, xzr + cset badun, mi + +// u = y^2 - 1 (actually y + 2^255-20, not reduced modulo) +// v = 1 + d * y^2 (not reduced modulo from the +1) +// w = u * v + + nsqr(v,1,y) + ldp x0, x1, [v] + ldp x2, x3, [v+16] + mov x4, #0x8000000000000000 + subs x0, x0, #20 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x3, x3, x4 + stp x0, x1, [u] + stp x2, x3, [u+16] + + movbig(x0,#0x75eb,#0x4dca,#0x1359,#0x78a3) + movbig(x1,#0x0070,#0x0a4d,#0x4141,#0xd8ab) + movbig(x2,#0x8cc7,#0x4079,#0x7779,#0xe898) + movbig(x3,#0x5203,#0x6cee,#0x2b6f,#0xfe73) + stp x0, x1, [w] + stp x2, x3, [w+16] + mulp(v,w,v) + ldp x0, x1, [v] + ldp x2, x3, [v+16] + adds x0, x0, #1 + adcs x1, x1, xzr + adcs x2, x2, xzr + adcs x3, x3, xzr + stp x0, x1, [v] + stp x2, x3, [v+16] + + mulp(w,u,v) + +// Get s = w^{252-3} as a candidate inverse square root 1/sqrt(w). +// This power tower computation is the same as bignum_invsqrt_p25519 + + nsqr(t,1,w) + mulp(t,t,w) + nsqr(s,2,t) + mulp(t,s,t) + nsqr(s,1,t) + mulp(v,s,w) + nsqr(s,5,v) + mulp(t,s,v) + nsqr(s,10,t) + mulp(t,s,t) + nsqr(s,5,t) + mulp(v,s,v) + nsqr(s,25,v) + mulp(t,s,v) + nsqr(s,50,t) + mulp(t,s,t) + nsqr(s,25,t) + mulp(v,s,v) + nsqr(s,125,v) + mulp(v,s,v) + nsqr(s,2,v) + mulp(s,s,w) + +// Compute v' = s^2 * w to discriminate whether the square root sqrt(u/v) +// exists, in which case we should get 0, 1 or -1. + + nsqr(v,1,s) + mulp(v,v,w) + +// Get the two candidates for sqrt(u / v), one being s = u * w^{252-3} +// and the other being t = s * j_25519 where j_25519 = sqrt(-1). + + mulp(s,u,s) + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// x4 = 0 <=> s^2 * w = 0 or 1 + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + bic x4, x0, #1 + orr x4, x4, x1 + orr x5, x2, x3 + orr x4, x4, x5 + +// x0 = 0 <=> s^2 * w = -1 (mod p_25519, i.e. 
s^2 * w = 2^255 - 20) + + add x0, x0, #20 + add x1, x1, #1 + orr x0, x0, x1 + add x2, x2, #1 + eor x3, x3, #0x7FFFFFFFFFFFFFFF + orr x2, x2, x3 + orr x0, x0, x2 + +// If s^2 * w is not 0 or 1 then replace s by t + + cmp x4, xzr + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + stp x10, x11, [s] + stp x12, x13, [s+16] + +// Check invalidity, occurring if s^2 * w is not in {0,1,-1} + + ccmp x0, xzr, 4, ne + cset x0, ne + orr badun, badun, x0 + +// Let [x3;x2;x1;x0] = s and [x7;x6;x5;x4] = p_25519 - s + + ldp x0, x1, [s] + ldp x2, x3, [s+16] + mov x4, #-19 + subs x4, x4, x0 + mov x6, #-1 + sbcs x5, x6, x1 + sbcs x6, x6, x2 + mov x7, #0x7FFFFFFFFFFFFFFF + sbc x7, x7, x3 + +// Decide whether a flip is apparently indicated, s_0 <=> sgnbit +// Decide also if s = 0 by OR-ing its digits. Now if a flip is indicated: +// - if s = 0 then mark as invalid +// - if s <> 0 then indeed flip + + and x9, x0, #1 + eor sgnbit, x9, sgnbit + orr x8, x0, x1 + orr x9, x2, x3 + orr x8, x8, x9 + orr x10, badun, sgnbit + cmp x8, xzr + csel badun, x10, badun, eq + ccmp sgnbit, xzr, #4, ne + +// Actual selection of x as s or -s, copying of y and return of validity + + csel x0, x0, x4, eq + csel x1, x1, x5, eq + csel x2, x2, x6, eq + csel x3, x3, x7, eq + ldp x8, x9, [y] + ldp x10, x11, [y+16] + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x8, x9, [res, #32] + stp x10, x11, [res, #48] + + mov x0, badun + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +edwards25519_decode_alt_mul_p25519: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + mov x7, #38 + mul x11, x7, x16 + umulh x9, x7, x16 + adds x12, x12, x11 + mul x11, x7, x3 + umulh x3, x7, x3 + adcs x13, x13, x11 + mul x11, x7, x4 + umulh x4, x7, x4 + adcs x14, x14, x11 + mul x11, x7, x5 + umulh x5, x7, x5 + adcs x15, x15, x11 + cset x16, hs + adds x15, x15, x4 + adc x16, x16, x5 + cmn x15, x15 + orr x15, x15, #0x8000000000000000 + adc x8, x16, x16 + mov x7, #19 + 
madd x11, x7, x8, x7 + adds x12, x12, x11 + adcs x13, x13, x9 + adcs x14, x14, x3 + adcs x15, x15, xzr + csel x7, x7, xzr, lo + subs x12, x12, x7 + sbcs x13, x13, xzr + sbcs x14, x14, xzr + sbc x15, x15, xzr + and x15, x15, #0x7fffffffffffffff + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +edwards25519_decode_alt_nsqr_p25519: + +// Copy input argument into [x5;x4;x3;x2] (overwriting input pointer x20 + + ldp x6, x3, [x2] + ldp x4, x5, [x2, #16] + mov x2, x6 + +// Main squaring loop, accumulating in [x5;x4;x3;x2] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +edwards25519_decode_alt_loop: + mul x9, x2, x3 + umulh x10, x2, x3 + mul x11, x2, x5 + umulh x12, x2, x5 + mul x7, x2, x4 + umulh x6, x2, x4 + adds x10, x10, x7 + adcs x11, x11, x6 + mul x7, x3, x4 + umulh x6, x3, x4 + adc x6, x6, xzr + adds x11, x11, x7 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x6 + mul x7, x3, x5 + umulh x6, x3, x5 + adc x6, x6, xzr + adds x12, x12, x7 + adcs x13, x13, x6 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x6, hs + umulh x7, x2, x2 + mul x8, x2, x2 + adds x9, x9, x7 + mul x7, x3, x3 + adcs x10, x10, x7 + umulh x7, x3, x3 + adcs x11, x11, x7 + mul x7, x4, x4 + adcs x12, x12, x7 + umulh x7, x4, x4 + adcs x13, x13, x7 + mul x7, x5, x5 + adcs x14, x14, x7 + umulh x7, x5, x5 + adc x6, x6, x7 + mov x3, #38 + mul x7, x3, x12 + umulh x4, x3, x12 + adds x8, x8, x7 + mul x7, x3, x13 + umulh x13, x3, x13 + adcs x9, x9, x7 + mul x7, x3, x14 + umulh x14, x3, x14 + adcs x10, x10, x7 + mul x7, x3, x6 + umulh x6, x3, x6 + adcs x11, x11, x7 + cset x12, hs + adds x11, x11, x14 + adc x12, x12, x6 + cmn x11, x11 + bic x11, x11, #0x8000000000000000 + adc x2, x12, x12 + mov x3, #0x13 + mul x7, x3, x2 + adds x2, x8, x7 + adcs x3, x9, x4 + adcs x4, x10, x13 + adc x5, x11, xzr + +// Loop as applicable + + subs x1, x1, #1 + bne edwards25519_decode_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x2, #19 + adcs x7, x3, xzr + adcs x8, x4, xzr + adcs x9, x5, xzr + + csel x2, x2, x6, pl + csel x3, x3, x7, pl + csel x4, x4, x8, pl + csel x5, x5, x9, pl + bic x5, x5, #0x8000000000000000 + +// Copy result back into destination and return + + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/curve25519/edwards25519_decode.S b/x86_att/curve25519/edwards25519_decode.S new file mode 100644 index 0000000000..05681925a3 --- /dev/null +++ b/x86_att/curve25519/edwards25519_decode.S @@ -0,0 +1,670 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
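Both the ARM subroutines above and the x86 ones below reduce the double-length products with the constants 38 and 19. The identities being used are simply

    2^256 = 2*(2^255 - 19) + 38   so   2^256 = 38 (mod p_25519)
    2^255 =   (2^255 - 19) + 19   so   2^255 = 19 (mod p_25519)

so the upper 256 bits of a product are folded back in after multiplication by 38, and a final carry out of bit 255 is folded back with 19; as the comments note, the squaring loops only keep intermediates below 2 * p_25519 = 2^256 - 38, with a strict reduction performed once at the end.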
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +// +// extern uint64_t edwards25519_decode(uint64_t z[static 8],uint8_t c[static 32]); +// +// This interprets the input byte string as a little-endian number +// representing a point (x,y) on the edwards25519 curve, encoded as +// 2^255 * x_0 + y where x_0 is the least significant bit of x. It +// returns the full pair of coordinates x (at z) and y (at z+4). The +// return code is 0 for success and 1 for failure, which means that +// the input does not correspond to the encoding of any edwards25519 +// point. This can happen for three reasons, where y = the lowest +// 255 bits of the input: +// +// * y >= p_25519 +// Input y coordinate is not reduced +// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root +// There is no x such that (x,y) is on the curve +// * y^2 = 1 and top bit of input is set +// Cannot be the canonical encoding of (0,1) or (0,-1) +// +// Standard x86-64 ABI: RDI = z, RSI = c +// Microsoft x64 ABI: RCX = z, RDX = c +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_decode) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_decode) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define y 0(%rsp) +#define s (4*N)(%rsp) +#define t (8*N)(%rsp) +#define u (12*N)(%rsp) +#define v (16*N)(%rsp) +#define w (20*N)(%rsp) +#define q (24*N)(%rsp) +#define res (28*N)(%rsp) +#define sgnbit (29*N)(%rsp) +#define badun (30*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (32*N) + +// Corrupted versions when stack is down 8 more + +#define q8 (25*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define Y 0 +#define S (4*N) +#define T (8*N) +#define U (12*N) +#define V (16*N) +#define W (20*N) +#define Q8 (25*N) + +S2N_BN_SYMBOL(edwards25519_decode): + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq edwards25519_decode_standard + popq %rsi + popq %rdi + ret + +edwards25519_decode_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Load the inputs, which can be done word-wise since x86 is little-endian. +// Let y be the lowest 255 bits of the input and sgnbit the desired parity. +// If y >= p_25519 then already flag the input as invalid (badun = 1). 
+ + movq (%rsi), %rax + movq %rax, Y(%rsp) + movq 8(%rsi), %rbx + movq %rbx, Y+8(%rsp) + xorl %ebp, %ebp + movq 16(%rsi), %rcx + movq %rcx, Y+16(%rsp) + movq 24(%rsi), %rdx + btr $63, %rdx + movq %rdx, Y+24(%rsp) + adcq %rbp, %rbp + movq %rbp, sgnbit + + addq $19, %rax + adcq $0, %rbx + adcq $0, %rcx + adcq $0, %rdx + shrq $63, %rdx + movq %rdx, badun + +// u = y^2 - 1 (actually y + 2^255-20, not reduced modulo) +// v = 1 + d * y^2 (not reduced modulo from the +1) +// w = u * v + + leaq V(%rsp), %rdi + movq $1, %rsi + leaq Y(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + movq V(%rsp), %rax + subq $20, %rax + movq V+8(%rsp), %rbx + sbbq $0, %rbx + movq V+16(%rsp), %rcx + sbbq $0, %rcx + movq V+24(%rsp), %rdx + sbbq $0, %rdx + btc $63, %rdx + movq %rax, U(%rsp) + movq %rbx, U+8(%rsp) + movq %rcx, U+16(%rsp) + movq %rdx, U+24(%rsp) + + movq $0x75eb4dca135978a3, %rax + movq %rax, W(%rsp) + movq $0x00700a4d4141d8ab, %rax + movq %rax, W+8(%rsp) + movq $0x8cc740797779e898, %rax + movq %rax, W+16(%rsp) + movq $0x52036cee2b6ffe73, %rax + movq %rax, W+24(%rsp) + leaq V(%rsp), %rdi + leaq W(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + movq V(%rsp), %rax + addq $1, %rax + movq V+8(%rsp), %rbx + adcq $0, %rbx + movq V+16(%rsp), %rcx + adcq $0, %rcx + movq V+24(%rsp), %rdx + adcq $0, %rdx + movq %rax, V(%rsp) + movq %rbx, V+8(%rsp) + movq %rcx, V+16(%rsp) + movq %rdx, V+24(%rsp) + + leaq W(%rsp), %rdi + leaq U(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + +// Get s = w^{252-3} as a candidate inverse square root 1/sqrt(w). +// This power tower computation is the same as bignum_invsqrt_p25519 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $125, %rsi + 
leaq V(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq S(%rsp), %rdi + leaq S(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + +// Compute v' = s^2 * w to discriminate whether the square root sqrt(u/v) +// exists, in which case we should get 0, 1 or -1. + + leaq V(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq edwards25519_decode_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq V(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + +// Get the two candidates for sqrt(u / v), one being s = u * w^{252-3} +// and the other being t = s * j_25519 where j_25519 = sqrt(-1). + + leaq S(%rsp), %rdi + leaq U(%rsp), %rsi + leaq S(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_mul_p25519 + +// %rax = 0 <=> s^2 * w = 0 or 1 + + movq V(%rsp), %r8 + movq V+8(%rsp), %r9 + movq V+16(%rsp), %r10 + movq V+24(%rsp), %r11 + movl $1, %eax + notq %rax + andq %r8, %rax + orq %r9, %rax + orq %r10, %rax + orq %r11, %rax + +// %r8 = 0 <=> s^2 * w = -1 (mod p_25519, i.e. s^2 * w = 2^255 - 20) + + addq $20, %r8 + notq %r9 + notq %r10 + bts $63, %r11 + addq $1, %r11 + orq %r9, %r8 + orq %r11, %r10 + orq %r10, %r8 + +// If s^2 * w is not 0 or 1 then replace s by t + + testq %rax, %rax + + movq S(%rsp), %r12 + movq T(%rsp), %rbx + cmovnzq %rbx, %r12 + movq S+8(%rsp), %r13 + movq T+8(%rsp), %rbx + cmovnzq %rbx, %r13 + movq S+16(%rsp), %r14 + movq T+16(%rsp), %rbx + cmovnzq %rbx, %r14 + movq S+24(%rsp), %r15 + movq T+24(%rsp), %rbx + cmovnzq %rbx, %r15 + movq %r12, S(%rsp) + movq %r13, S+8(%rsp) + movq %r14, S+16(%rsp) + movq %r15, S+24(%rsp) + +// Check invalidity, occurring if s^2 * w is not in {0,1,-1} + + cmovzq %rax, %r8 + negq %r8 + sbbq %r8, %r8 + negq %r8 + orq %r8, badun + +// Let [%r11;%r10;%r9;%r8] = s and [%r15;%r14;%r13;%r12] = p_25519 - s + + movq S(%rsp), %r8 + movq $-19, %r12 + subq %r8, %r12 + movq S+8(%rsp), %r9 + movq $-1, %r13 + sbbq %r9, %r13 + movq S+16(%rsp), %r10 + movq $-1, %r14 + sbbq %r10, %r14 + movq S+24(%rsp), %r11 + movq $0x7FFFFFFFFFFFFFFF, %r15 + sbbq %r11, %r15 + +// Decide whether a flip is apparently indicated, s_0 <=> sgnbit +// Decide also if s = 0 by OR-ing its digits. 
Now if a flip is indicated: +// - if s = 0 then mark as invalid +// - if s <> 0 then indeed flip + + movl $1, %ecx + andq %r8, %rcx + xorq sgnbit, %rcx + movq badun, %rdx + movq %rdx, %rsi + orq %rcx, %rdx + xorl %ebp, %ebp + movq %r8, %rax + movq %r9, %rbx + orq %r10, %rax + orq %r11, %rbx + orq %rbx, %rax + cmovzq %rbp, %rcx + cmovnzq %rsi, %rdx + +// Actual selection of x as s or -s, copying of y and return of validity + + testq %rcx, %rcx + + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq res, %rdi + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq Y(%rsp), %rcx + movq %rcx, 32(%rdi) + movq Y+8(%rsp), %rcx + movq %rcx, 40(%rdi) + movq Y+16(%rsp), %rcx + movq %rcx, 48(%rdi) + movq Y+24(%rsp), %rcx + movq %rcx, 56(%rdi) + + movq %rdx, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +edwards25519_decode_mul_p25519: + movq %rdx, %rcx + xorl %ebp, %ebp + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rbp, %r12 + xorl %ebp, %ebp + movq 0x8(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + xorl %ebp, %ebp + movq 0x10(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + movq 0x18(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rcx, %r15 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movl $0x26, %edx + mulxq %r15, %rax, %rbx + adcxq %rcx, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + addq %r11, %rax + adcq %rbp, %rbx + btq $0x3f, %rax + adcq %rbx, %rbx + leaq 0x1(%rbx), %rcx + imulq $0x13, %rcx, %rcx + xorl %ebp, %ebp + adoxq %rcx, %r8 + mulxq %r12, %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq %r13, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r14, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq %r15, %rax, %rbx + adcq %rax, %r11 + shlq $0x3f, %rcx + cmpq %rcx, %r11 + movl $0x13, %eax + cmovns %rbp, %rax + subq %rax, %r8 + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rbp, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +edwards25519_decode_nsqr_p25519: + +// Copy input argument into q + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, Q8(%rsp) + movq %rbx, Q8+8(%rsp) + movq %rcx, Q8+16(%rsp) + movq %rdx, Q8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and 
+// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +edwards25519_decode_loop: + movq Q8(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq Q8+0x8(%rsp), %r9, %r10 + mulxq Q8+0x18(%rsp), %r11, %r12 + movq Q8+0x10(%rsp), %rdx + mulxq Q8+0x18(%rsp), %r13, %r14 + xorl %ebx, %ebx + mulxq Q8(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq Q8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + movq Q8+0x18(%rsp), %rdx + mulxq Q8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rbx, %r13 + adoxq %rbx, %r14 + adcq %rbx, %r14 + xorl %ebx, %ebx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq Q8+0x8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq Q8+0x10(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq Q8+0x18(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbx, %r15 + adoxq %rbx, %r15 + movl $0x26, %edx + xorl %ebx, %ebx + mulxq %r12, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq %r13, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r14, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq %r15, %rax, %r12 + adcxq %rax, %r11 + adoxq %rbx, %r12 + adcxq %rbx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rbx, %r9 + adcq %rbx, %r10 + adcq %rbx, %r11 + movq %r8, Q8(%rsp) + movq %r9, Q8+0x8(%rsp) + movq %r10, Q8+0x10(%rsp) + movq %r11, Q8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz edwards25519_decode_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. + + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/curve25519/edwards25519_decode_alt.S b/x86_att/curve25519/edwards25519_decode_alt.S new file mode 100644 index 0000000000..570b2f9081 --- /dev/null +++ b/x86_att/curve25519/edwards25519_decode_alt.S @@ -0,0 +1,751 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +// +// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8],uint8_t c[static 32]); +// +// This interprets the input byte string as a little-endian number +// representing a point (x,y) on the edwards25519 curve, encoded as +// 2^255 * x_0 + y where x_0 is the least significant bit of x. It +// returns the full pair of coordinates x (at z) and y (at z+4). The +// return code is 0 for success and 1 for failure, which means that +// the input does not correspond to the encoding of any edwards25519 +// point. 
This can happen for three reasons, where y = the lowest +// 255 bits of the input: +// +// * y >= p_25519 +// Input y coordinate is not reduced +// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root +// There is no x such that (x,y) is on the curve +// * y^2 = 1 and top bit of input is set +// Cannot be the canonical encoding of (0,1) or (0,-1) +// +// Standard x86-64 ABI: RDI = z, RSI = c +// Microsoft x64 ABI: RCX = z, RDX = c +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_decode_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_decode_alt) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define y 0(%rsp) +#define s (4*N)(%rsp) +#define t (8*N)(%rsp) +#define u (12*N)(%rsp) +#define v (16*N)(%rsp) +#define w (20*N)(%rsp) +#define q (24*N)(%rsp) +#define res (28*N)(%rsp) +#define sgnbit (29*N)(%rsp) +#define badun (30*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (32*N) + +// Corrupted versions when stack is down 8 more + +#define q8 (25*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define Y 0 +#define S (4*N) +#define T (8*N) +#define U (12*N) +#define V (16*N) +#define W (20*N) +#define Q8 (25*N) + +S2N_BN_SYMBOL(edwards25519_decode_alt): + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq edwards25519_decode_alt_standard + popq %rsi + popq %rdi + ret + +edwards25519_decode_alt_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Load the inputs, which can be done word-wise since x86 is little-endian. +// Let y be the lowest 255 bits of the input and sgnbit the desired parity. +// If y >= p_25519 then already flag the input as invalid (badun = 1). 
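As a plain C illustration of the unpacking and range check just described (the helper name decode_header and the bare uint64_t limb arrays are made up for this sketch, and no constant-time behaviour is claimed; the assembly below is the authoritative version), the same steps can be written as:

    #include <stdint.h>

    /* Unpack the 32 input bytes (little-endian) into y, return the desired
       x parity through *sgnbit, and return 1 exactly when y >= p_25519,
       using y >= 2^255 - 19  <=>  y + 19 >= 2^255, i.e. bit 255 of y + 19. */
    static uint64_t decode_header(const uint8_t c[32], uint64_t y[4],
                                  uint64_t *sgnbit)
    {
        for (int i = 0; i < 4; i++) {
            uint64_t w = 0;
            for (int j = 0; j < 8; j++) w |= (uint64_t)c[8*i + j] << (8*j);
            y[i] = w;
        }
        *sgnbit = y[3] >> 63;             /* top bit of the encoding       */
        y[3] &= 0x7FFFFFFFFFFFFFFFULL;    /* y = low 255 bits              */
        uint64_t t = y[0] + 19, cy = (t < 19);
        t = y[1] + cy; cy = (t < cy);
        t = y[2] + cy; cy = (t < cy);
        t = y[3] + cy;
        return t >> 63;                   /* badun: y not reduced mod p    */
    }
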
+ + movq (%rsi), %rax + movq %rax, Y(%rsp) + movq 8(%rsi), %rbx + movq %rbx, Y+8(%rsp) + xorl %ebp, %ebp + movq 16(%rsi), %rcx + movq %rcx, Y+16(%rsp) + movq 24(%rsi), %rdx + btr $63, %rdx + movq %rdx, Y+24(%rsp) + adcq %rbp, %rbp + movq %rbp, sgnbit + + addq $19, %rax + adcq $0, %rbx + adcq $0, %rcx + adcq $0, %rdx + shrq $63, %rdx + movq %rdx, badun + +// u = y^2 - 1 (actually y + 2^255-20, not reduced modulo) +// v = 1 + d * y^2 (not reduced modulo from the +1) +// w = u * v + + leaq V(%rsp), %rdi + movq $1, %rsi + leaq Y(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + movq V(%rsp), %rax + subq $20, %rax + movq V+8(%rsp), %rbx + sbbq $0, %rbx + movq V+16(%rsp), %rcx + sbbq $0, %rcx + movq V+24(%rsp), %rdx + sbbq $0, %rdx + btc $63, %rdx + movq %rax, U(%rsp) + movq %rbx, U+8(%rsp) + movq %rcx, U+16(%rsp) + movq %rdx, U+24(%rsp) + + movq $0x75eb4dca135978a3, %rax + movq %rax, W(%rsp) + movq $0x00700a4d4141d8ab, %rax + movq %rax, W+8(%rsp) + movq $0x8cc740797779e898, %rax + movq %rax, W+16(%rsp) + movq $0x52036cee2b6ffe73, %rax + movq %rax, W+24(%rsp) + leaq V(%rsp), %rdi + leaq W(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + movq V(%rsp), %rax + addq $1, %rax + movq V+8(%rsp), %rbx + adcq $0, %rbx + movq V+16(%rsp), %rcx + adcq $0, %rcx + movq V+24(%rsp), %rdx + adcq $0, %rdx + movq %rax, V(%rsp) + movq %rbx, V+8(%rsp) + movq %rcx, V+16(%rsp) + movq %rdx, V+24(%rsp) + + leaq W(%rsp), %rdi + leaq U(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + +// Get s = w^{252-3} as a candidate inverse square root 1/sqrt(w). +// This power tower computation is the same as bignum_invsqrt_p25519 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + 
callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq S(%rsp), %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq V(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq S(%rsp), %rdi + leaq S(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + +// Compute v' = s^2 * w to discriminate whether the square root sqrt(u/v) +// exists, in which case we should get 0, 1 or -1. + + leaq V(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq edwards25519_decode_alt_nsqr_p25519 + + leaq V(%rsp), %rdi + leaq V(%rsp), %rsi + leaq W(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + +// Get the two candidates for sqrt(u / v), one being s = u * w^{252-3} +// and the other being t = s * j_25519 where j_25519 = sqrt(-1). + + leaq S(%rsp), %rdi + leaq U(%rsp), %rsi + leaq S(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq edwards25519_decode_alt_mul_p25519 + +// %rax = 0 <=> s^2 * w = 0 or 1 + + movq V(%rsp), %r8 + movq V+8(%rsp), %r9 + movq V+16(%rsp), %r10 + movq V+24(%rsp), %r11 + movl $1, %eax + notq %rax + andq %r8, %rax + orq %r9, %rax + orq %r10, %rax + orq %r11, %rax + +// %r8 = 0 <=> s^2 * w = -1 (mod p_25519, i.e. s^2 * w = 2^255 - 20) + + addq $20, %r8 + notq %r9 + notq %r10 + bts $63, %r11 + addq $1, %r11 + orq %r9, %r8 + orq %r11, %r10 + orq %r10, %r8 + +// If s^2 * w is not 0 or 1 then replace s by t + + testq %rax, %rax + + movq S(%rsp), %r12 + movq T(%rsp), %rbx + cmovnzq %rbx, %r12 + movq S+8(%rsp), %r13 + movq T+8(%rsp), %rbx + cmovnzq %rbx, %r13 + movq S+16(%rsp), %r14 + movq T+16(%rsp), %rbx + cmovnzq %rbx, %r14 + movq S+24(%rsp), %r15 + movq T+24(%rsp), %rbx + cmovnzq %rbx, %r15 + movq %r12, S(%rsp) + movq %r13, S+8(%rsp) + movq %r14, S+16(%rsp) + movq %r15, S+24(%rsp) + +// Check invalidity, occurring if s^2 * w is not in {0,1,-1} + + cmovzq %rax, %r8 + negq %r8 + sbbq %r8, %r8 + negq %r8 + orq %r8, badun + +// Let [%r11;%r10;%r9;%r8] = s and [%r15;%r14;%r13;%r12] = p_25519 - s + + movq S(%rsp), %r8 + movq $-19, %r12 + subq %r8, %r12 + movq S+8(%rsp), %r9 + movq $-1, %r13 + sbbq %r9, %r13 + movq S+16(%rsp), %r10 + movq $-1, %r14 + sbbq %r10, %r14 + movq S+24(%rsp), %r11 + movq $0x7FFFFFFFFFFFFFFF, %r15 + sbbq %r11, %r15 + +// Decide whether a flip is apparently indicated, s_0 <=> sgnbit +// Decide also if s = 0 by OR-ing its digits. 
Now if a flip is indicated: +// - if s = 0 then mark as invalid +// - if s <> 0 then indeed flip + + movl $1, %ecx + andq %r8, %rcx + xorq sgnbit, %rcx + movq badun, %rdx + movq %rdx, %rsi + orq %rcx, %rdx + xorl %ebp, %ebp + movq %r8, %rax + movq %r9, %rbx + orq %r10, %rax + orq %r11, %rbx + orq %rbx, %rax + cmovzq %rbp, %rcx + cmovnzq %rsi, %rdx + +// Actual selection of x as s or -s, copying of y and return of validity + + testq %rcx, %rcx + + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq res, %rdi + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq Y(%rsp), %rcx + movq %rcx, 32(%rdi) + movq Y+8(%rsp), %rcx + movq %rcx, 40(%rdi) + movq Y+16(%rsp), %rcx + movq %rcx, 48(%rdi) + movq Y+24(%rsp), %rcx + movq %rcx, 56(%rdi) + + movq %rdx, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +edwards25519_decode_alt_mul_p25519: + movq %rdx, %rcx + movq (%rsi), %rax + mulq (%rcx) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsi), %rax + mulq (%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq (%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsi), %rax + mulq (%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsi), %rax + mulq (%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %esi + movq %r12, %rax + mulq %rsi + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rsi + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + leaq 0x1(%r12), %rax + movl $0x13, %esi + bts $0x3f, %r11 + imulq %rsi, %rax + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + sbbq %rax, %rax + notq %rax + andq %rsi, %rax + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rcx, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + 
ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +edwards25519_decode_alt_nsqr_p25519: + +// Copy input argument into q + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, Q8(%rsp) + movq %rbx, Q8+8(%rsp) + movq %rcx, Q8+16(%rsp) + movq %rdx, Q8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +edwards25519_decode_alt_loop: + movq Q8(%rsp), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq Q8(%rsp), %rax + mulq Q8+0x8(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq Q8+0x8(%rsp), %rax + mulq %rax + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq Q8(%rsp), %rax + mulq Q8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq Q8(%rsp), %rax + mulq Q8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq Q8+0x8(%rsp), %rax + mulq Q8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq Q8+0x8(%rsp), %rax + mulq Q8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq Q8+0x10(%rsp), %rax + mulq %rax + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq Q8+0x10(%rsp), %rax + mulq Q8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq Q8+0x18(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %ebx + movq %r12, %rax + mulq %rbx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rbx + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + movq %r8, Q8(%rsp) + movq %r9, Q8+0x8(%rsp) + movq %r10, Q8+0x10(%rsp) + movq %r11, Q8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz edwards25519_decode_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. 
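The same final correction step, in plain C for reference (reduce_strict is an invented name, and the selection is written with an ordinary conditional rather than the branch-free cmovns sequence used below):

    #include <stdint.h>

    /* Strictly reduce x < 2 * p_25519 modulo p_25519 = 2^255 - 19. Compute
       x + 19; if it reaches 2^255 (the "s" case) keep it and clear bit 255,
       which subtracts p_25519 overall; otherwise (the "ns" case) keep x.   */
    static void reduce_strict(uint64_t z[4], const uint64_t x[4])
    {
        uint64_t t[4];
        uint64_t c = (x[0] + 19 < 19);
        t[0] = x[0] + 19;
        t[1] = x[1] + c; c = (t[1] < c);
        t[2] = x[2] + c; c = (t[2] < c);
        t[3] = x[3] + c;
        int ns = !(t[3] >> 63);                /* x + 19 < 2^255 ?         */
        for (int i = 0; i < 4; i++) z[i] = ns ? x[i] : t[i];
        z[3] &= 0x7FFFFFFFFFFFFFFFULL;         /* no-op in the "ns" case   */
    }
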
+ + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif From d9d1d380f603a9a8a9f9a9dae4f824c749e9565f Mon Sep 17 00:00:00 2001 From: John Harrison Date: Tue, 17 Oct 2023 17:36:25 -0700 Subject: [PATCH 04/24] Add generic size curve25519/edwards25519 basepoint modulus The function bignum_mod_n25519 performs reduction of an input of any size (k digits) modulo the order of the curve25519/edwards25519 basepoint, n_25519 = 2^252 + 27742317777372353535851937790883648493. It generalizes bignum_mod_n25519_4, which is the special case of 4-digit (256-bit) inputs. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/e23fd300aab6a28455133d495b458074d5d810f1 --- arm/curve25519/bignum_mod_n25519.S | 186 ++++++++++++++++++++ x86_att/curve25519/bignum_mod_n25519.S | 228 +++++++++++++++++++++++++ 2 files changed, 414 insertions(+) create mode 100644 arm/curve25519/bignum_mod_n25519.S create mode 100644 x86_att/curve25519/bignum_mod_n25519.S diff --git a/arm/curve25519/bignum_mod_n25519.S b/arm/curve25519/bignum_mod_n25519.S new file mode 100644 index 0000000000..5a256ed133 --- /dev/null +++ b/arm/curve25519/bignum_mod_n25519.S @@ -0,0 +1,186 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[k]; output z[4] +// +// extern void bignum_mod_n25519 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the order of the curve25519/edwards25519 basepoint, +// which is n_25519 = 2^252 + 27742317777372353535851937790883648493 +// +// Standard ARM ABI: X0 = z, X1 = k, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n25519) + .text + .balign 4 + +#define z x0 +#define k x1 +#define x x2 + +#define m0 x3 +#define m1 x4 +#define m2 x5 +#define m3 x6 + +#define t0 x7 +#define t1 x8 +#define t2 x9 +#define t3 x10 + +#define n0 x11 +#define n1 x12 + +// These two are aliased: we only load d when finished with q + +#define q x13 +#define d x13 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_n25519): + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmp k, #4 + bcc short + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 +// This [m3;m2;m1;m0] is the initial x where we begin reduction. + + sub k, k, #4 + lsl t0, k, #3 + add t0, t0, x + ldp m2, m3, [t0, #16] + ldp m0, m1, [t0] + +// Load the complicated two words of n_25519 = 2^252 + [n1; n0] + + movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed) + movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6) + +// Get the quotient estimate q = floor(x/2^252). 
+// Also delete it from m3, in effect doing x' = x - q * 2^252 + + lsr q, m3, #60 + and m3, m3, #0x0FFFFFFFFFFFFFFF + +// Multiply [t2;t1;t0] = q * [n1;n0] + + mul t0, n0, q + mul t1, n1, q + umulh t2, n0, q + adds t1, t1, t2 + umulh t2, n1, q + adc t2, t2, xzr + +// Subtract [m3;m2;m1;m0] = x' - q * [n1;n0] = x - q * n_25519 + + subs m0, m0, t0 + sbcs m1, m1, t1 + sbcs m2, m2, t2 + sbcs m3, m3, xzr + +// If this borrows (CF = 0 because of inversion), add back n_25519. +// The masked n3 digit exploits the fact that bit 60 of n0 is set. + + csel t0, n0, xzr, cc + csel t1, n1, xzr, cc + adds m0, m0, t0 + adcs m1, m1, t1 + and t0, t0, #0x1000000000000000 + adcs m2, m2, xzr + adc m3, m3, t0 + +// Now do (k-4) iterations of 5->4 word modular reduction. Each one +// is similar to the sequence above except for the more refined quotient +// estimation process. + + cbz k, writeback + +loop: + +// Assume that the new 5-digit x is 2^64 * previous_x + next_digit. +// Get the quotient estimate q = max (floor(x/2^252)) (2^64 - 1) +// and first compute x' = x - 2^252 * q. + + extr q, m3, m2, #60 + and m2, m2, #0x0FFFFFFFFFFFFFFF + sub q, q, m3, lsr #60 + and m3, m3, #0xF000000000000000 + add m2, m2, m3 + +// Multiply [t2;t1;t0] = q * [n1;n0] + + mul t0, n0, q + mul t1, n1, q + umulh t2, n0, q + adds t1, t1, t2 + umulh t2, n1, q + adc t2, t2, xzr + +// Decrement k and load the next digit (note that d aliases to q) + + sub k, k, #1 + ldr d, [x, k, lsl #3] + +// Subtract [t3;t2;t1;t0] = x' - q * [n1;n0] = x - q * n_25519 + + subs t0, d, t0 + sbcs t1, m0, t1 + sbcs t2, m1, t2 + sbcs t3, m2, xzr + +// If this borrows (CF = 0 because of inversion), add back n_25519. +// The masked n3 digit exploits the fact that bit 60 of n1 is set. + + csel m0, n0, xzr, cc + csel m1, n1, xzr, cc + adds m0, t0, m0 + and m3, m1, #0x1000000000000000 + adcs m1, t1, m1 + adcs m2, t2, xzr + adc m3, t3, m3 + + cbnz k, loop + +// Finally write back [m3;m2;m1;m0] and return + +writeback: + stp m0, m1, [z] + stp m2, m3, [z, #16] + ret + +// Short case: just copy the input with zero-padding + +short: + mov m0, xzr + mov m1, xzr + mov m2, xzr + mov m3, xzr + + cbz k, writeback + ldr m0, [x] + subs k, k, #1 + beq writeback + ldr m1, [x, #8] + subs k, k, #1 + beq writeback + ldr m2, [x, #16] + b writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/x86_att/curve25519/bignum_mod_n25519.S b/x86_att/curve25519/bignum_mod_n25519.S new file mode 100644 index 0000000000..c45d99b541 --- /dev/null +++ b/x86_att/curve25519/bignum_mod_n25519.S @@ -0,0 +1,228 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[k]; output z[4] +// +// extern void bignum_mod_n25519 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the order of the curve25519/edwards25519 basepoint, +// which is n_25519 = 2^252 + 27742317777372353535851937790883648493 +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n25519) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define q %rbx + +S2N_BN_SYMBOL(bignum_mod_n25519): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %rbp + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 +// This [m3;m2;m1;m0] is the initial x where we begin reduction. + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies + + movq %rdx, x + +// Get the quotient estimate q = floor(x/2^252). +// Also delete it from m3, in effect doing x' = x - q * 2^252 + + movq m3, q + shrq $60, q + + shlq $4, m3 + shrq $4, m3 + +// Let [%rdx;d;%rbp] = q * (n_25519 - 2^252) + + movq $0x5812631a5cf5d3ed, %rax + mulq q + movq %rax, %rbp + movq %rdx, d + + movq $0x14def9dea2f79cd6, %rax + mulq q + addq %rax, d + adcq $0, %rdx + +// Subtract to get x' - q * (n_25519 - 2^252) = x - q * n_25519 + + subq %rbp, m0 + sbbq d, m1 + sbbq %rdx, m2 + sbbq $0, m3 + +// Get a bitmask for the borrow and create a masked version of +// non-trivial digits of [%rbx;0;%rdx;%rax] = n_25519, then add it. +// The masked n3 digit exploits the fact that bit 60 of n0 is set. + + sbbq %rbx, %rbx + + movq $0x5812631a5cf5d3ed, %rax + andq %rbx, %rax + movq $0x14def9dea2f79cd6, %rdx + andq %rbx, %rdx + movq $0x1000000000000000, %rbx + andq %rax, %rbx + + addq %rax, m0 + adcq %rdx, m1 + adcq $0, m2 + adcq %rbx, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction. Each one +// is similar to the sequence above except for the more refined quotient +// estimation process. + + testq k, k + jz writeback + +loop: + +// Assume that the new 5-digit x is 2^64 * previous_x + next_digit. +// Get the quotient estimate q = max (floor(x/2^252)) (2^64 - 1) +// and first compute x' = x - 2^252 * q. + + movq m3, q + shldq $4, m2, q + shrq $60, m3 + subq m3, q + shlq $4, m2 + shrdq $4, m3, m2 + +// Let [%rdx;m3;%rbp] = q * (n_25519 - 2^252) + + movq $0x5812631a5cf5d3ed, %rax + mulq q + movq %rax, %rbp + movq %rdx, m3 + + movq $0x14def9dea2f79cd6, %rax + mulq q + addq %rax, m3 + adcq $0, %rdx + +// Load the next digit + + movq -8(x,k,8), d + +// Subtract to get x' - q * (n_25519 - 2^252) = x - q * n_25519 + + subq %rbp, d + sbbq m3, m0 + sbbq %rdx, m1 + sbbq $0, m2 + +// Get a bitmask for the borrow and create a masked version of +// non-trivial digits of [%rbx;0;%rdx;%rax] = n_25519, then add it. 
+// The masked n3 digit exploits the fact that bit 60 of n0 is set. + + sbbq %rbx, %rbx + + movq $0x5812631a5cf5d3ed, %rax + andq %rbx, %rax + movq $0x14def9dea2f79cd6, %rdx + andq %rbx, %rdx + movq $0x1000000000000000, %rbx + andq %rax, %rbx + + addq %rax, d + adcq %rdx, m0 + adcq $0, m1 + adcq %rbx, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz loop + +// Write back + +writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz writeback + movq (%rdx), m0 + decq k + jz writeback + movq 8(%rdx), m1 + decq k + jz writeback + movq 16(%rdx), m2 + jmp writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif From ee9dd974a9bc16b96221d0d40b404c9d9cdc14e6 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 27 Oct 2023 20:14:40 -0700 Subject: [PATCH 05/24] Switch curve25519 operations to divstep-based modular inverse This replaces the inlined variant of "bignum_modinv" with code from "bignum_inv_p25519" in all "curve25519_" functions returning an affine point and hence using modular inverse. There are also a few consequential changes related to the slightly different amount of temporary storage needed by this function. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/777d5745b1b71f8c311d32bb922399653ffb8df3 --- arm/curve25519/curve25519_x25519.S | 1392 +++++++++---- arm/curve25519/curve25519_x25519_alt.S | 1392 +++++++++---- arm/curve25519/curve25519_x25519_byte.S | 1392 +++++++++---- arm/curve25519/curve25519_x25519_byte_alt.S | 1392 +++++++++---- arm/curve25519/curve25519_x25519base.S | 1394 +++++++++---- arm/curve25519/curve25519_x25519base_alt.S | 1394 +++++++++---- arm/curve25519/curve25519_x25519base_byte.S | 1394 +++++++++---- .../curve25519_x25519base_byte_alt.S | 1397 +++++++++---- x86_att/curve25519/curve25519_x25519.S | 1754 +++++++++++++---- x86_att/curve25519/curve25519_x25519_alt.S | 1754 +++++++++++++---- x86_att/curve25519/curve25519_x25519base.S | 1744 ++++++++++++---- .../curve25519/curve25519_x25519base_alt.S | 1750 ++++++++++++---- 12 files changed, 13721 insertions(+), 4428 deletions(-) diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S index d66884d5d4..26d96c2617 100644 --- a/arm/curve25519/curve25519_x25519.S +++ b/arm/curve25519/curve25519_x25519.S @@ -849,356 +849,1046 @@ curve25519_x25519_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. - - mov x0, #-19 - mov x1, #-1 - mov x2, #0x7fffffffffffffff - stp x0, x1, [sn] - stp x1, x2, [sn+16] - -// Prepare to call the modular inverse function to get zm = 1/zn - - mov x0, #4 - add x1, zm - add x2, zn - add x3, sn - add x4, p - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_wmontend -curve25519_x25519_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_wmontloop -curve25519_x25519_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, 
x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_zmontend -curve25519_x25519_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_zmontloop -curve25519_x25519_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_negskip1 -curve25519_x25519_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_negloop1 -curve25519_x25519_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_negskip2 -curve25519_x25519_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_negloop2 -curve25519_x25519_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519_zfliploop: - ldr x11, [x3, 
x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519_outerloop +// Prepare to call the modular inverse function to get xm = 1/zn + + add x0, xm + add x1, zn + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519_invmidloop +curve25519_x25519_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, 
x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr 
x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, 
#21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge 
+ cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a @@ -1221,7 +1911,7 @@ curve25519_x25519_zfliploop: // Now the result is xn * (1/zn), fully reduced modulo p. 
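At C level, the zeroing step mentioned above amounts to masking xn with an all-ones word derived from whether zn is nonzero; a small sketch (invented helper name, and the nonzero test is left to the compiler here, whereas the assembly keeps the whole sequence branch-free):

    #include <stdint.h>

    /* Force xn to zero when zn is zero, so the point at infinity maps to an
       all-zero result before the final multiplication by 1/zn.             */
    static void force_zero_at_infinity(uint64_t xn[4], const uint64_t zn[4])
    {
        uint64_t any = zn[0] | zn[1] | zn[2] | zn[3];
        uint64_t mask = (uint64_t)0 - (uint64_t)(any != 0);  /* 0 or all ones */
        for (int i = 0; i < 4; i++) xn[i] &= mask;
    }
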
- mul_p25519(resx,xn,zm) + mul_p25519(resx,xn,xm) // Restore stack and registers diff --git a/arm/curve25519/curve25519_x25519_alt.S b/arm/curve25519/curve25519_x25519_alt.S index 4e9b91b48e..858e74185b 100644 --- a/arm/curve25519/curve25519_x25519_alt.S +++ b/arm/curve25519/curve25519_x25519_alt.S @@ -633,356 +633,1046 @@ curve25519_x25519_alt_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. - - mov x0, #-19 - mov x1, #-1 - mov x2, #0x7fffffffffffffff - stp x0, x1, [sn] - stp x1, x2, [sn+16] - -// Prepare to call the modular inverse function to get zm = 1/zn - - mov x0, #4 - add x1, zm - add x2, zn - add x3, sn - add x4, p - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - 
mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_alt_wmontend -curve25519_x25519_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_wmontloop -curve25519_x25519_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_alt_zmontend -curve25519_x25519_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_zmontloop -curve25519_x25519_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_alt_negskip1 -curve25519_x25519_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - 
extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_alt_negloop1 -curve25519_x25519_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_alt_negskip2 -curve25519_x25519_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_alt_negloop2 -curve25519_x25519_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_alt_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519_alt_outerloop +// Prepare to call the modular inverse function to get xm = 1/zn + + add x0, xm + add x1, zn + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. 
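Before the inlined code itself, a functional reference model of its contract may help: the block that follows leaves xm = zn^-1 mod p_25519. The sketch below (Python, illustration only, with the hypothetical names P25519 and inv_p25519) states that contract via Fermat's little theorem; it is not the iteration the assembly performs, for which see "arm/curve25519/bignum_inv_p25519.S". The zn = 0 case is out of scope here because, as the surrounding comments note, the caller forces xn = 0 whenever zn = 0.

    # Reference model of the inlined inversion's contract:
    # on exit, xm = zn^-1 (mod 2^255 - 19).
    P25519 = 2**255 - 19

    def inv_p25519(zn):
        zn %= P25519                        # work with the reduced representative
        assert zn != 0, "zn = 0 is handled by the caller forcing xn = 0"
        return pow(zn, P25519 - 2, P25519)  # z^(p-2) = z^-1 mod p since p is prime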
+ + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519_alt_invmidloop +curve25519_x25519_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, 
x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg 
x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + 
asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, 
#0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + 
add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a @@ -1005,7 +1695,7 @@ curve25519_x25519_alt_zfliploop: // Now the result is xn * (1/zn), fully reduced modulo p. - mul_p25519(resx,xn,zm) + mul_p25519(resx,xn,xm) // Restore stack and registers diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index d64eb73ed2..ede1bd1ee2 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -967,356 +967,1046 @@ curve25519_x25519_byte_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. 
- - mov x0, #-19 - mov x1, #-1 - mov x2, #0x7fffffffffffffff - stp x0, x1, [sn] - stp x1, x2, [sn+16] - -// Prepare to call the modular inverse function to get zm = 1/zn - - mov x0, #4 - add x1, zm - add x2, zn - add x3, sn - add x4, p - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519_byte_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_byte_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519_byte_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_byte_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519_byte_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519_byte_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_byte_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz 
x11, curve25519_x25519_byte_wmontend -curve25519_x25519_byte_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_wmontloop -curve25519_x25519_byte_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519_byte_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519_byte_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_byte_zmontend -curve25519_x25519_byte_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_zmontloop -curve25519_x25519_byte_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519_byte_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519_byte_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_byte_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_byte_negskip1 -curve25519_x25519_byte_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_byte_negloop1 -curve25519_x25519_byte_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_byte_negskip2 -curve25519_x25519_byte_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, 
x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_byte_negloop2 -curve25519_x25519_byte_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519_byte_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519_byte_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519_byte_outerloop +// Prepare to call the modular inverse function to get xm = 1/zn + + add x0, xm + add x1, zn + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519_byte_invmidloop +curve25519_x25519_byte_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, 
x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, 
#80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519_byte_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr 
x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, 
x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne 
+ ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519_byte_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov 
x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a @@ -1339,7 +2029,7 @@ curve25519_x25519_byte_zfliploop: // Now the result is xn * (1/zn), fully reduced modulo p. - mul_p25519(zn,xn,zm) + mul_p25519(zn,xn,xm) ldp x10, x11, [zn] strb w10, [resx] diff --git a/arm/curve25519/curve25519_x25519_byte_alt.S b/arm/curve25519/curve25519_x25519_byte_alt.S index 7f79cfd803..03211203cf 100644 --- a/arm/curve25519/curve25519_x25519_byte_alt.S +++ b/arm/curve25519/curve25519_x25519_byte_alt.S @@ -751,356 +751,1046 @@ curve25519_x25519_byte_alt_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. - - mov x0, #-19 - mov x1, #-1 - mov x2, #0x7fffffffffffffff - stp x0, x1, [sn] - stp x1, x2, [sn+16] - -// Prepare to call the modular inverse function to get zm = 1/zn - - mov x0, #4 - add x1, zm - add x2, zn - add x3, sn - add x4, p - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519_byte_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_byte_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519_byte_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_byte_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519_byte_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, 
cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519_byte_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519_byte_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519_byte_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_byte_alt_wmontend -curve25519_x25519_byte_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_wmontloop -curve25519_x25519_byte_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519_byte_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519_byte_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519_byte_alt_zmontend -curve25519_x25519_byte_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_zmontloop -curve25519_x25519_byte_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519_byte_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519_byte_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr 
-curve25519_x25519_byte_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519_byte_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_byte_alt_negskip1 -curve25519_x25519_byte_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_byte_alt_negloop1 -curve25519_x25519_byte_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519_byte_alt_negskip2 -curve25519_x25519_byte_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519_byte_alt_negloop2 -curve25519_x25519_byte_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519_byte_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519_byte_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519_byte_alt_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519_byte_alt_outerloop +// Prepare to call the modular inverse function to get xm = 1/zn + + add x0, xm + add x1, zn + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. 
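The functional contract of the inlined routine is simply xm = zn^(-1) (mod p_25519) with a fully reduced result; the first block of the inlined code below normalizes the input into the range [0, p_25519) before the iteration count is set up. A minimal Python reference model of that contract (the function name is illustrative, and the exponentiation is only a specification device: the assembly reaches the same value through a fixed sequence of constant-time iterations rather than by powering):

    P25519 = 2**255 - 19

    def inv_p25519(z):
        # Accept a possibly not-fully-reduced 256-bit input, reduce it,
        # then return its inverse modulo p_25519 = 2^255 - 19.
        z %= P25519
        return pow(z, P25519 - 2, P25519)   # Fermat: z^(p-2) == z^(-1) (mod p)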
+ + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519_byte_alt_invmidloop +curve25519_x25519_byte_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + 
adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519_byte_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, 
xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, 
#0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519_byte_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a @@ -1123,7 +1813,7 @@ curve25519_x25519_byte_alt_zfliploop: // Now the result is xn * (1/zn), fully reduced modulo p. - mul_p25519(zn,xn,zm) + mul_p25519(zn,xn,xm) ldp x10, x11, [zn] strb w10, [resx] diff --git a/arm/curve25519/curve25519_x25519base.S b/arm/curve25519/curve25519_x25519base.S index 030fa08e24..b9c3b8e34a 100644 --- a/arm/curve25519/curve25519_x25519base.S +++ b/arm/curve25519/curve25519_x25519base.S @@ -907,360 +907,1058 @@ curve25519_x25519base_scalarloop: // // First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) -// Prepare to call the modular inverse function to get x_3 = 1/z_3 +// Prepare to call the modular inverse function to get t0 = 1/t2 // Note that this works for the weakly normalized z_3 equally well. // The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway. 
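Spelled out, the sequence being set up here is t1 = X + T and t2 = X - T (the add_twice4/sub_twice4 calls above), then t0 = 1/t2 modulo p_25519, and finally, further down in this file, mul_p25519(resx,t1,t0), the one fully reducing multiplication; it is well defined because t2 == 0 (mod p_25519) cannot arise, as the comment above notes. A small Python sketch of that data flow, with names taken from the comments and inverse-by-exponentiation again standing in for the constant-time inverse:

    P25519 = 2**255 - 19

    def affine_result(x, t):
        # Mirrors add_twice4/sub_twice4 followed by the inverse and the
        # final fully reducing multiplication.
        t1 = (x + t) % P25519
        t2 = (x - t) % P25519
        t0 = pow(t2, P25519 - 2, P25519)   # stands in for the inlined inverse
        return (t1 * t0) % P25519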
- mov x0, 4 - add x1, x_3 - add x2, z_3 - adr x3, curve25519_x25519base_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519base_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519base_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519base_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519base_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_wmontend -curve25519_x25519base_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, 
x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_wmontloop -curve25519_x25519base_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519base_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_zmontend -curve25519_x25519base_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_zmontloop -curve25519_x25519base_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519base_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_negskip1 -curve25519_x25519base_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_negloop1 -curve25519_x25519base_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_negskip2 -curve25519_x25519base_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, 
curve25519_x25519base_negloop2 -curve25519_x25519base_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519base_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519base_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519base_outerloop + add x0, t0 + add x1, t2 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519base_invmidloop +curve25519_x25519base_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, 
x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + 
adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519base_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, 
x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr 
x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel 
x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519base_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr 
+ sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is (X + T) / (X - T) // This is the only operation in the whole computation that // fully reduces modulo p_25519 since now we want the canonical // answer as output. - mul_p25519(resx,y_3,x_3) + mul_p25519(resx,t1,t0) // Restore stack and registers @@ -1279,14 +1977,6 @@ curve25519_x25519base_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -curve25519_x25519base_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/curve25519_x25519base_alt.S b/arm/curve25519/curve25519_x25519base_alt.S index 97d2e9c54f..22de69f4c3 100644 --- a/arm/curve25519/curve25519_x25519base_alt.S +++ b/arm/curve25519/curve25519_x25519base_alt.S @@ -749,360 +749,1058 @@ curve25519_x25519base_alt_scalarloop: // // First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) -// Prepare to call the modular inverse function to get x_3 = 1/z_3 +// Prepare to call the modular inverse function to get t0 = 1/t2 // Note that this works for the weakly normalized z_3 equally well. // The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway. - mov x0, 4 - add x1, x_3 - add x2, z_3 - adr x3, curve25519_x25519base_alt_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
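As in curve25519_x25519base.S above, the removed path called the generic bignum_modinv with an explicit word count (4), a modulus loaded from the *_p_25519 rodata table, and a scratch buffer, while the replacement has p_25519 = 2^255 - 19 baked in, which is what allows the constant table to be dropped as well. A rough Python model of the two shapes (argument names and order here only paraphrase the setup code above and are not authoritative signatures):

    P25519 = 2**255 - 19

    def modinv_generic(a, m):
        # Removed path: the modulus m is a runtime argument (always p_25519 here).
        return pow(a % m, -1, m)            # Python 3.8+ modular inverse

    def inv_p25519(a):
        # Replacement path: modulus fixed to p_25519, no table or word count.
        return pow(a % P25519, P25519 - 2, P25519)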
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519base_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519base_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519base_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519base_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_alt_wmontend -curve25519_x25519base_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_wmontloop -curve25519_x25519base_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, 
x15, lsl #3] - negs x10, xzr -curve25519_x25519base_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519base_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_alt_zmontend -curve25519_x25519base_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_zmontloop -curve25519_x25519base_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519base_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_alt_negskip1 -curve25519_x25519base_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_alt_negloop1 -curve25519_x25519base_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_alt_negskip2 -curve25519_x25519base_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_alt_negloop2 -curve25519_x25519base_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519base_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr 
x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519base_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_alt_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519base_alt_outerloop + add x0, t0 + add x1, t2 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519base_alt_invmidloop +curve25519_x25519base_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, 
[sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr 
x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519base_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + 
add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, 
#0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 
+ tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519base_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is (X + T) / (X - T) // This is the only operation in the whole 
computation that // fully reduces modulo p_25519 since now we want the canonical // answer as output. - mul_p25519(resx,y_3,x_3) + mul_p25519(resx,t1,t0) // Restore stack and registers @@ -1121,14 +1819,6 @@ curve25519_x25519base_alt_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -curve25519_x25519base_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/curve25519_x25519base_byte.S b/arm/curve25519/curve25519_x25519base_byte.S index b6d95f58c9..aecc693c66 100644 --- a/arm/curve25519/curve25519_x25519base_byte.S +++ b/arm/curve25519/curve25519_x25519base_byte.S @@ -966,360 +966,1058 @@ curve25519_x25519base_byte_scalarloop: // // First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) -// Prepare to call the modular inverse function to get x_3 = 1/z_3 +// Prepare to call the modular inverse function to get t0 = 1/t2 // Note that this works for the weakly normalized z_3 equally well. // The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway. - mov x0, 4 - add x1, x_3 - add x2, z_3 - adr x3, curve25519_x25519base_byte_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
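
As in the previous file, the "fully reduces modulo p_25519" remark means this final multiplication is the one place where the result is brought to its unique canonical representative in [0, p_25519); elsewhere a weakly reduced value suffices. A rough big-integer sketch of that last reduction step (Python, illustrative only, not a transcription of the mul_p25519 macro):

    P25519 = (1 << 255) - 19

    def canonical_mod_p25519(x):
        # Reduce any x < 2^256 to its canonical residue in [0, p_25519).
        # Since 2^255 == 19 (mod p_25519), fold the top bit back in as +19 ...
        x = (x & ((1 << 255) - 1)) + 19 * (x >> 255)
        # ... leaving x <= 2^255 + 18 < 2*p_25519, so at most one
        # conditional subtraction finishes the job.
        if x >= P25519:
            x -= P25519
        return x

    assert canonical_mod_p25519(P25519 + 5) == 5
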
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519base_byte_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_byte_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519base_byte_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_byte_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519base_byte_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519base_byte_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_byte_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_byte_wmontend -curve25519_x25519base_byte_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_wmontloop -curve25519_x25519base_byte_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - 
str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_byte_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519base_byte_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_byte_zmontend -curve25519_x25519base_byte_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_zmontloop -curve25519_x25519base_byte_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_byte_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519base_byte_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_byte_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_byte_negskip1 -curve25519_x25519base_byte_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_byte_negloop1 -curve25519_x25519base_byte_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_byte_negskip2 -curve25519_x25519base_byte_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_byte_negloop2 -curve25519_x25519base_byte_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 
-curve25519_x25519base_byte_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519base_byte_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519base_byte_outerloop + add x0, t0 + add x1, t2 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519base_byte_invmidloop +curve25519_x25519base_byte_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + 
mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 
+ smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519base_byte_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + 
ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add 
x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg 
x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519base_byte_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // 
The final result is (X + T) / (X - T) // This is the only operation in the whole computation that // fully reduces modulo p_25519 since now we want the canonical // answer as output. - mul_p25519(x_1,y_3,x_3) + mul_p25519(x_1,t1,t0) ldp x10, x11, [x_1] strb w10, [resx] @@ -1405,14 +2103,6 @@ curve25519_x25519base_byte_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -curve25519_x25519base_byte_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/curve25519_x25519base_byte_alt.S b/arm/curve25519/curve25519_x25519base_byte_alt.S index 6e61199732..9c9dca518c 100644 --- a/arm/curve25519/curve25519_x25519base_byte_alt.S +++ b/arm/curve25519/curve25519_x25519base_byte_alt.S @@ -805,363 +805,1059 @@ curve25519_x25519base_byte_alt_scalarloop: // the Montgomery point at infinity, and Edwards (0,-1) which maps to // Montgomery (0,0) [this is the 2-torsion point] are both by definition // mapped to 0 by the X coordinate mapping used to define curve25519. -// -// First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) -// Prepare to call the modular inverse function to get x_3 = 1/z_3 +// Prepare to call the modular inverse function to get t0 = 1/t2 // Note that this works for the weakly normalized z_3 equally well. // The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway. - mov x0, 4 - add x1, x_3 - add x2, z_3 - adr x3, curve25519_x25519base_byte_alt_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
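As the comment above says, the affine result is recovered as (X + T) / (X - T), and the concluding mul_p25519 is the only step that produces the canonical, fully reduced representative modulo p_25519 = 2^255 - 19; every earlier value only needs to be a 256-bit representative of the right residue class ("weakly normalized"). The C sketch below illustrates what that final canonical reduction amounts to for a 4-limb little-endian value. It is illustrative only and not part of s2n-bignum: the function name is made up, it assumes the GCC/Clang unsigned __int128 extension, and it makes no attempt to match the performance of the assembly.

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Canonically reduce a 256-bit little-endian 4-limb value modulo
       p = 2^255 - 19.  Sketch only: any 256-bit representative in, the
       unique value in [0, p) out. */
    static void p25519_canonical(uint64_t z[4], const uint64_t x[4])
    {
        uint64_t t[4];

        /* Fold bit 255: x = 2^255*b + r  ==>  r + 19*b (mod p), which is
           below 2^255 + 19. */
        uint64_t b = x[3] >> 63;
        u128 acc = (u128)x[0] + 19u * b;
        t[0] = (uint64_t)acc; acc >>= 64;
        acc += x[1]; t[1] = (uint64_t)acc; acc >>= 64;
        acc += x[2]; t[2] = (uint64_t)acc; acc >>= 64;
        acc += (x[3] & 0x7fffffffffffffffULL); t[3] = (uint64_t)acc;

        /* t >= p exactly when t + 19 has bit 255 set. */
        u128 c = (u128)t[0] + 19u;
        c >>= 64; c += t[1]; c >>= 64; c += t[2]; c >>= 64; c += t[3];
        uint64_t ge = (uint64_t)(c >> 63);

        /* Subtracting p (when needed) is the same as adding 19 and
           clearing bit 255; when ge = 0 this is a no-op. */
        acc = (u128)t[0] + 19u * ge;
        z[0] = (uint64_t)acc; acc >>= 64;
        acc += t[1]; z[1] = (uint64_t)acc; acc >>= 64;
        acc += t[2]; z[2] = (uint64_t)acc; acc >>= 64;
        acc += t[3]; z[3] = (uint64_t)acc & 0x7fffffffffffffffULL;
    }

The same folding idea is visible in the assembly: 19 is loaded as an immediate, the top bit is multiplied in (the lsr/madd pair on AArch64), and the result is finally masked with and x5, x5, #0x7fffffffffffffff before being stored.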
- - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -curve25519_x25519base_byte_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_byte_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -curve25519_x25519base_byte_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_byte_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -curve25519_x25519base_byte_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, curve25519_x25519base_byte_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc curve25519_x25519base_byte_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_byte_alt_wmontend -curve25519_x25519base_byte_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_wmontloop -curve25519_x25519base_byte_alt_wmontend: - adcs x16, 
x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_byte_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -curve25519_x25519base_byte_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, curve25519_x25519base_byte_alt_zmontend -curve25519_x25519base_byte_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_zmontloop -curve25519_x25519base_byte_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -curve25519_x25519base_byte_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -curve25519_x25519base_byte_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -curve25519_x25519base_byte_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc curve25519_x25519base_byte_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_byte_alt_negskip1 -curve25519_x25519base_byte_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_byte_alt_negloop1 -curve25519_x25519base_byte_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, curve25519_x25519base_byte_alt_negskip2 -curve25519_x25519base_byte_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, curve25519_x25519base_byte_alt_negloop2 -curve25519_x25519base_byte_alt_negskip2: - extr x15, x14, 
x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -curve25519_x25519base_byte_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -curve25519_x25519base_byte_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, curve25519_x25519base_byte_alt_zfliploop - subs x2, x2, #0x3a - b.hi curve25519_x25519base_byte_alt_outerloop + add x0, t0 + add x1, t2 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b curve25519_x25519base_byte_alt_invmidloop +curve25519_x25519base_byte_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh 
x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + 
extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +curve25519_x25519base_byte_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge 
+ cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, 
#1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, 
x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne curve25519_x25519base_byte_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr 
+ sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is (X + T) / (X - T) // This is the only operation in the whole computation that // fully reduces modulo p_25519 since now we want the canonical // answer as output. - mul_p25519(x_1,y_3,x_3) + mul_p25519(x_1,t1,t0) ldp x10, x11, [x_1] strb w10, [resx] @@ -1229,6 +1925,7 @@ curve25519_x25519base_byte_alt_zfliploop: lsr x13, x13, #8 strb w13, [resx+31] + // Restore stack and registers add sp, sp, #NSPACE @@ -1246,14 +1943,6 @@ curve25519_x25519base_byte_alt_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -curve25519_x25519base_byte_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/x86_att/curve25519/curve25519_x25519.S b/x86_att/curve25519/curve25519_x25519.S index 2a97ee9407..4a8351eaa3 100644 --- a/x86_att/curve25519/curve25519_x25519.S +++ b/x86_att/curve25519/curve25519_x25519.S @@ -65,12 +65,12 @@ #define sn (4*NUMSIZE)(%rsp) -#define zn (5*NUMSIZE)(%rsp) #define dn (5*NUMSIZE)(%rsp) #define e (5*NUMSIZE)(%rsp) #define dmsn (6*NUMSIZE)(%rsp) #define p (6*NUMSIZE)(%rsp) +#define zn (7*NUMSIZE)(%rsp) #define xm (8*NUMSIZE)(%rsp) #define dnsm (8*NUMSIZE)(%rsp) @@ -790,430 +790,1372 @@ curve25519_x25519_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. - - movq $-19, %rax - movq $-1, %rcx - movq $0x7fffffffffffffff, %rdx - movq %rax, 128(%rsp) - movq %rcx, 136(%rsp) - movq %rcx, 144(%rsp) - movq %rdx, 152(%rsp) - -// Prepare to call the modular inverse function to get zm = 1/zn - - movq $4, %rdi - leaq 96(%rsp), %rsi - leaq 160(%rsp), %rdx - leaq 128(%rsp), %rcx - leaq 192(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites pointx, scalar and dm, which are no longer needed. 
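The deleted comments above describe the old approach for this file too: materialise the p_25519 constant on the stack and call the generic bignum_modinv. The replacement, whose inline copy of bignum_inv_p25519 follows, looks from its structure like a divstep-style (safegcd / Bernstein-Yang) inversion: an outer counter is initialised to 10 (mov x21, #0xa on AArch64, movq $0xa, 0x90(%rsp) here) and the word-level updates shift by 59 bits (extr ..., #59 and shrdq $0x3b), so each outer pass appears to batch 59 elementary "divsteps" into a small transition matrix that is then applied to the full 4-limb values and their cofactors. The toy C function below shows the single-step recurrence this is built on. It is a sketch for intuition only: the name is mine, it acts on single signed words rather than 256-bit bignums, and it uses an ordinary branch where the assembly uses branch-free csel/ccmp (AArch64) and cmov/test (x86) sequences to stay constant-time.

    #include <stdint.h>

    /* One textbook "divstep" on (delta, f, g) with f odd.  Iterated enough
       times starting from delta = 1, f = p_25519, g = input, it drives g
       to 0; tracking the corresponding cofactors (omitted here) is what
       yields the modular inverse, up to the sign and fixed scaling
       corrections applied after the main loop in the assembly. */
    static void divstep(int64_t *delta, int64_t *f, int64_t *g)
    {
        if (*delta > 0 && (*g & 1)) {
            /* g odd and delta > 0: swap f and g, then halve (g - f). */
            int64_t t = *f;
            *delta = 1 - *delta;
            *f = *g;
            *g = (*g - t) / 2;      /* exact: the difference is even */
        } else {
            /* Keep f; make g even by adding f when g is odd, then halve. */
            *delta = 1 + *delta;
            *g = (*g + ((*g & 1) ? *f : 0)) / 2;
        }
    }

This is only the scalar recurrence; the point of the batched form in the assembly is that roughly 59 such decisions can be made from the low and high words alone, and the resulting 2x2 matrix applied once to the big numbers, which is why the midloop works on packed 20-bit fields and the inverse loop ends with a sign fix-up and one final multiply-and-reduce.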
- - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp +// Prepare to call the modular inverse function to get xm = 1/zn + + leaq 256(%rsp), %rdi + leaq 224(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. + + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -curve25519_x25519_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -curve25519_x25519_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -curve25519_x25519_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq 
$0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_midloop +curve25519_x25519_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne curve25519_x25519_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_congloop - shldq $0x6, %r10, 
%r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519_wmontend -curve25519_x25519_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne curve25519_x25519_wmontloop -curve25519_x25519_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq 
(%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519_zmontend -curve25519_x25519_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne curve25519_x25519_zmontloop -curve25519_x25519_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -curve25519_x25519_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -curve25519_x25519_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - 
movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -curve25519_x25519_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne curve25519_x25519_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -curve25519_x25519_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_fliploop - subq $0x3a, 0x20(%rsp) - ja curve25519_x25519_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq 
%rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq 
%r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 
+ cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, 
%rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + 
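Note on the tail that follows: once `jne curve25519_x25519_inverseloop` falls through, the code applies the last divstep matrix and reduces the signed result modulo p_25519 before storing the four limbs through the pointer saved at 0xc0(%rsp). The `movl $0x13, %ebx` / `imulq %rbx` / `shlq $0x3f` / `btr $0x3f, %r15` sequence is the usual folding of everything at or above bit 255 back in as multiples of 19. A minimal C sketch of that folding identity, 2^255 == 19 (mod 2^255 - 19); the 4-limb layout and the helper name are illustrative only, and the real code additionally forms a signed quotient estimate and ends with a conditional correction:

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Sketch of the identity behind the reduction tail: since
     * p_25519 = 2^255 - 19, we have 2^255 == 19 (mod p), so the bit at
     * position 255 can be cleared and folded back in as +19.  Limb layout
     * (4 x 64-bit, little-endian) and the helper name are illustrative. */
    static void fold_top_bit_p25519(uint64_t x[4])
    {
        uint64_t top = x[3] >> 63;              /* bit 255 of x */
        x[3] &= 0x7fffffffffffffffULL;          /* now x < 2^255 */

        u128 acc = (u128)x[0] + (u128)19 * top; /* fold 2^255 -> 19 */
        x[0] = (uint64_t)acc;
        acc = (acc >> 64) + x[1]; x[1] = (uint64_t)acc;
        acc = (acc >> 64) + x[2]; x[2] = (uint64_t)acc;
        acc = (acc >> 64) + x[3]; x[3] = (uint64_t)acc;
        /* Result is < 2^255 + 19; the assembly finishes with a conditional
         * correction (the cmovns/subq/sbbq run) and a final btr of bit 255. */
    }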
sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a // dependency on the behavior of modular inverse in out-of-scope cases. - movq 160(%rsp), %rax - orq 168(%rsp), %rax - orq 176(%rsp), %rax - orq 184(%rsp), %rax + movq 224(%rsp), %rax + orq 232(%rsp), %rax + orq 240(%rsp), %rax + orq 248(%rsp), %rax movq 320(%rsp), %rcx cmovzq %rax, %rcx movq %rcx, 320(%rsp) @@ -1230,7 +2172,7 @@ curve25519_x25519_fliploop: // Now the result is xn * (1/zn), fully reduced modulo p. movq res, %rbp - mul_p25519(resx,xn,zm) + mul_p25519(resx,xn,xm) // Restore stack and registers diff --git a/x86_att/curve25519/curve25519_x25519_alt.S b/x86_att/curve25519/curve25519_x25519_alt.S index 241c4505af..a855478a6e 100644 --- a/x86_att/curve25519/curve25519_x25519_alt.S +++ b/x86_att/curve25519/curve25519_x25519_alt.S @@ -65,12 +65,12 @@ #define sn (4*NUMSIZE)(%rsp) -#define zn (5*NUMSIZE)(%rsp) #define dn (5*NUMSIZE)(%rsp) #define e (5*NUMSIZE)(%rsp) #define dmsn (6*NUMSIZE)(%rsp) #define p (6*NUMSIZE)(%rsp) +#define zn (7*NUMSIZE)(%rsp) #define xm (8*NUMSIZE)(%rsp) #define dnsm (8*NUMSIZE)(%rsp) @@ -951,430 +951,1372 @@ curve25519_x25519_alt_scalarloop: mul_p25519(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// First set up the constant sn = 2^255 - 19 for the modular inverse. 
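Note on the hunk just above in curve25519_x25519.S: the limb offsets used to test zn for zero change from 160(%rsp) to 224(%rsp) because zn has been relocated from 5*NUMSIZE to 7*NUMSIZE on the stack (the same relocation is visible in the alt variant's #define hunk, consistent with NUMSIZE = 32), and the final mul_p25519 now multiplies xn by xm, where the new inline inverse leaves 1/zn. The retained orq/cmovzq context forces xn to zero whenever zn is zero, so the point at infinity yields an all-zero output. A branch-free C sketch of that idea, assuming 4 x 64-bit little-endian limbs and using a mask where the assembly uses cmovz; names are illustrative:

    #include <stdint.h>

    /* Branch-free sketch of "force xn = 0 whenever zn = 0".  The patch does
     * the equivalent with orq over zn's limbs followed by cmovzq; here a
     * mask is used instead.  Function and variable names are illustrative. */
    static void force_xn_zero_if_zn_zero(uint64_t xn[4], const uint64_t zn[4])
    {
        uint64_t any = zn[0] | zn[1] | zn[2] | zn[3];
        /* keep = all-ones when zn != 0, all-zeros when zn == 0 */
        uint64_t keep = (uint64_t)0 - ((any | (uint64_t)(0 - any)) >> 63);
        for (int i = 0; i < 4; i++)
            xn[i] &= keep;
    }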
- - movq $-19, %rax - movq $-1, %rcx - movq $0x7fffffffffffffff, %rdx - movq %rax, 128(%rsp) - movq %rcx, 136(%rsp) - movq %rcx, 144(%rsp) - movq %rdx, 152(%rsp) - -// Prepare to call the modular inverse function to get zm = 1/zn - - movq $4, %rdi - leaq 96(%rsp), %rsi - leaq 160(%rsp), %rdx - leaq 128(%rsp), %rcx - leaq 192(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites pointx, scalar and dm, which are no longer needed. - - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519_alt_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_alt_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp +// Prepare to call the modular inverse function to get xm = 1/zn + + leaq 256(%rsp), %rdi + leaq 224(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, xm and zn. 
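Note on the inline blob that follows: per the comment above it is a copy of bignum_inv_p25519, which computes xm = 1/zn modulo p_25519 = 2^255 - 19 (zn is nonzero here). The long unrolled runs of btq/cmovbq/cmoveq/sarq in this patch read as a constant-time realization of Bernstein-Yang-style divstep iterations: the shrdq $0x3b shifts and the 0xa counter at 0x90(%rsp) are consistent with batches of 59 divsteps repeated ten times, with 2x2 update matrices accumulated in between. That reading is an assumption of this note, not something stated in the patch. For orientation, a minimal branching sketch of the recurrence being iterated (portable C, my names; the assembly itself is branch-free):

    #include <stdint.h>

    /* One step of the Bernstein-Yang "divstep" recurrence, written with
     * plain branches for readability; a constant-time version replaces the
     * branches with conditional moves, as the assembly does.  Assumes f is
     * odd, the standard invariant of the inversion.  Names are mine. */
    typedef struct { int64_t delta, f, g; } divstep_state;

    static divstep_state divstep(divstep_state s)
    {
        if (s.delta > 0 && (s.g & 1)) {
            /* swap f and g, subtract, halve (g - f is even here) */
            return (divstep_state){ 1 - s.delta, s.g, (s.g - s.f) / 2 };
        } else {
            /* add f to g only when g is odd, then halve (exact division) */
            return (divstep_state){ 1 + s.delta, s.f,
                                    (s.g + (s.g & 1) * s.f) / 2 };
        }
    }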
+ + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -curve25519_x25519_alt_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -curve25519_x25519_alt_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_alt_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -curve25519_x25519_alt_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_alt_midloop +curve25519_x25519_alt_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne curve25519_x25519_alt_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 
0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519_alt_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_alt_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519_alt_wmontend -curve25519_x25519_alt_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - 
movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne curve25519_x25519_alt_wmontloop -curve25519_x25519_alt_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519_alt_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519_alt_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519_alt_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_alt_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519_alt_zmontend -curve25519_x25519_alt_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne curve25519_x25519_alt_zmontloop -curve25519_x25519_alt_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519_alt_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519_alt_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519_alt_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, 
%r9 - jb curve25519_x25519_alt_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -curve25519_x25519_alt_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_alt_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_alt_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -curve25519_x25519_alt_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519_alt_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -curve25519_x25519_alt_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne curve25519_x25519_alt_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -curve25519_x25519_alt_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519_alt_fliploop - subq $0x3a, 0x20(%rsp) - ja curve25519_x25519_alt_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq 
%rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi 
+ btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi 
+ leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + 
orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq 
(%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_alt_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, 
%rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // Since we eventually want to return 0 when the result is the point at // infinity, we force xn = 0 whenever zn = 0. This avoids building in a // dependency on the behavior of modular inverse in out-of-scope cases. - movq 160(%rsp), %rax - orq 168(%rsp), %rax - orq 176(%rsp), %rax - orq 184(%rsp), %rax + movq 224(%rsp), %rax + orq 232(%rsp), %rax + orq 240(%rsp), %rax + orq 248(%rsp), %rax movq 320(%rsp), %rcx cmovzq %rax, %rcx movq %rcx, 320(%rsp) @@ -1391,7 +2333,7 @@ curve25519_x25519_alt_fliploop: // Now the result is xn * (1/zn), fully reduced modulo p. movq res, %rbp - mul_p25519(resx,xn,zm) + mul_p25519(resx,xn,xm) // Restore stack and registers diff --git a/x86_att/curve25519/curve25519_x25519base.S b/x86_att/curve25519/curve25519_x25519base.S index 12a5cddd18..e450656861 100644 --- a/x86_att/curve25519/curve25519_x25519base.S +++ b/x86_att/curve25519/curve25519_x25519base.S @@ -874,416 +874,1368 @@ curve25519_x25519base_scalarloop: // // First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) -// Prepare to call the modular inverse function to get x_3 = 1/z_3 +// Prepare to call the modular inverse function to get t0 = 1/t2 // Note that this works for the weakly normalized z_3 equally well. // The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway. - movq $4, %rdi - leaq 128(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq curve25519_x25519base_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. - - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519base_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + leaq 256(%rsp), %rdi + leaq 320(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. 
For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -curve25519_x25519base_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -curve25519_x25519base_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -curve25519_x25519base_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519base_midloop +curve25519_x25519base_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - 
addq %rdx, %rdx - decq %r9 - jne curve25519_x25519base_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519base_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519base_wmontend -curve25519_x25519base_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, 
%rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne curve25519_x25519base_wmontloop -curve25519_x25519base_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519base_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519base_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519base_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519base_zmontend -curve25519_x25519base_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne curve25519_x25519base_zmontloop -curve25519_x25519base_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519base_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519base_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519base_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + 
adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -curve25519_x25519base_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519base_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -curve25519_x25519base_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -curve25519_x25519base_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne curve25519_x25519base_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -curve25519_x25519base_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_fliploop - subq $0x3a, 0x20(%rsp) - ja 
curve25519_x25519base_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, 
%rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, 
%r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + 
imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + 
subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519base_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + 
movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is (X + T) / (X - T) // This is the only operation in the whole computation that @@ -1291,7 +2243,7 @@ curve25519_x25519base_fliploop: // answer as output. movq res, %rbp - mul_p25519(resx,y_3,x_3) + mul_p25519(resx,t1,t0) // Restore stack and registers @@ -1313,14 +2265,6 @@ curve25519_x25519base_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -curve25519_x25519base_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/x86_att/curve25519/curve25519_x25519base_alt.S b/x86_att/curve25519/curve25519_x25519base_alt.S index 8a89b1f597..b1275e2084 100644 --- a/x86_att/curve25519/curve25519_x25519base_alt.S +++ b/x86_att/curve25519/curve25519_x25519base_alt.S @@ -950,414 +950,1368 @@ curve25519_x25519base_alt_scalarloop: // // First the addition and subtraction: - add_twice4(y_3,x_3,w_3) - sub_twice4(z_3,x_3,w_3) - -// Prepare to call the modular inverse function to get x_3 = 1/z_3 - - movq $4, %rdi - leaq 128(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq curve25519_x25519base_alt_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. 
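Functionally, the step being rewritten in these hunks is small: recover the affine result as (X + T) / (X - T) mod p_25519, that is, one modular inverse of X - T followed by one modular multiplication, which is what the add_twice4/sub_twice4/mul_p25519 calls around the inlined inverse implement. A rough Python model of that arithmetic, purely as an illustration and not of the constant-time assembly above, with X and T standing for the two ladder outputs named in the "(X + T) / (X - T)" comment:

    P_25519 = 2**255 - 19

    def final_affine_result(X: int, T: int) -> int:
        num = (X + T) % P_25519      # t1 = X + T   (add_twice4)
        den = (X - T) % P_25519      # t2 = X - T   (sub_twice4)
        # The assembly obtains 1/den with a constant-time divstep-based
        # inverse; here a simple Fermat inversion stands in for it.  den is
        # assumed nonzero mod p, matching the comment that the non-coprime
        # case cannot arise.
        t0 = pow(den, P_25519 - 2, P_25519)
        return (num * t0) % P_25519  # resx = t1 * t0 (mul_p25519)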
- - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519base_alt_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + add_twice4(t1,x_3,w_3) + sub_twice4(t2,x_3,w_3) + +// Prepare to call the modular inverse function to get t0 = 1/t2 +// Note that this works for the weakly normalized z_3 equally well. +// The non-coprime case z_3 == 0 (mod p_25519) cannot arise anyway. + + leaq 256(%rsp), %rdi + leaq 320(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, t0, t1, t2. + + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -curve25519_x25519base_alt_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -curve25519_x25519base_alt_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_alt_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -curve25519_x25519base_alt_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 
0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519base_alt_midloop +curve25519_x25519base_alt_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne curve25519_x25519base_alt_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -curve25519_x25519base_alt_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + 
sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519base_alt_wmontend -curve25519_x25519base_alt_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne curve25519_x25519base_alt_wmontloop -curve25519_x25519base_alt_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519base_alt_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519base_alt_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519base_alt_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + 
mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je curve25519_x25519base_alt_zmontend -curve25519_x25519base_alt_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne curve25519_x25519base_alt_zmontloop -curve25519_x25519base_alt_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -curve25519_x25519base_alt_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne curve25519_x25519base_alt_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -curve25519_x25519base_alt_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -curve25519_x25519base_alt_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519base_alt_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) 
- movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_alt_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -curve25519_x25519base_alt_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb curve25519_x25519base_alt_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -curve25519_x25519base_alt_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne curve25519_x25519base_alt_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -curve25519_x25519base_alt_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb curve25519_x25519base_alt_fliploop - subq $0x3a, 0x20(%rsp) - ja curve25519_x25519base_alt_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + 
cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + 
btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq 
%rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq 
$0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, 
%rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519base_alt_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is (X + T) / (X - T) // This is the only operation in the whole computation that @@ -1365,7 +2319,7 @@ curve25519_x25519base_alt_fliploop: // answer as output. movq res, %rbp - mul_p25519(resx,y_3,x_3) + mul_p25519(resx,t1,t0) // Restore stack and registers @@ -1387,14 +2341,6 @@ curve25519_x25519base_alt_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -curve25519_x25519base_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 2^254 * G and (2^254 + 8) * G in extended-projective coordinates // but with z = 1 assumed and hence left out, so they are (X,Y,T) only. From 8fd6e1a87a97738ecf5ff58cdbd56c1bc0c8258c Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 1 Nov 2023 18:52:23 -0700 Subject: [PATCH 06/24] Switch edwards25519 operations to divstep-based modular inverse This replaces the inlined variant of "bignum_modinv" with code from "bignum_inv_p25519" in all "edwards25519_scalarmul*" functions. Again, there are consequential changes related to the slightly different amount of temporary storage needed by bignum_inv_p25519. 
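For reference, the contract that stays fixed across this replacement is simple even though the implementations differ considerably: given a residue z that is nonzero mod p_25519 = 2^255 - 19, both the generic bignum_modinv path (as instantiated with p_25519) and the divstep-based bignum_inv_p25519 must return z^-1 modulo p_25519. A minimal Python check of that contract, using Fermat's little theorem as a stand-in rather than the constant-time divstep algorithm itself:

    P_25519 = 2**255 - 19

    def inv_p25519(z: int) -> int:
        # z^(p-2) = z^-1 (mod p) for prime p and z not divisible by p.
        # Reference model only, not the divstep routine in this patch.
        return pow(z % P_25519, P_25519 - 2, P_25519)

    z = 0x1234567890abcdef
    assert (z * inv_p25519(z)) % P_25519 == 1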
s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/7e7b18e8fc83fa25131cfac1c94bd83fbf6cd243 --- arm/curve25519/edwards25519_scalarmulbase.S | 1387 +++++++++--- .../edwards25519_scalarmulbase_alt.S | 1387 +++++++++--- arm/curve25519/edwards25519_scalarmuldouble.S | 1401 +++++++++--- .../edwards25519_scalarmuldouble_alt.S | 1401 +++++++++--- .../curve25519/edwards25519_scalarmulbase.S | 1877 +++++++++++---- .../edwards25519_scalarmulbase_alt.S | 1877 +++++++++++---- .../curve25519/edwards25519_scalarmuldouble.S | 2005 ++++++++++++----- .../edwards25519_scalarmuldouble_alt.S | 2005 ++++++++++++----- 8 files changed, 9956 insertions(+), 3384 deletions(-) diff --git a/arm/curve25519/edwards25519_scalarmulbase.S b/arm/curve25519/edwards25519_scalarmulbase.S index 6ca092489f..8c9d0f9193 100644 --- a/arm/curve25519/edwards25519_scalarmulbase.S +++ b/arm/curve25519/edwards25519_scalarmulbase.S @@ -956,346 +956,1045 @@ edwards25519_scalarmulbase_scalarloop: // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - mov x0, 4 - add x1, w_3 - add x2, z_3 - adr x3, edwards25519_scalarmulbase_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmulbase_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmulbase_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmulbase_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, 
x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmulbase_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_wmontend -edwards25519_scalarmulbase_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wmontloop -edwards25519_scalarmulbase_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -edwards25519_scalarmulbase_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_zmontend -edwards25519_scalarmulbase_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zmontloop -edwards25519_scalarmulbase_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmulbase_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, 
[x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_negskip1 -edwards25519_scalarmulbase_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_negloop1 -edwards25519_scalarmulbase_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_negskip2 -edwards25519_scalarmulbase_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_negloop2 -edwards25519_scalarmulbase_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmulbase_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmulbase_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_zfliploop - subs x2, x2, #0x3a - b.hi edwards25519_scalarmulbase_outerloop + add x0, w_3 + add x1, z_3 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, w_3, x_3 +// and y_3. 
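+// In outline, the setup just below keeps the output pointer in x20, stores
+// the modulus p_25519 at [sp,#0..31], the input reduced modulo p_25519 at
+// [sp,#32..63], zero at [sp,#64..95] and a precomputed 256-bit constant at
+// [sp,#96..127], and uses x21 = 10 as the count of outer divstep blocks.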
+ + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmulbase_invmidloop +edwards25519_scalarmulbase_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + 
adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmulbase_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, 
xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, 
#0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmulbase_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1322,14 +2021,6 @@ edwards25519_scalarmulbase_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmulbase_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/edwards25519_scalarmulbase_alt.S b/arm/curve25519/edwards25519_scalarmulbase_alt.S index e8dd9114a4..03e5598f2c 100644 --- a/arm/curve25519/edwards25519_scalarmulbase_alt.S +++ b/arm/curve25519/edwards25519_scalarmulbase_alt.S @@ -798,346 +798,1045 @@ edwards25519_scalarmulbase_alt_scalarloop: // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. 
- mov x0, 4 - add x1, w_3 - add x2, z_3 - adr x3, edwards25519_scalarmulbase_alt_p_25519 - add x4, tmpspace - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". - - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmulbase_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmulbase_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmulbase_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmulbase_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmulbase_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_alt_wmontend 
-edwards25519_scalarmulbase_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wmontloop -edwards25519_scalarmulbase_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -edwards25519_scalarmulbase_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmulbase_alt_zmontend -edwards25519_scalarmulbase_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zmontloop -edwards25519_scalarmulbase_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmulbase_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmulbase_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmulbase_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmulbase_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_alt_negskip1 -edwards25519_scalarmulbase_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_alt_negloop1 -edwards25519_scalarmulbase_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub 
x6, x5, #0x1 - cbz x6, edwards25519_scalarmulbase_alt_negskip2 -edwards25519_scalarmulbase_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmulbase_alt_negloop2 -edwards25519_scalarmulbase_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmulbase_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmulbase_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmulbase_alt_zfliploop - subs x2, x2, #0x3a - b.hi edwards25519_scalarmulbase_alt_outerloop + add x0, w_3 + add x1, z_3 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, w_3, x_3 +// and y_3. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmulbase_alt_invmidloop +edwards25519_scalarmulbase_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 
+ adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, 
x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmulbase_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, 
#1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, 
ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, 
x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmulbase_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 
+ sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1164,14 +1863,6 @@ edwards25519_scalarmulbase_alt_zfliploop: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmulbase_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. diff --git a/arm/curve25519/edwards25519_scalarmuldouble.S b/arm/curve25519/edwards25519_scalarmuldouble.S index cd760f1212..00ea37eaaf 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble.S +++ b/arm/curve25519/edwards25519_scalarmuldouble.S @@ -57,14 +57,14 @@ #define scalar sp, #(0*NUMSIZE) #define bscalar sp, #(1*NUMSIZE) -#define acc sp, #(2*NUMSIZE) -#define acc_x sp, #(2*NUMSIZE) -#define acc_y sp, #(3*NUMSIZE) -#define acc_z sp, #(4*NUMSIZE) -#define acc_w sp, #(5*NUMSIZE) +#define btabent sp, #(2*NUMSIZE) +#define acc sp, #(5*NUMSIZE) +#define acc_x sp, #(5*NUMSIZE) +#define acc_y sp, #(6*NUMSIZE) +#define acc_z sp, #(7*NUMSIZE) +#define acc_w sp, #(8*NUMSIZE) -#define tabent sp, #(6*NUMSIZE) -#define btabent sp, #(10*NUMSIZE) +#define tabent sp, #(9*NUMSIZE) #define tab sp, #(13*NUMSIZE) @@ -1872,347 +1872,1044 @@ edwards25519_scalarmuldouble_loop: // Modular inverse setup - mov x0, #4 - add x1, tabent - add x2, acc+64 - adr x3, edwards25519_scalarmuldouble_p25519 - add x4, btabent - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
- -edwards25519_scalarmuldouble_modinv: - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmuldouble_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmuldouble_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmuldouble_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmuldouble_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmuldouble_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmuldouble_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmuldouble_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmuldouble_wmontend -edwards25519_scalarmuldouble_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_wmontloop -edwards25519_scalarmuldouble_wmontend: - 
adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmuldouble_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -edwards25519_scalarmuldouble_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmuldouble_zmontend -edwards25519_scalarmuldouble_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_zmontloop -edwards25519_scalarmuldouble_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmuldouble_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmuldouble_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmuldouble_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmuldouble_negskip1 -edwards25519_scalarmuldouble_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmuldouble_negloop1 -edwards25519_scalarmuldouble_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmuldouble_negskip2 -edwards25519_scalarmuldouble_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmuldouble_negloop2 -edwards25519_scalarmuldouble_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - 
adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmuldouble_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmuldouble_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_zfliploop - subs x2, x2, #0x3a - b.hi edwards25519_scalarmuldouble_outerloop + add x0, tabent + add x1, acc+64 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, acc, tabent. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmuldouble_invmidloop +edwards25519_scalarmuldouble_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc 
x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, 
[sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmuldouble_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, 
ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, 
xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmuldouble_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + 
and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Store result. Note that these are the only reductions mod 2^255-19 @@ -2330,14 +3027,6 @@ edwards25519_scalarmuldouble_pepadd: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmuldouble_p25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. diff --git a/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/arm/curve25519/edwards25519_scalarmuldouble_alt.S index c8fe77c31f..ad05eae1fb 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/arm/curve25519/edwards25519_scalarmuldouble_alt.S @@ -57,14 +57,14 @@ #define scalar sp, #(0*NUMSIZE) #define bscalar sp, #(1*NUMSIZE) -#define acc sp, #(2*NUMSIZE) -#define acc_x sp, #(2*NUMSIZE) -#define acc_y sp, #(3*NUMSIZE) -#define acc_z sp, #(4*NUMSIZE) -#define acc_w sp, #(5*NUMSIZE) +#define btabent sp, #(2*NUMSIZE) +#define acc sp, #(5*NUMSIZE) +#define acc_x sp, #(5*NUMSIZE) +#define acc_y sp, #(6*NUMSIZE) +#define acc_z sp, #(7*NUMSIZE) +#define acc_w sp, #(8*NUMSIZE) -#define tabent sp, #(6*NUMSIZE) -#define btabent sp, #(10*NUMSIZE) +#define tabent sp, #(9*NUMSIZE) #define tab sp, #(13*NUMSIZE) @@ -1656,347 +1656,1044 @@ edwards25519_scalarmuldouble_alt_loop: // Modular inverse setup - mov x0, #4 - add x1, tabent - add x2, acc+64 - adr x3, edwards25519_scalarmuldouble_alt_p25519 - add x4, btabent - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "arm/generic/bignum_modinv.S". 
- -edwards25519_scalarmuldouble_alt_modinv: - lsl x10, x0, #3 - add x21, x4, x10 - add x22, x21, x10 - mov x10, xzr -edwards25519_scalarmuldouble_alt_copyloop: - ldr x11, [x2, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - str x11, [x21, x10, lsl #3] - str x12, [x22, x10, lsl #3] - str x12, [x4, x10, lsl #3] - str xzr, [x1, x10, lsl #3] - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmuldouble_alt_copyloop - ldr x11, [x4] - sub x12, x11, #0x1 - str x12, [x4] - lsl x20, x11, #2 - sub x20, x11, x20 - eor x20, x20, #0x2 - mov x12, #0x1 - madd x12, x11, x20, x12 - mul x11, x12, x12 - madd x20, x12, x20, x20 - mul x12, x11, x11 - madd x20, x11, x20, x20 - mul x11, x12, x12 - madd x20, x12, x20, x20 - madd x20, x11, x20, x20 - lsl x2, x0, #7 -edwards25519_scalarmuldouble_alt_outerloop: - add x10, x2, #0x3f - lsr x5, x10, #6 - cmp x5, x0 - csel x5, x0, x5, cs - mov x13, xzr - mov x15, xzr - mov x14, xzr - mov x16, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_alt_toploop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - orr x17, x11, x12 - cmp x17, xzr - and x17, x19, x13 - csel x15, x17, x15, ne - and x17, x19, x14 - csel x16, x17, x16, ne - csel x13, x11, x13, ne - csel x14, x12, x14, ne - csetm x19, ne - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmuldouble_alt_toploop - orr x11, x13, x14 - clz x12, x11 - negs x17, x12 - lsl x13, x13, x12 - csel x15, x15, xzr, ne - lsl x14, x14, x12 - csel x16, x16, xzr, ne - lsr x15, x15, x17 - lsr x16, x16, x17 - orr x13, x13, x15 - orr x14, x14, x16 - ldr x15, [x21] - ldr x16, [x22] - mov x6, #0x1 - mov x7, xzr - mov x8, xzr - mov x9, #0x1 - mov x10, #0x3a - tst x15, #0x1 -edwards25519_scalarmuldouble_alt_innerloop: - csel x11, x14, xzr, ne - csel x12, x16, xzr, ne - csel x17, x8, xzr, ne - csel x19, x9, xzr, ne - ccmp x13, x14, #0x2, ne - sub x11, x13, x11 - sub x12, x15, x12 - csel x14, x14, x13, cs - cneg x11, x11, cc - csel x16, x16, x15, cs - cneg x15, x12, cc - csel x8, x8, x6, cs - csel x9, x9, x7, cs - tst x12, #0x2 - add x6, x6, x17 - add x7, x7, x19 - lsr x13, x11, #1 - lsr x15, x15, #1 - add x8, x8, x8 - add x9, x9, x9 - sub x10, x10, #0x1 - cbnz x10, edwards25519_scalarmuldouble_alt_innerloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_alt_congloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - adds x15, x15, x16 - extr x17, x15, x17, #58 - str x17, [x4, x10, lsl #3] - mov x17, x15 - umulh x15, x7, x12 - adc x13, x13, x15 - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - adds x15, x15, x16 - extr x19, x15, x19, #58 - str x19, [x1, x10, lsl #3] - mov x19, x15 - umulh x15, x9, x12 - adc x14, x14, x15 - add x10, x10, #0x1 - cmp x10, x0 - b.cc edwards25519_scalarmuldouble_alt_congloop - extr x13, x13, x17, #58 - extr x14, x14, x19, #58 - ldr x11, [x4] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmuldouble_alt_wmontend -edwards25519_scalarmuldouble_alt_wmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x4, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, 
edwards25519_scalarmuldouble_alt_wmontloop -edwards25519_scalarmuldouble_alt_wmontend: - adcs x16, x16, x13 - adc x13, xzr, xzr - sub x15, x10, #0x1 - str x16, [x4, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmuldouble_alt_wcmploop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_wcmploop - sbcs xzr, x13, xzr - csetm x13, cs - negs x10, xzr -edwards25519_scalarmuldouble_alt_wcorrloop: - ldr x11, [x4, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x13 - sbcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_wcorrloop - ldr x11, [x1] - mul x17, x11, x20 - ldr x12, [x3] - mul x15, x17, x12 - umulh x16, x17, x12 - adds x11, x11, x15 - mov x10, #0x1 - sub x11, x0, #0x1 - cbz x11, edwards25519_scalarmuldouble_alt_zmontend -edwards25519_scalarmuldouble_alt_zmontloop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - mul x15, x17, x11 - adcs x12, x12, x16 - umulh x16, x17, x11 - adc x16, x16, xzr - adds x12, x12, x15 - sub x15, x10, #0x1 - str x12, [x1, x15, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_zmontloop -edwards25519_scalarmuldouble_alt_zmontend: - adcs x16, x16, x14 - adc x14, xzr, xzr - sub x15, x10, #0x1 - str x16, [x1, x15, lsl #3] - negs x10, xzr -edwards25519_scalarmuldouble_alt_zcmploop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - sbcs xzr, x11, x12 - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_zcmploop - sbcs xzr, x14, xzr - csetm x14, cs - negs x10, xzr -edwards25519_scalarmuldouble_alt_zcorrloop: - ldr x11, [x1, x10, lsl #3] - ldr x12, [x3, x10, lsl #3] - and x12, x12, x14 - sbcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_zcorrloop - mov x13, xzr - mov x14, xzr - mov x17, xzr - mov x19, xzr - mov x10, xzr -edwards25519_scalarmuldouble_alt_crossloop: - ldr x11, [x21, x10, lsl #3] - ldr x12, [x22, x10, lsl #3] - mul x15, x6, x11 - mul x16, x7, x12 - adds x15, x15, x13 - umulh x13, x6, x11 - adc x13, x13, xzr - subs x15, x15, x16 - str x15, [x21, x10, lsl #3] - umulh x15, x7, x12 - sub x17, x15, x17 - sbcs x13, x13, x17 - csetm x17, cc - mul x15, x8, x11 - mul x16, x9, x12 - adds x15, x15, x14 - umulh x14, x8, x11 - adc x14, x14, xzr - subs x15, x15, x16 - str x15, [x22, x10, lsl #3] - umulh x15, x9, x12 - sub x19, x15, x19 - sbcs x14, x14, x19 - csetm x19, cc - add x10, x10, #0x1 - cmp x10, x5 - b.cc edwards25519_scalarmuldouble_alt_crossloop - cmn x17, x17 - ldr x15, [x21] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmuldouble_alt_negskip1 -edwards25519_scalarmuldouble_alt_negloop1: - add x11, x10, #0x8 - ldr x12, [x21, x11] - extr x15, x12, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - mov x15, x12 - add x10, x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmuldouble_alt_negloop1 -edwards25519_scalarmuldouble_alt_negskip1: - extr x15, x13, x15, #58 - eor x15, x15, x17 - adcs x15, x15, xzr - str x15, [x21, x10] - cmn x19, x19 - ldr x15, [x22] - mov x10, xzr - sub x6, x5, #0x1 - cbz x6, edwards25519_scalarmuldouble_alt_negskip2 -edwards25519_scalarmuldouble_alt_negloop2: - add x11, x10, #0x8 - ldr x12, [x22, x11] - extr x15, x12, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x15, x12 - add x10, 
x10, #0x8 - sub x6, x6, #0x1 - cbnz x6, edwards25519_scalarmuldouble_alt_negloop2 -edwards25519_scalarmuldouble_alt_negskip2: - extr x15, x14, x15, #58 - eor x15, x15, x19 - adcs x15, x15, xzr - str x15, [x22, x10] - mov x10, xzr - cmn x17, x17 -edwards25519_scalarmuldouble_alt_wfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x4, x10, lsl #3] - and x11, x11, x17 - eor x12, x12, x17 - adcs x11, x11, x12 - str x11, [x4, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_wfliploop - mvn x19, x19 - mov x10, xzr - cmn x19, x19 -edwards25519_scalarmuldouble_alt_zfliploop: - ldr x11, [x3, x10, lsl #3] - ldr x12, [x1, x10, lsl #3] - and x11, x11, x19 - eor x12, x12, x19 - adcs x11, x11, x12 - str x11, [x1, x10, lsl #3] - add x10, x10, #0x1 - sub x11, x10, x0 - cbnz x11, edwards25519_scalarmuldouble_alt_zfliploop - subs x2, x2, #0x3a - b.hi edwards25519_scalarmuldouble_alt_outerloop + add x0, tabent + add x1, acc+64 + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 128 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, acc, tabent. + + mov x20, x0 + mov x10, #0xffffffffffffffed + mov x11, #0xffffffffffffffff + stp x10, x11, [sp] + mov x12, #0x7fffffffffffffff + stp x11, x12, [sp, #16] + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #0x13 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [sp, #32] + stp x4, x5, [sp, #48] + stp xzr, xzr, [sp, #64] + stp xzr, xzr, [sp, #80] + mov x10, #0x2099 + movk x10, #0x7502, lsl #16 + movk x10, #0x9e23, lsl #32 + movk x10, #0xa0f9, lsl #48 + mov x11, #0x2595 + movk x11, #0x1d13, lsl #16 + movk x11, #0x8f3f, lsl #32 + movk x11, #0xa8c6, lsl #48 + mov x12, #0x5242 + movk x12, #0x5ac, lsl #16 + movk x12, #0x8938, lsl #32 + movk x12, #0x6c6c, lsl #48 + mov x13, #0x615 + movk x13, #0x4177, lsl #16 + movk x13, #0x8b2, lsl #32 + movk x13, #0x2765, lsl #48 + stp x10, x11, [sp, #96] + stp x12, x13, [sp, #112] + mov x21, #0xa + mov x22, #0x1 + b edwards25519_scalarmuldouble_alt_invmidloop +edwards25519_scalarmuldouble_alt_invloop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #32] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #40] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr 
x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #32] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #40] + ldr x7, [sp, #24] + eor x1, x7, x14 + asr x3, x1, #63 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + asr x0, x1, #63 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + eor x1, x7, x16 + asr x5, x1, #63 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + asr x0, x1, #63 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #48] + extr x2, x5, x2, #59 + str x2, [sp, #56] + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #96] + adc x3, x3, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #104] + adc x4, x4, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #112] + adc x2, x2, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + add x6, x6, x3, asr #63 + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [sp, #64] + stp x6, x5, [sp, #80] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, 
x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x6, x5, x2, #63 + ldp x0, x1, [sp, #96] + add x6, x6, x5, asr #63 + mov x5, #0x13 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [sp, #112] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [sp, #96] + stp x3, x2, [sp, #112] +edwards25519_scalarmuldouble_alt_invmidloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #32] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + 
cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 
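+// (continuation of the same constant-time iteration batch: each repeated
+// csel/ccmp/cneg/add/tst/asr group below follows the pattern above; see
+// "arm/curve25519/bignum_inv_p25519.S" for the explained original)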
+ csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, 
ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne edwards25519_scalarmuldouble_alt_invloop + ldr x0, [sp] + ldr x1, [sp, #32] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #64] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #96] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #64] + adc x2, x2, x1 + ldr x7, [sp, #72] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #104] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #72] + adc x6, x6, x1 + ldr x7, [sp, #80] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #112] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #80] + adc x5, x5, x1 + ldr x7, [sp, #88] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #120] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x3, x5, #63 + ldp x0, x1, [sp, #64] + tst x3, x3 + cinc x6, x6, pl + mov x3, #0x13 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, 
x3 + ldr x2, [sp, #80] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + mov x4, x20 + stp x0, x1, [x4] + stp x2, x5, [x4, #16] // Store result. Note that these are the only reductions mod 2^255-19 @@ -2114,14 +2811,6 @@ edwards25519_scalarmuldouble_alt_pepadd: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmuldouble_alt_p25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. diff --git a/x86_att/curve25519/edwards25519_scalarmulbase.S b/x86_att/curve25519/edwards25519_scalarmulbase.S index a024c9daa4..c44e31724c 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase.S @@ -38,23 +38,22 @@ #define xpy_2 (2*NUMSIZE)(%rsp) #define kxy_2 (3*NUMSIZE)(%rsp) -#define acc (4*NUMSIZE)(%rsp) -#define x_1 (4*NUMSIZE)(%rsp) -#define y_1 (5*NUMSIZE)(%rsp) -#define z_1 (6*NUMSIZE)(%rsp) -#define w_1 (7*NUMSIZE)(%rsp) -#define x_3 (4*NUMSIZE)(%rsp) -#define y_3 (5*NUMSIZE)(%rsp) -#define z_3 (6*NUMSIZE)(%rsp) -#define w_3 (7*NUMSIZE)(%rsp) - -#define tmpspace (8*NUMSIZE)(%rsp) -#define t0 (8*NUMSIZE)(%rsp) -#define t1 (9*NUMSIZE)(%rsp) -#define t2 (10*NUMSIZE)(%rsp) -#define t3 (11*NUMSIZE)(%rsp) -#define t4 (12*NUMSIZE)(%rsp) -#define t5 (13*NUMSIZE)(%rsp) +#define t0 (4*NUMSIZE)(%rsp) +#define t1 (5*NUMSIZE)(%rsp) +#define t2 (6*NUMSIZE)(%rsp) +#define t3 (7*NUMSIZE)(%rsp) +#define t4 (8*NUMSIZE)(%rsp) +#define t5 (9*NUMSIZE)(%rsp) + +#define acc (10*NUMSIZE)(%rsp) +#define x_1 (10*NUMSIZE)(%rsp) +#define y_1 (11*NUMSIZE)(%rsp) +#define z_1 (12*NUMSIZE)(%rsp) +#define w_1 (13*NUMSIZE)(%rsp) +#define x_3 (10*NUMSIZE)(%rsp) +#define y_3 (11*NUMSIZE)(%rsp) +#define z_3 (12*NUMSIZE)(%rsp) +#define w_3 (13*NUMSIZE)(%rsp) // Stable homes for the input result pointer, and other variables @@ -73,6 +72,15 @@ #define NSPACE (15*NUMSIZE+8) +// Syntactic variants to make x86_att version simpler to generate + +#define SCALAR 0 +#define TABENT (1*NUMSIZE) +#define ACC (10*NUMSIZE) +#define X3 (10*NUMSIZE) +#define Z3 (12*NUMSIZE) +#define W3 (13*NUMSIZE) + // Macro wrapping up the basic field multiplication, only trivially // different from a pure function call to bignum_mul_p25519. @@ -337,12 +345,12 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase): pushq %rsi movq %rcx, %rdi movq %rdx, %rsi - callq edwards25519_scalarmulbase_curve25519_x25519base_standard + callq edwards25519_scalarmulbase_standard popq %rsi popq %rdi ret -edwards25519_scalarmulbase_curve25519_x25519base_standard: +edwards25519_scalarmulbase_standard: #endif // Save registers, make room for temps, preserve input arguments. @@ -413,11 +421,11 @@ edwards25519_scalarmulbase_curve25519_x25519base_standard: // And before we store the scalar, test and reset bit 251 to // initialize the main loop just below. 
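// (Orientation note: bit 251 of the 256-bit scalar is bit 251 - 3*64 = 59 of
// the top limb held in %r11, which is why the code below uses "btr $59, %r11".
// The btr both clears that bit and leaves its old value in the carry flag,
// and since the intervening mov, lea and cmov instructions do not write the
// flags, that same carry later drives the branch-free cmovcq selection
// between the 0 * B and 2^251 * B table entries.)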
- movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) btr $59, %r11 - movq %r11, 24(%rsp) + movq %r11, SCALAR+24(%rsp) // The main part of the computation is in extended-projective coordinates // (X,Y,Z,T), representing an affine point on the edwards25519 curve @@ -428,75 +436,75 @@ edwards25519_scalarmulbase_curve25519_x25519base_standard: // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. - leaq edwards25519_scalarmulbase_edwards25519_0g(%rip), %r10 - leaq edwards25519_scalarmulbase_edwards25519_251g(%rip), %r11 + leaq edwards25519_scalarmulbase_0g(%rip), %r10 + leaq edwards25519_scalarmulbase_251g(%rip), %r11 movq (%r10), %rax movq (%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*16(%rsp) + movq %rax, ACC(%rsp) movq 8*1(%r10), %rax movq 8*1(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*17(%rsp) + movq %rax, ACC+8(%rsp) movq 8*2(%r10), %rax movq 8*2(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*18(%rsp) + movq %rax, ACC+16(%rsp) movq 8*3(%r10), %rax movq 8*3(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*19(%rsp) + movq %rax, ACC+24(%rsp) movq 8*4(%r10), %rax movq 8*4(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*20(%rsp) + movq %rax, ACC+32(%rsp) movq 8*5(%r10), %rax movq 8*5(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*21(%rsp) + movq %rax, ACC+40(%rsp) movq 8*6(%r10), %rax movq 8*6(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*22(%rsp) + movq %rax, ACC+48(%rsp) movq 8*7(%r10), %rax movq 8*7(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*23(%rsp) + movq %rax, ACC+56(%rsp) movl $1, %eax - movq %rax, 8*24(%rsp) + movq %rax, ACC+64(%rsp) movl $0, %eax - movq %rax, 8*25(%rsp) - movq %rax, 8*26(%rsp) - movq %rax, 8*27(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) movq 8*8(%r10), %rax movq 8*8(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*28(%rsp) + movq %rax, ACC+96(%rsp) movq 8*9(%r10), %rax movq 8*9(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*29(%rsp) + movq %rax, ACC+104(%rsp) movq 8*10(%r10), %rax movq 8*10(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*30(%rsp) + movq %rax, ACC+112(%rsp) movq 8*11(%r10), %rax movq 8*11(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*31(%rsp) + movq %rax, ACC+120(%rsp) // The counter "i" tracks the bit position for which the scalar has // already been absorbed, starting at 0 and going up in chunks of 4. @@ -512,7 +520,7 @@ edwards25519_scalarmulbase_curve25519_x25519base_standard: // end because we made sure bit 251 is clear in the reduced scalar. 
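// As a rough sketch of the scalar loop that follows (the code itself is the
// authority on the exact recoding), each iteration absorbs one 4-bit window
// of the scalar together with the bias carried out of the previous window,
// roughly:
//
//      bf = bias + ((scalar >> i) & 15);
//      if (bf > 8) { digit = 16 - bf; negate = 1; bias = 1; }
//      else        { digit = bf;      negate = 0; bias = 0; }
//      acc += negate ? -table_block[digit] : table_block[digit];  // digit 0 adds nothing
//
// Keeping the digit magnitude at most 8 is what lets each block of the
// precomputed table hold only 8 entries, with negation handled by the
// constant-time swap and modular negation in the selection code below.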
movq $0, i - leaq edwards25519_scalarmulbase_edwards25519_gtable(%rip), %rax + leaq edwards25519_scalarmulbase_gtable(%rip), %rax movq %rax, tab movq $0, bias @@ -804,26 +812,26 @@ edwards25519_scalarmulbase_scalarloop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 32(%rsp) - movq %r8, 64(%rsp) + movq %rsi, TABENT(%rsp) + movq %r8, TABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 40(%rsp) - movq %r9, 72(%rsp) + movq %rsi, TABENT+8(%rsp) + movq %r9, TABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 48(%rsp) - movq %r10, 80(%rsp) + movq %rsi, TABENT+16(%rsp) + movq %r10, TABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 56(%rsp) - movq %r11, 88(%rsp) + movq %rsi, TABENT+24(%rsp) + movq %r11, TABENT+56(%rsp) movq $-19, %rax movq $-1, %rbx @@ -844,10 +852,10 @@ edwards25519_scalarmulbase_scalarloop: cmovzq %r13, %rbx cmovzq %r14, %rcx cmovzq %r15, %rdx - movq %rax, 96(%rsp) - movq %rbx, 104(%rsp) - movq %rcx, 112(%rsp) - movq %rdx, 120(%rsp) + movq %rax, TABENT+64(%rsp) + movq %rbx, TABENT+72(%rsp) + movq %rcx, TABENT+80(%rsp) + movq %rdx, TABENT+88(%rsp) // Extended-projective and precomputed mixed addition. // This is effectively the same as calling the standalone @@ -884,10 +892,10 @@ edwards25519_scalarmulbase_scalarloop: // point on we don't need any normalization of the coordinates // except for making sure that they fit in 4 digits. - movq 128(%rsp), %r8 - movq 136(%rsp), %r9 - movq 144(%rsp), %r10 - movq 152(%rsp), %r11 + movq X3(%rsp), %r8 + movq X3+8(%rsp), %r9 + movq X3+16(%rsp), %r10 + movq X3+24(%rsp), %r11 movq $0xffffffffffffffda, %r12 subq %r8, %r12 movq $0xffffffffffffffff, %r13 @@ -896,424 +904,1377 @@ edwards25519_scalarmulbase_scalarloop: sbbq %r10, %r14 movq $0xffffffffffffffff, %r15 sbbq %r11, %r15 - movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax btq $63, %rax cmovcq %r12, %r8 cmovcq %r13, %r9 cmovcq %r14, %r10 cmovcq %r15, %r11 - movq %r8, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) + movq %r8, X3(%rsp) + movq %r9, X3+8(%rsp) + movq %r10, X3+16(%rsp) + movq %r11, X3+24(%rsp) // Now we need to map out of the extended-projective representation // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq edwards25519_scalarmulbase_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. 
- - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + leaq W3(%rsp), %rdi + leaq Z3(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, x_3, y_3, +// z_3 and w_3. + + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmulbase_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmulbase_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmulbase_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq 
$0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp edwards25519_scalarmulbase_midloop +edwards25519_scalarmulbase_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmulbase_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 
addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_wmontend -edwards25519_scalarmulbase_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_wmontloop -edwards25519_scalarmulbase_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_wcorrloop + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + 
shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_zmontend -edwards25519_scalarmulbase_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_zmontloop -edwards25519_scalarmulbase_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -edwards25519_scalarmulbase_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +edwards25519_scalarmulbase_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi 
-edwards25519_scalarmulbase_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmulbase_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmulbase_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmulbase_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + 
cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 
+ cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, 
%r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + 
cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq 
%r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne edwards25519_scalarmulbase_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1344,18 +2305,10 @@ edwards25519_scalarmulbase_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -edwards25519_scalarmulbase_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. -edwards25519_scalarmulbase_edwards25519_0g: +edwards25519_scalarmulbase_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 @@ -1372,7 +2325,7 @@ edwards25519_scalarmulbase_edwards25519_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 -edwards25519_scalarmulbase_edwards25519_251g: +edwards25519_scalarmulbase_251g: .quad 0x525f946d7c7220e7 .quad 0x4636b0b2f1e35444 @@ -1390,7 +2343,7 @@ edwards25519_scalarmulbase_edwards25519_251g: // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. 
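// (On the (y-x,x+y,2*d*x*y) form: for an affine edwards25519 point P = (x,y)
// and curve constant d, these are exactly the field elements consumed by the
// precomputed mixed-addition formulas, so storing them directly saves work in
// every table addition. It also makes constant-time negation cheap: since
// -P = (-x,y), negating an entry just swaps the first two components and
// replaces the third by its negation mod p_25519, which is what the cmovnzq
// swaps and the modular negation in the scalar loop above implement.)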
-edwards25519_scalarmulbase_edwards25519_gtable: +edwards25519_scalarmulbase_gtable: // 2^0 * 1 * G diff --git a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S index e66492083f..00b91fe1aa 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S @@ -38,23 +38,22 @@ #define xpy_2 (2*NUMSIZE)(%rsp) #define kxy_2 (3*NUMSIZE)(%rsp) -#define acc (4*NUMSIZE)(%rsp) -#define x_1 (4*NUMSIZE)(%rsp) -#define y_1 (5*NUMSIZE)(%rsp) -#define z_1 (6*NUMSIZE)(%rsp) -#define w_1 (7*NUMSIZE)(%rsp) -#define x_3 (4*NUMSIZE)(%rsp) -#define y_3 (5*NUMSIZE)(%rsp) -#define z_3 (6*NUMSIZE)(%rsp) -#define w_3 (7*NUMSIZE)(%rsp) - -#define tmpspace (8*NUMSIZE)(%rsp) -#define t0 (8*NUMSIZE)(%rsp) -#define t1 (9*NUMSIZE)(%rsp) -#define t2 (10*NUMSIZE)(%rsp) -#define t3 (11*NUMSIZE)(%rsp) -#define t4 (12*NUMSIZE)(%rsp) -#define t5 (13*NUMSIZE)(%rsp) +#define t0 (4*NUMSIZE)(%rsp) +#define t1 (5*NUMSIZE)(%rsp) +#define t2 (6*NUMSIZE)(%rsp) +#define t3 (7*NUMSIZE)(%rsp) +#define t4 (8*NUMSIZE)(%rsp) +#define t5 (9*NUMSIZE)(%rsp) + +#define acc (10*NUMSIZE)(%rsp) +#define x_1 (10*NUMSIZE)(%rsp) +#define y_1 (11*NUMSIZE)(%rsp) +#define z_1 (12*NUMSIZE)(%rsp) +#define w_1 (13*NUMSIZE)(%rsp) +#define x_3 (10*NUMSIZE)(%rsp) +#define y_3 (11*NUMSIZE)(%rsp) +#define z_3 (12*NUMSIZE)(%rsp) +#define w_3 (13*NUMSIZE)(%rsp) // Stable homes for the input result pointer, and other variables @@ -73,6 +72,15 @@ #define NSPACE (15*NUMSIZE+8) +// Syntactic variants to make x86_att version simpler to generate + +#define SCALAR 0 +#define TABENT (1*NUMSIZE) +#define ACC (10*NUMSIZE) +#define X3 (10*NUMSIZE) +#define Z3 (12*NUMSIZE) +#define W3 (13*NUMSIZE) + // Macro wrapping up the basic field multiplication, only trivially // different from a pure function call to bignum_mul_p25519_alt. @@ -413,12 +421,12 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt): pushq %rsi movq %rcx, %rdi movq %rdx, %rsi - callq edwards25519_scalarmulbase_alt_curve25519_x25519base_standard + callq edwards25519_scalarmulbase_alt_standard popq %rsi popq %rdi ret -edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: +edwards25519_scalarmulbase_alt_standard: #endif // Save registers, make room for temps, preserve input arguments. @@ -489,11 +497,11 @@ edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: // And before we store the scalar, test and reset bit 251 to // initialize the main loop just below. - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) btr $59, %r11 - movq %r11, 24(%rsp) + movq %r11, SCALAR+24(%rsp) // The main part of the computation is in extended-projective coordinates // (X,Y,Z,T), representing an affine point on the edwards25519 curve @@ -504,75 +512,75 @@ edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. 
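// (Layout note for the initialization below: the 0g/251g tables store each
// point as (X,Y,T) with Z = 1 left implicit. Accordingly, the first eight
// conditional moves fill the X and Y coordinates of "acc", the explicit
// 1,0,0,0 stores set Z = 1, and the final four conditional moves, reading
// from offsets 8*8..8*11 of the tables, fill T.)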
- leaq edwards25519_scalarmulbase_alt_edwards25519_0g(%rip), %r10 - leaq edwards25519_scalarmulbase_alt_edwards25519_251g(%rip), %r11 + leaq edwards25519_scalarmulbase_alt_0g(%rip), %r10 + leaq edwards25519_scalarmulbase_alt_251g(%rip), %r11 movq (%r10), %rax movq (%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*16(%rsp) + movq %rax, ACC(%rsp) movq 8*1(%r10), %rax movq 8*1(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*17(%rsp) + movq %rax, ACC+8(%rsp) movq 8*2(%r10), %rax movq 8*2(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*18(%rsp) + movq %rax, ACC+16(%rsp) movq 8*3(%r10), %rax movq 8*3(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*19(%rsp) + movq %rax, ACC+24(%rsp) movq 8*4(%r10), %rax movq 8*4(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*20(%rsp) + movq %rax, ACC+32(%rsp) movq 8*5(%r10), %rax movq 8*5(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*21(%rsp) + movq %rax, ACC+40(%rsp) movq 8*6(%r10), %rax movq 8*6(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*22(%rsp) + movq %rax, ACC+48(%rsp) movq 8*7(%r10), %rax movq 8*7(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*23(%rsp) + movq %rax, ACC+56(%rsp) movl $1, %eax - movq %rax, 8*24(%rsp) + movq %rax, ACC+64(%rsp) movl $0, %eax - movq %rax, 8*25(%rsp) - movq %rax, 8*26(%rsp) - movq %rax, 8*27(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) movq 8*8(%r10), %rax movq 8*8(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*28(%rsp) + movq %rax, ACC+96(%rsp) movq 8*9(%r10), %rax movq 8*9(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*29(%rsp) + movq %rax, ACC+104(%rsp) movq 8*10(%r10), %rax movq 8*10(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*30(%rsp) + movq %rax, ACC+112(%rsp) movq 8*11(%r10), %rax movq 8*11(%r11), %rcx cmovcq %rcx, %rax - movq %rax, 8*31(%rsp) + movq %rax, ACC+120(%rsp) // The counter "i" tracks the bit position for which the scalar has // already been absorbed, starting at 0 and going up in chunks of 4. @@ -588,7 +596,7 @@ edwards25519_scalarmulbase_alt_curve25519_x25519base_standard: // end because we made sure bit 251 is clear in the reduced scalar. movq $0, i - leaq edwards25519_scalarmulbase_alt_edwards25519_gtable(%rip), %rax + leaq edwards25519_scalarmulbase_alt_gtable(%rip), %rax movq %rax, tab movq $0, bias @@ -880,26 +888,26 @@ edwards25519_scalarmulbase_alt_scalarloop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 32(%rsp) - movq %r8, 64(%rsp) + movq %rsi, TABENT(%rsp) + movq %r8, TABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 40(%rsp) - movq %r9, 72(%rsp) + movq %rsi, TABENT+8(%rsp) + movq %r9, TABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 48(%rsp) - movq %r10, 80(%rsp) + movq %rsi, TABENT+16(%rsp) + movq %r10, TABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 56(%rsp) - movq %r11, 88(%rsp) + movq %rsi, TABENT+24(%rsp) + movq %r11, TABENT+56(%rsp) movq $-19, %rax movq $-1, %rbx @@ -920,10 +928,10 @@ edwards25519_scalarmulbase_alt_scalarloop: cmovzq %r13, %rbx cmovzq %r14, %rcx cmovzq %r15, %rdx - movq %rax, 96(%rsp) - movq %rbx, 104(%rsp) - movq %rcx, 112(%rsp) - movq %rdx, 120(%rsp) + movq %rax, TABENT+64(%rsp) + movq %rbx, TABENT+72(%rsp) + movq %rcx, TABENT+80(%rsp) + movq %rdx, TABENT+88(%rsp) // Extended-projective and precomputed mixed addition. 
// This is effectively the same as calling the standalone @@ -960,10 +968,10 @@ edwards25519_scalarmulbase_alt_scalarloop: // point on we don't need any normalization of the coordinates // except for making sure that they fit in 4 digits. - movq 128(%rsp), %r8 - movq 136(%rsp), %r9 - movq 144(%rsp), %r10 - movq 152(%rsp), %r11 + movq X3(%rsp), %r8 + movq X3+8(%rsp), %r9 + movq X3+16(%rsp), %r10 + movq X3+24(%rsp), %r11 movq $0xffffffffffffffda, %r12 subq %r8, %r12 movq $0xffffffffffffffff, %r13 @@ -972,424 +980,1377 @@ edwards25519_scalarmulbase_alt_scalarloop: sbbq %r10, %r14 movq $0xffffffffffffffff, %r15 sbbq %r11, %r15 - movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax btq $63, %rax cmovcq %r12, %r8 cmovcq %r13, %r9 cmovcq %r14, %r10 cmovcq %r15, %r11 - movq %r8, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) + movq %r8, X3(%rsp) + movq %r9, X3+8(%rsp) + movq %r10, X3+16(%rsp) + movq %r11, X3+24(%rsp) // Now we need to map out of the extended-projective representation // (X,Y,Z,W) back to the affine form (x,y) = (X/Z,Y/Z). This means // first calling the modular inverse to get w_3 = 1/z_3. - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 192(%rsp), %rdx - leaq edwards25519_scalarmulbase_alt_p_25519(%rip), %rcx - leaq 256(%rsp), %r8 - -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. - - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + leaq W3(%rsp), %rdi + leaq Z3(%rsp), %rsi + +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, x_3, y_3, +// z_3 and w_3. 
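+// (What the setup below does, briefly: instead of passing a pointer to a
+// p_25519 rodata constant as the old bignum_modinv call did (see the removed
+// leaq above), it materializes p_25519 = 2^255 - 19 directly in the first
+// four stack words, building it from -0x13 = -19, two all-ones words and an
+// all-ones word with bit 63 cleared, so the separate rodata constant is no
+// longer referenced. The input z_3 is then loaded and put into fully reduced
+// form mod p_25519, and the counter 0xa written to 0x90(%rsp) controls the
+// number of outer iterations of the inverse's main loop.)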
+ + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmulbase_alt_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_alt_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmulbase_alt_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp edwards25519_scalarmulbase_alt_midloop +edwards25519_scalarmulbase_alt_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmulbase_alt_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 
0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_alt_wmontend -edwards25519_scalarmulbase_alt_wmontloop: - adcq 
(%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_wmontloop -edwards25519_scalarmulbase_alt_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_wcorrloop + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmulbase_alt_zmontend -edwards25519_scalarmulbase_alt_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_zmontloop -edwards25519_scalarmulbase_alt_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmulbase_alt_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + 
addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmulbase_alt_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -edwards25519_scalarmulbase_alt_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +edwards25519_scalarmulbase_alt_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_alt_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -edwards25519_scalarmulbase_alt_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmulbase_alt_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmulbase_alt_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmulbase_alt_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmulbase_alt_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb 
edwards25519_scalarmulbase_alt_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmulbase_alt_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq 
%rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq 
%r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, 
%rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq 
%r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne edwards25519_scalarmulbase_alt_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + 
movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // The final result is x = X * inv(Z), y = Y * inv(Z). // These are the only operations in the whole computation that @@ -1420,18 +2381,10 @@ edwards25519_scalarmulbase_alt_fliploop: // .section .rodata // **************************************************************************** -// The modulus, for the modular inverse - -edwards25519_scalarmulbase_alt_p_25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // 0 * B = 0 and 2^251 * B in extended-projective coordinates // but with Z = 1 assumed and hence left out, so they are (X,Y,T) only. -edwards25519_scalarmulbase_alt_edwards25519_0g: +edwards25519_scalarmulbase_alt_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 @@ -1448,7 +2401,7 @@ edwards25519_scalarmulbase_alt_edwards25519_0g: .quad 0x0000000000000000 .quad 0x0000000000000000 -edwards25519_scalarmulbase_alt_edwards25519_251g: +edwards25519_scalarmulbase_alt_251g: .quad 0x525f946d7c7220e7 .quad 0x4636b0b2f1e35444 @@ -1466,7 +2419,7 @@ edwards25519_scalarmulbase_alt_edwards25519_251g: // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. 
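// Storing each entry as (y-x,x+y,2*d*x*y) rather than plain (x,y) lets the
// precomputed mixed addition consume the table values directly, without
// recomputing those combinations for every addition.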
-edwards25519_scalarmulbase_alt_edwards25519_gtable: +edwards25519_scalarmulbase_alt_gtable: // 2^0 * 1 * G diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble.S b/x86_att/curve25519/edwards25519_scalarmuldouble.S index 0138d1a4b2..35fd7f4ffc 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -42,24 +42,33 @@ #define scalar (0*NUMSIZE)(%rsp) #define bscalar (1*NUMSIZE)(%rsp) -#define acc (3*NUMSIZE)(%rsp) +#define tabent (2*NUMSIZE)(%rsp) +#define btabent (6*NUMSIZE)(%rsp) -#define tabent (7*NUMSIZE)(%rsp) -#define btabent (11*NUMSIZE)(%rsp) +#define acc (9*NUMSIZE)(%rsp) -#define tab (14*NUMSIZE)(%rsp) +#define tab (13*NUMSIZE)(%rsp) // Additional variables kept on the stack -#define bf 2*NUMSIZE(%rsp) -#define cf 2*NUMSIZE+8(%rsp) -#define i 2*NUMSIZE+16(%rsp) -#define res 2*NUMSIZE+24(%rsp) +#define bf 45*NUMSIZE(%rsp) +#define cf 45*NUMSIZE+8(%rsp) +#define i 45*NUMSIZE+16(%rsp) +#define res 45*NUMSIZE+24(%rsp) // Total size to reserve on the stack (excluding local subroutines) #define NSPACE (46*NUMSIZE) +// Syntactic variants to make x86_att forms easier to generate + +#define SCALAR (0*NUMSIZE) +#define BSCALAR (1*NUMSIZE) +#define TABENT (2*NUMSIZE) +#define BTABENT (6*NUMSIZE) +#define ACC (9*NUMSIZE) +#define TAB (13*NUMSIZE) + // Sub-references used in local subroutines with local stack #define x_0 0(%rdi) @@ -493,10 +502,10 @@ edwards25519_scalarmuldouble_standard: adcq %r13, %r9 adcq %r14, %r10 adcq %r15, %r11 - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq %r10, 48(%rsp) - movq %r11, 56(%rsp) + movq %r8, BSCALAR(%rsp) + movq %r9, BSCALAR+8(%rsp) + movq %r10, BSCALAR+16(%rsp) + movq %r11, BSCALAR+24(%rsp) movq (%rsi), %r8 movq 8(%rsi), %r9 @@ -517,10 +526,10 @@ edwards25519_scalarmuldouble_standard: adcq %r13, %r9 adcq %r14, %r10 adcq %r15, %r11 - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) + movq %r11, SCALAR+24(%rsp) // Create table of multiples 1..8 of the general input point at "tab". 
// Reduce the input coordinates x and y modulo 2^256 - 38 first, for the @@ -541,13 +550,13 @@ edwards25519_scalarmuldouble_standard: adcq %r10, %rcx adcq %r11, %rsi cmovncq %r8, %rax - movq %rax, 448(%rsp) + movq %rax, TAB(%rsp) cmovncq %r9, %rbx - movq %rbx, 456(%rsp) + movq %rbx, TAB+8(%rsp) cmovncq %r10, %rcx - movq %rcx, 464(%rsp) + movq %rcx, TAB+16(%rsp) cmovncq %r11, %rsi - movq %rsi, 472(%rsp) + movq %rsi, TAB+24(%rsp) movl $38, %eax movq 32(%rdx), %r8 @@ -562,69 +571,69 @@ edwards25519_scalarmuldouble_standard: adcq %r10, %rcx adcq %r11, %rsi cmovncq %r8, %rax - movq %rax, 480(%rsp) + movq %rax, TAB+32(%rsp) cmovncq %r9, %rbx - movq %rbx, 488(%rsp) + movq %rbx, TAB+40(%rsp) cmovncq %r10, %rcx - movq %rcx, 496(%rsp) + movq %rcx, TAB+48(%rsp) cmovncq %r11, %rsi - movq %rsi, 504(%rsp) + movq %rsi, TAB+56(%rsp) movl $1, %eax - movq %rax, 512(%rsp) + movq %rax, TAB+64(%rsp) xorl %eax, %eax - movq %rax, 520(%rsp) - movq %rax, 528(%rsp) - movq %rax, 536(%rsp) + movq %rax, TAB+72(%rsp) + movq %rax, TAB+80(%rsp) + movq %rax, TAB+88(%rsp) - leaq 544(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 480(%rsp), %rbp + leaq TAB+96(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+32(%rsp), %rbp mul_4(x_0,x_1,x_2) // Multiple 2 - leaq 576(%rsp), %rdi - leaq 448(%rsp), %rsi + leaq TAB+1*128(%rsp), %rdi + leaq TAB(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Multiple 3 - leaq 704(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 576(%rsp), %rbp + leaq TAB+2*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+1*128(%rsp), %rbp callq edwards25519_scalarmuldouble_epadd // Multiple 4 - leaq 832(%rsp), %rdi - leaq 576(%rsp), %rsi + leaq TAB+3*128(%rsp), %rdi + leaq TAB+1*128(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Multiple 5 - leaq 960(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 832(%rsp), %rbp + leaq TAB+4*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+3*128(%rsp), %rbp callq edwards25519_scalarmuldouble_epadd // Multiple 6 - leaq 1088(%rsp), %rdi - leaq 704(%rsp), %rsi + leaq TAB+5*128(%rsp), %rdi + leaq TAB+2*128(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Multiple 7 - leaq 1216(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 1088(%rsp), %rbp + leaq TAB+6*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+5*128(%rsp), %rbp callq edwards25519_scalarmuldouble_epadd // Multiple 8 - leaq 1344(%rsp), %rdi - leaq 832(%rsp), %rsi + leaq TAB+7*128(%rsp), %rdi + leaq TAB+3*128(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Handle the initialization, starting the loop counter at i = 252 @@ -636,7 +645,7 @@ edwards25519_scalarmuldouble_standard: // Index for btable entry... - movq 56(%rsp), %rax + movq BSCALAR+24(%rsp), %rax shrq $60, %rax movq %rax, bf @@ -872,22 +881,22 @@ edwards25519_scalarmuldouble_standard: movq 88(%rbp), %rsi cmovzq %rsi, %r15 - movq %rax, 352(%rsp) - movq %rbx, 360(%rsp) - movq %rcx, 368(%rsp) - movq %rdx, 376(%rsp) - movq %r8, 384(%rsp) - movq %r9, 392(%rsp) - movq %r10, 400(%rsp) - movq %r11, 408(%rsp) - movq %r12, 416(%rsp) - movq %r13, 424(%rsp) - movq %r14, 432(%rsp) - movq %r15, 440(%rsp) + movq %rax, BTABENT(%rsp) + movq %rbx, BTABENT+8(%rsp) + movq %rcx, BTABENT+16(%rsp) + movq %rdx, BTABENT+24(%rsp) + movq %r8, BTABENT+32(%rsp) + movq %r9, BTABENT+40(%rsp) + movq %r10, BTABENT+48(%rsp) + movq %r11, BTABENT+56(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) // Index for table entry... 
- movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax shrq $60, %rax movq %rax, bf @@ -903,7 +912,7 @@ edwards25519_scalarmuldouble_standard: xorl %r10d, %r10d xorl %r11d, %r11d - leaq 480(%rsp), %rbp + leaq TAB+32(%rsp), %rbp cmpq $1, bf movq (%rbp), %rsi @@ -1056,18 +1065,18 @@ edwards25519_scalarmuldouble_standard: movq 56(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 256(%rsp) - movq %rbx, 264(%rsp) - movq %rcx, 272(%rsp) - movq %rdx, 280(%rsp) - movq %r8, 288(%rsp) - movq %r9, 296(%rsp) - movq %r10, 304(%rsp) - movq %r11, 312(%rsp) + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) // ...followed by the X and W fields - leaq 448(%rsp), %rbp + leaq TAB(%rsp), %rbp xorl %eax, %eax xorl %ebx, %ebx @@ -1229,20 +1238,20 @@ edwards25519_scalarmuldouble_standard: movq 120(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 224(%rsp) - movq %rbx, 232(%rsp) - movq %rcx, 240(%rsp) - movq %rdx, 248(%rsp) - movq %r8, 320(%rsp) - movq %r9, 328(%rsp) - movq %r10, 336(%rsp) - movq %r11, 344(%rsp) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) // Add those elements to initialize the accumulator for bit position 252 - leaq 96(%rsp), %rdi - leaq 224(%rsp), %rsi - leaq 352(%rsp), %rbp + leaq ACC(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_pepadd // Main loop with acc = [scalar/2^i] * point + [bscalar/2^i] * basepoint @@ -1256,8 +1265,8 @@ edwards25519_scalarmuldouble_loop: // Double to acc' = 2 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_pdouble // Get btable entry, first getting the adjusted bitfield... @@ -1528,26 +1537,26 @@ edwards25519_scalarmuldouble_loop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 352(%rsp) - movq %r8, 384(%rsp) + movq %rsi, BTABENT(%rsp) + movq %r8, BTABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 360(%rsp) - movq %r9, 392(%rsp) + movq %rsi, BTABENT+8(%rsp) + movq %r9, BTABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 368(%rsp) - movq %r10, 400(%rsp) + movq %rsi, BTABENT+16(%rsp) + movq %r10, BTABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 376(%rsp) - movq %r11, 408(%rsp) + movq %rsi, BTABENT+24(%rsp) + movq %r11, BTABENT+56(%rsp) xorq %rdi, %r12 xorq %rdi, %r13 @@ -1558,10 +1567,10 @@ edwards25519_scalarmuldouble_loop: sbbq $0, %r13 sbbq $0, %r14 sbbq $0, %r15 - movq %r12, 416(%rsp) - movq %r13, 424(%rsp) - movq %r14, 432(%rsp) - movq %r15, 440(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) // Get table entry, first getting the adjusted bitfield... 
@@ -1592,7 +1601,7 @@ edwards25519_scalarmuldouble_loop: xorl %r10d, %r10d xorl %r11d, %r11d - leaq 480(%rsp), %rbp + leaq TAB+32(%rsp), %rbp cmpq $1, bf movq (%rbp), %rsi @@ -1745,18 +1754,18 @@ edwards25519_scalarmuldouble_loop: movq 56(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 256(%rsp) - movq %rbx, 264(%rsp) - movq %rcx, 272(%rsp) - movq %rdx, 280(%rsp) - movq %r8, 288(%rsp) - movq %r9, 296(%rsp) - movq %r10, 304(%rsp) - movq %r11, 312(%rsp) + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) // Now do the X and W fields... - leaq 448(%rsp), %rbp + leaq TAB(%rsp), %rbp xorl %eax, %eax xorl %ebx, %ebx @@ -1950,51 +1959,51 @@ edwards25519_scalarmuldouble_loop: sbbq $0, %rcx sbbq $0, %rdx - movq %rax, 224(%rsp) - movq %rbx, 232(%rsp) - movq %rcx, 240(%rsp) - movq %rdx, 248(%rsp) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) subq %rdi, %r8 sbbq $0, %r9 sbbq $0, %r10 sbbq $0, %r11 - movq %r8, 320(%rsp) - movq %r9, 328(%rsp) - movq %r10, 336(%rsp) - movq %r11, 344(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) // Double to acc' = 4 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_pdouble // Add tabent := tabent + btabent - leaq 224(%rsp), %rdi - leaq 224(%rsp), %rsi - leaq 352(%rsp), %rbp + leaq TABENT(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_pepadd // Double to acc' = 8 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_pdouble // Double to acc' = 16 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_epdouble // Add table entry, acc := acc + tabent - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_epadd // Loop down @@ -2003,423 +2012,1375 @@ edwards25519_scalarmuldouble_loop: testq %rax, %rax jnz edwards25519_scalarmuldouble_loop -// Modular inverse setup +// Prepare to call the modular inverse function to get tab = 1/z - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 160(%rsp), %rdx - leaq edwards25519_scalarmuldouble_p25519(%rip), %rcx - leaq 352(%rsp), %r8 + leaq TAB(%rsp), %rdi + leaq ACC+64(%rsp), %rsi -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, tab and acc. 
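+//
+// Concretely this computes tab = z^-1 (mod p_25519) with p_25519 = 2^255 - 19,
+// reading the accumulator's z coordinate from ACC+64(%rsp) (see the leaq
+// setup above) and writing the result to TAB(%rsp) via the result pointer
+// stashed at 0xc0(%rsp).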
- movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmuldouble_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmuldouble_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmuldouble_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_midloop +curve25519_x25519_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq 
%r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmuldouble_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, 
%rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmuldouble_wmontend -edwards25519_scalarmuldouble_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_wmontloop -edwards25519_scalarmuldouble_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmuldouble_zmontend -edwards25519_scalarmuldouble_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, 
-0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_zmontloop -edwards25519_scalarmuldouble_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -edwards25519_scalarmuldouble_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -edwards25519_scalarmuldouble_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, %rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmuldouble_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax 
- movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmuldouble_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmuldouble_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq 
%rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + 
sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi 
+ testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq 
$1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_inverseloop + movq (%rsp), %rax + movq 0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, 
%r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // Store result movq res, %rdi - leaq 96(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC(%rsp), %rsi + leaq TAB(%rsp), %rbp mul_p25519(x_0,x_1,x_2) movq res, %rdi addq $32, %rdi - leaq 128(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC+32(%rsp), %rsi + leaq TAB(%rsp), %rbp mul_p25519(x_0,x_1,x_2) // Restore stack and registers @@ -2528,14 +3489,6 @@ edwards25519_scalarmuldouble_pepadd: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmuldouble_p25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. 
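A minimal Python sketch of what the inlined inverse plus the "// Store result" sequence above computes, assuming p_25519 = 2^255 - 19 (the constant the hunk above removes) and using illustrative names (to_affine, acc_x, acc_y, acc_z) that do not appear in the source:

    # Functional model only: the real code is constant-time and uses the
    # iterative loops labelled curve25519_x25519_inverseloop/midloop above,
    # not a modular exponentiation.
    P25519 = 2**255 - 19

    def to_affine(acc_x, acc_y, acc_z):
        z_inv = pow(acc_z, P25519 - 2, P25519)   # tab = 1/z mod p_25519
        # The two mul_p25519 calls after "// Store result" perform these
        # two modular multiplications to produce the affine x and y.
        return (acc_x * z_inv) % P25519, (acc_y * z_inv) % P25519

The same sketch applies to the _alt variant below, which inlines the same bignum_inv_p25519 code.
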
diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index 7f3dffa395..e17d10b47a 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -42,24 +42,33 @@ #define scalar (0*NUMSIZE)(%rsp) #define bscalar (1*NUMSIZE)(%rsp) -#define acc (3*NUMSIZE)(%rsp) +#define tabent (2*NUMSIZE)(%rsp) +#define btabent (6*NUMSIZE)(%rsp) -#define tabent (7*NUMSIZE)(%rsp) -#define btabent (11*NUMSIZE)(%rsp) +#define acc (9*NUMSIZE)(%rsp) -#define tab (14*NUMSIZE)(%rsp) +#define tab (13*NUMSIZE)(%rsp) // Additional variables kept on the stack -#define bf 2*NUMSIZE(%rsp) -#define cf 2*NUMSIZE+8(%rsp) -#define i 2*NUMSIZE+16(%rsp) -#define res 2*NUMSIZE+24(%rsp) +#define bf 45*NUMSIZE(%rsp) +#define cf 45*NUMSIZE+8(%rsp) +#define i 45*NUMSIZE+16(%rsp) +#define res 45*NUMSIZE+24(%rsp) // Total size to reserve on the stack (excluding local subroutines) #define NSPACE (46*NUMSIZE) +// Syntactic variants to make x86_att forms easier to generate + +#define SCALAR (0*NUMSIZE) +#define BSCALAR (1*NUMSIZE) +#define TABENT (2*NUMSIZE) +#define BTABENT (6*NUMSIZE) +#define ACC (9*NUMSIZE) +#define TAB (13*NUMSIZE) + // Sub-references used in local subroutines with local stack #define x_0 0(%rdi) @@ -610,10 +619,10 @@ edwards25519_scalarmuldouble_alt_standard: adcq %r13, %r9 adcq %r14, %r10 adcq %r15, %r11 - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq %r10, 48(%rsp) - movq %r11, 56(%rsp) + movq %r8, BSCALAR(%rsp) + movq %r9, BSCALAR+8(%rsp) + movq %r10, BSCALAR+16(%rsp) + movq %r11, BSCALAR+24(%rsp) movq (%rsi), %r8 movq 8(%rsi), %r9 @@ -634,10 +643,10 @@ edwards25519_scalarmuldouble_alt_standard: adcq %r13, %r9 adcq %r14, %r10 adcq %r15, %r11 - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) + movq %r8, SCALAR(%rsp) + movq %r9, SCALAR+8(%rsp) + movq %r10, SCALAR+16(%rsp) + movq %r11, SCALAR+24(%rsp) // Create table of multiples 1..8 of the general input point at "tab". 
// Reduce the input coordinates x and y modulo 2^256 - 38 first, for the @@ -658,13 +667,13 @@ edwards25519_scalarmuldouble_alt_standard: adcq %r10, %rcx adcq %r11, %rsi cmovncq %r8, %rax - movq %rax, 448(%rsp) + movq %rax, TAB(%rsp) cmovncq %r9, %rbx - movq %rbx, 456(%rsp) + movq %rbx, TAB+8(%rsp) cmovncq %r10, %rcx - movq %rcx, 464(%rsp) + movq %rcx, TAB+16(%rsp) cmovncq %r11, %rsi - movq %rsi, 472(%rsp) + movq %rsi, TAB+24(%rsp) movl $38, %eax movq 32(%rdx), %r8 @@ -679,69 +688,69 @@ edwards25519_scalarmuldouble_alt_standard: adcq %r10, %rcx adcq %r11, %rsi cmovncq %r8, %rax - movq %rax, 480(%rsp) + movq %rax, TAB+32(%rsp) cmovncq %r9, %rbx - movq %rbx, 488(%rsp) + movq %rbx, TAB+40(%rsp) cmovncq %r10, %rcx - movq %rcx, 496(%rsp) + movq %rcx, TAB+48(%rsp) cmovncq %r11, %rsi - movq %rsi, 504(%rsp) + movq %rsi, TAB+56(%rsp) movl $1, %eax - movq %rax, 512(%rsp) + movq %rax, TAB+64(%rsp) xorl %eax, %eax - movq %rax, 520(%rsp) - movq %rax, 528(%rsp) - movq %rax, 536(%rsp) + movq %rax, TAB+72(%rsp) + movq %rax, TAB+80(%rsp) + movq %rax, TAB+88(%rsp) - leaq 544(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 480(%rsp), %rbp + leaq TAB+96(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+32(%rsp), %rbp mul_4(x_0,x_1,x_2) // Multiple 2 - leaq 576(%rsp), %rdi - leaq 448(%rsp), %rsi + leaq TAB+1*128(%rsp), %rdi + leaq TAB(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Multiple 3 - leaq 704(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 576(%rsp), %rbp + leaq TAB+2*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+1*128(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_epadd // Multiple 4 - leaq 832(%rsp), %rdi - leaq 576(%rsp), %rsi + leaq TAB+3*128(%rsp), %rdi + leaq TAB+1*128(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Multiple 5 - leaq 960(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 832(%rsp), %rbp + leaq TAB+4*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+3*128(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_epadd // Multiple 6 - leaq 1088(%rsp), %rdi - leaq 704(%rsp), %rsi + leaq TAB+5*128(%rsp), %rdi + leaq TAB+2*128(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Multiple 7 - leaq 1216(%rsp), %rdi - leaq 448(%rsp), %rsi - leaq 1088(%rsp), %rbp + leaq TAB+6*128(%rsp), %rdi + leaq TAB(%rsp), %rsi + leaq TAB+5*128(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_epadd // Multiple 8 - leaq 1344(%rsp), %rdi - leaq 832(%rsp), %rsi + leaq TAB+7*128(%rsp), %rdi + leaq TAB+3*128(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Handle the initialization, starting the loop counter at i = 252 @@ -753,7 +762,7 @@ edwards25519_scalarmuldouble_alt_standard: // Index for btable entry... - movq 56(%rsp), %rax + movq BSCALAR+24(%rsp), %rax shrq $60, %rax movq %rax, bf @@ -989,22 +998,22 @@ edwards25519_scalarmuldouble_alt_standard: movq 88(%rbp), %rsi cmovzq %rsi, %r15 - movq %rax, 352(%rsp) - movq %rbx, 360(%rsp) - movq %rcx, 368(%rsp) - movq %rdx, 376(%rsp) - movq %r8, 384(%rsp) - movq %r9, 392(%rsp) - movq %r10, 400(%rsp) - movq %r11, 408(%rsp) - movq %r12, 416(%rsp) - movq %r13, 424(%rsp) - movq %r14, 432(%rsp) - movq %r15, 440(%rsp) + movq %rax, BTABENT(%rsp) + movq %rbx, BTABENT+8(%rsp) + movq %rcx, BTABENT+16(%rsp) + movq %rdx, BTABENT+24(%rsp) + movq %r8, BTABENT+32(%rsp) + movq %r9, BTABENT+40(%rsp) + movq %r10, BTABENT+48(%rsp) + movq %r11, BTABENT+56(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) // Index for table entry... 
- movq 24(%rsp), %rax + movq SCALAR+24(%rsp), %rax shrq $60, %rax movq %rax, bf @@ -1020,7 +1029,7 @@ edwards25519_scalarmuldouble_alt_standard: xorl %r10d, %r10d xorl %r11d, %r11d - leaq 480(%rsp), %rbp + leaq TAB+32(%rsp), %rbp cmpq $1, bf movq (%rbp), %rsi @@ -1173,18 +1182,18 @@ edwards25519_scalarmuldouble_alt_standard: movq 56(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 256(%rsp) - movq %rbx, 264(%rsp) - movq %rcx, 272(%rsp) - movq %rdx, 280(%rsp) - movq %r8, 288(%rsp) - movq %r9, 296(%rsp) - movq %r10, 304(%rsp) - movq %r11, 312(%rsp) + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) // ...followed by the X and W fields - leaq 448(%rsp), %rbp + leaq TAB(%rsp), %rbp xorl %eax, %eax xorl %ebx, %ebx @@ -1346,20 +1355,20 @@ edwards25519_scalarmuldouble_alt_standard: movq 120(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 224(%rsp) - movq %rbx, 232(%rsp) - movq %rcx, 240(%rsp) - movq %rdx, 248(%rsp) - movq %r8, 320(%rsp) - movq %r9, 328(%rsp) - movq %r10, 336(%rsp) - movq %r11, 344(%rsp) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) // Add those elements to initialize the accumulator for bit position 252 - leaq 96(%rsp), %rdi - leaq 224(%rsp), %rsi - leaq 352(%rsp), %rbp + leaq ACC(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_pepadd // Main loop with acc = [scalar/2^i] * point + [bscalar/2^i] * basepoint @@ -1373,8 +1382,8 @@ edwards25519_scalarmuldouble_alt_loop: // Double to acc' = 2 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_pdouble // Get btable entry, first getting the adjusted bitfield... @@ -1645,26 +1654,26 @@ edwards25519_scalarmuldouble_alt_loop: movq %rax, %rsi cmovnzq %r8, %rsi cmovnzq %rax, %r8 - movq %rsi, 352(%rsp) - movq %r8, 384(%rsp) + movq %rsi, BTABENT(%rsp) + movq %r8, BTABENT+32(%rsp) movq %rbx, %rsi cmovnzq %r9, %rsi cmovnzq %rbx, %r9 - movq %rsi, 360(%rsp) - movq %r9, 392(%rsp) + movq %rsi, BTABENT+8(%rsp) + movq %r9, BTABENT+40(%rsp) movq %rcx, %rsi cmovnzq %r10, %rsi cmovnzq %rcx, %r10 - movq %rsi, 368(%rsp) - movq %r10, 400(%rsp) + movq %rsi, BTABENT+16(%rsp) + movq %r10, BTABENT+48(%rsp) movq %rdx, %rsi cmovnzq %r11, %rsi cmovnzq %rdx, %r11 - movq %rsi, 376(%rsp) - movq %r11, 408(%rsp) + movq %rsi, BTABENT+24(%rsp) + movq %r11, BTABENT+56(%rsp) xorq %rdi, %r12 xorq %rdi, %r13 @@ -1675,10 +1684,10 @@ edwards25519_scalarmuldouble_alt_loop: sbbq $0, %r13 sbbq $0, %r14 sbbq $0, %r15 - movq %r12, 416(%rsp) - movq %r13, 424(%rsp) - movq %r14, 432(%rsp) - movq %r15, 440(%rsp) + movq %r12, BTABENT+64(%rsp) + movq %r13, BTABENT+72(%rsp) + movq %r14, BTABENT+80(%rsp) + movq %r15, BTABENT+88(%rsp) // Get table entry, first getting the adjusted bitfield... 
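The cmpq/cmovzq chains in the hunks above and below select one table row without branching on the bitfield or indexing memory with it: every candidate entry is read, and a conditional move keeps only the matching one. A rough Python model of that data flow, with illustrative names (select_row, rows, bf, init) not taken from the source:

    MASK64 = (1 << 64) - 1

    def select_row(init, rows, bf):
        # init: pre-loaded register values (kept when bf matches no row);
        # rows: candidate table entries, each a tuple of 64-bit limbs;
        # bf:   the digit extracted from the scalar.
        acc = list(init)
        for k, row in enumerate(rows, start=1):
            mask = MASK64 if k == bf else 0   # models cmpq $k, bf
            keep = mask ^ MASK64
            acc = [(limb & mask) | (a & keep)          # models cmovzq
                   for a, limb in zip(acc, row)]
        return acc

Python itself is not constant-time; the sketch only models the selection's data flow. The conditional field swap and the mask-based negation in the hunk above handle the entry's sign in the same branch-free style.
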
@@ -1709,7 +1718,7 @@ edwards25519_scalarmuldouble_alt_loop: xorl %r10d, %r10d xorl %r11d, %r11d - leaq 480(%rsp), %rbp + leaq TAB+32(%rsp), %rbp cmpq $1, bf movq (%rbp), %rsi @@ -1862,18 +1871,18 @@ edwards25519_scalarmuldouble_alt_loop: movq 56(%rbp), %rsi cmovzq %rsi, %r11 - movq %rax, 256(%rsp) - movq %rbx, 264(%rsp) - movq %rcx, 272(%rsp) - movq %rdx, 280(%rsp) - movq %r8, 288(%rsp) - movq %r9, 296(%rsp) - movq %r10, 304(%rsp) - movq %r11, 312(%rsp) + movq %rax, TABENT+32(%rsp) + movq %rbx, TABENT+40(%rsp) + movq %rcx, TABENT+48(%rsp) + movq %rdx, TABENT+56(%rsp) + movq %r8, TABENT+64(%rsp) + movq %r9, TABENT+72(%rsp) + movq %r10, TABENT+80(%rsp) + movq %r11, TABENT+88(%rsp) // Now do the X and W fields... - leaq 448(%rsp), %rbp + leaq TAB(%rsp), %rbp xorl %eax, %eax xorl %ebx, %ebx @@ -2067,51 +2076,51 @@ edwards25519_scalarmuldouble_alt_loop: sbbq $0, %rcx sbbq $0, %rdx - movq %rax, 224(%rsp) - movq %rbx, 232(%rsp) - movq %rcx, 240(%rsp) - movq %rdx, 248(%rsp) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) subq %rdi, %r8 sbbq $0, %r9 sbbq $0, %r10 sbbq $0, %r11 - movq %r8, 320(%rsp) - movq %r9, 328(%rsp) - movq %r10, 336(%rsp) - movq %r11, 344(%rsp) + movq %r8, TABENT+96(%rsp) + movq %r9, TABENT+104(%rsp) + movq %r10, TABENT+112(%rsp) + movq %r11, TABENT+120(%rsp) // Double to acc' = 4 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_pdouble // Add tabent := tabent + btabent - leaq 224(%rsp), %rdi - leaq 224(%rsp), %rsi - leaq 352(%rsp), %rbp + leaq TABENT(%rsp), %rdi + leaq TABENT(%rsp), %rsi + leaq BTABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_pepadd // Double to acc' = 8 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_pdouble // Double to acc' = 16 * acc - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi callq edwards25519_scalarmuldouble_alt_epdouble // Add table entry, acc := acc + tabent - leaq 96(%rsp), %rdi - leaq 96(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rbp callq edwards25519_scalarmuldouble_alt_epadd // Loop down @@ -2120,423 +2129,1375 @@ edwards25519_scalarmuldouble_alt_loop: testq %rax, %rax jnz edwards25519_scalarmuldouble_alt_loop -// Modular inverse setup +// Prepare to call the modular inverse function to get tab = 1/z - movq $4, %rdi - leaq 224(%rsp), %rsi - leaq 160(%rsp), %rdx - leaq edwards25519_scalarmuldouble_alt_p25519(%rip), %rcx - leaq 352(%rsp), %r8 + leaq TAB(%rsp), %rdi + leaq ACC+64(%rsp), %rsi -// Inline copy of bignum_modinv, identical except for stripping out the -// prologue and epilogue saving and restoring registers and the initial -// test for k = 0 (which is trivially false here since k = 4). For more -// details and explanations see "x86/generic/bignum_modinv.S". Note -// that the stack it uses for its own temporaries is 80 bytes so it -// only overwrites local variables that are no longer needed. +// Inline copy of bignum_inv_p25519, identical except for stripping out +// the prologue and epilogue saving and restoring registers and making +// and reclaiming room on the stack. For more details and explanations see +// "x86/curve25519/bignum_inv_p25519.S". 
Note that the stack it uses for +// its own temporaries is 208 bytes, so it has no effect on variables +// that are needed in the rest of our computation here: res, tab and acc. - movq %rsi, 0x40(%rsp) - movq %r8, 0x38(%rsp) - movq %rcx, 0x48(%rsp) - leaq (%r8,%rdi,8), %r10 - movq %r10, 0x30(%rsp) - leaq (%r10,%rdi,8), %r15 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_copyloop: - movq (%rdx,%r9,8), %rax - movq (%rcx,%r9,8), %rbx - movq %rax, (%r10,%r9,8) - movq %rbx, (%r15,%r9,8) - movq %rbx, (%r8,%r9,8) - movq %r11, (%rsi,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_copyloop - movq (%r8), %rax - movq %rax, %rbx - decq %rbx - movq %rbx, (%r8) - movq %rax, %rbp - movq %rax, %r12 - shlq $0x2, %rbp - subq %rbp, %r12 - xorq $0x2, %r12 - movq %r12, %rbp - imulq %rax, %rbp - movl $0x2, %eax - addq %rbp, %rax - addq $0x1, %rbp - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp - movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - imulq %rbp, %rbp + movq %rdi, 0xc0(%rsp) + xorl %eax, %eax + leaq -0x13(%rax), %rcx + notq %rax + movq %rcx, (%rsp) + movq %rax, 0x8(%rsp) + movq %rax, 0x10(%rsp) + btr $0x3f, %rax + movq %rax, 0x18(%rsp) + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 movl $0x1, %eax - addq %rbp, %rax - imulq %rax, %r12 - movq %r12, 0x28(%rsp) - movq %rdi, %rax - shlq $0x7, %rax - movq %rax, 0x20(%rsp) -edwards25519_scalarmuldouble_alt_outerloop: - movq 0x20(%rsp), %r13 - addq $0x3f, %r13 - shrq $0x6, %r13 - cmpq %rdi, %r13 - cmovaeq %rdi, %r13 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi - xorq %r11, %r11 - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_toploop: - movq (%r8,%r9,8), %rbx - movq (%r15,%r9,8), %rcx - movq %r11, %r10 - andq %r12, %r10 - andq %rbp, %r11 - movq %rbx, %rax - orq %rcx, %rax - negq %rax - cmovbq %r10, %r14 - cmovbq %r11, %rsi - cmovbq %rbx, %r12 - cmovbq %rcx, %rbp - sbbq %r11, %r11 - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_alt_toploop - movq %r12, %rax - orq %rbp, %rax - bsrq %rax, %rcx - xorq $0x3f, %rcx - shldq %cl, %r14, %r12 - shldq %cl, %rsi, %rbp - movq (%r8), %rax - movq %rax, %r14 - movq (%r15), %rax - movq %rax, %rsi - movl $0x1, %r10d - movl $0x0, %r11d - movl $0x0, %ecx - movl $0x1, %edx - movl $0x3a, %r9d - movq %rdi, 0x8(%rsp) - movq %r13, 0x10(%rsp) - movq %r8, (%rsp) - movq %r15, 0x18(%rsp) -edwards25519_scalarmuldouble_alt_innerloop: + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, 0x20(%rsp) + movq %rcx, 0x28(%rsp) + movq %r8, 0x30(%rsp) + movq %r9, 0x38(%rsp) xorl %eax, %eax + movq %rax, 0x40(%rsp) + movq %rax, 0x48(%rsp) + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movabsq $0xa0f99e2375022099, %rax + movq %rax, 0x60(%rsp) + movabsq $0xa8c68f3f1d132595, %rax + movq %rax, 0x68(%rsp) + movabsq $0x6c6c893805ac5242, %rax + movq %rax, 0x70(%rsp) + movabsq $0x276508b241770615, %rax + movq %rax, 0x78(%rsp) + movq $0xa, 0x90(%rsp) + movq $0x1, 0x98(%rsp) + jmp curve25519_x25519_midloop +curve25519_x25519_inverseloop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq 
%r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0x80(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0x88(%rsp) xorl %ebx, %ebx - xorq %r8, %r8 - xorq %r15, %r15 - btq $0x0, %r14 - cmovbq %rbp, %rax - cmovbq %rsi, %rbx - cmovbq %rcx, %r8 - cmovbq %rdx, %r15 - movq %r14, %r13 - subq %rbx, %r14 - subq %r13, %rbx - movq %r12, %rdi - subq %rax, %rdi - cmovbq %r12, %rbp - leaq -0x1(%rdi), %r12 - cmovbq %rbx, %r14 - cmovbq %r13, %rsi - notq %r12 - cmovbq %r10, %rcx - cmovbq %r11, %rdx - cmovaeq %rdi, %r12 - shrq $1, %r14 - addq %r8, %r10 - addq %r15, %r11 - shrq $1, %r12 - addq %rcx, %rcx - addq %rdx, %rdx - decq %r9 - jne edwards25519_scalarmuldouble_alt_innerloop - movq 0x8(%rsp), %rdi - movq 0x10(%rsp), %r13 - movq (%rsp), %r8 - movq 0x18(%rsp), %r15 - movq %r10, (%rsp) - movq %r11, 0x8(%rsp) - movq %rcx, 0x10(%rsp) - movq %rdx, 0x18(%rsp) - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - xorq %r14, %r14 - xorq %rsi, %rsi - xorq %r10, %r10 - xorq %r11, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_congloop: - movq (%r8,%r9,8), %rcx movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r12 - movq 0x10(%rsp), %rax - mulq %rcx + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x20(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x20(%rsp), %rax + xorq %r15, %rax + mulq %r14 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %rbp - movq (%r15,%r9,8), %rcx + adcq %rdx, %rbp + xorl %ecx, %ecx movq 0x8(%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq %rdx, %r12 - shrdq $0x3a, %r14, %r10 - movq %r10, (%r8,%r9,8) - movq %r14, %r10 - movq %r12, %r14 + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x20(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x28(%rsp) movq 0x18(%rsp), %rax - mulq %rcx + xorq %r9, %rax + movq %rax, %rbp + sarq $0x3f, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 addq %rax, %rsi adcq %rdx, %rbp - shrdq $0x3a, %rsi, %r11 - movq %r11, (%r15,%r9,8) - movq %rsi, %r11 - movq %rbp, %rsi - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_congloop - shldq $0x6, %r10, %r14 - shldq $0x6, %r11, %rsi - movq 0x48(%rsp), %r15 - movq (%r8), %rbx - movq 0x28(%rsp), %r12 - imulq %rbx, %r12 - movq (%r15), %rax + movq 0x38(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 
+ addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + xorq %r13, %rax + movq %rax, %rsi + sarq $0x3f, %rsi + andq %r12, %rsi + negq %rsi mulq %r12 - addq %rbx, %rax - movq %rdx, %r10 - movl $0x1, %r9d - movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmuldouble_alt_wmontend -edwards25519_scalarmuldouble_alt_wmontloop: - adcq (%r8,%r9,8), %r10 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $0x3f, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x30(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x38(%rsp) + movq 0x80(%rsp), %rbx + movq 0x88(%rsp), %rbp + xorl %ecx, %ecx + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x40(%rsp), %rax + xorq %r13, %rax mulq %r12 - subq %rbx, %rdx - addq %r10, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r10 - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_alt_wmontloop -edwards25519_scalarmuldouble_alt_wmontend: - adcq %r14, %r10 - movq %r10, -0x8(%r8,%rdi,8) - sbbq %r10, %r10 - negq %r10 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_wcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_alt_wcmploop - sbbq $0x0, %r10 - sbbq %r10, %r10 - notq %r10 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_wcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r10, %rbx - negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_wcorrloop + movq %rbx, 0x40(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x60(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x60(%rsp) + xorl %ebx, %ebx + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x48(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x48(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x68(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x68(%rsp) + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x70(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x70(%rsp) + movq 0x58(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq %rdx, %rbx + shldq $0x1, %rcx, %rdx + sarq $0x3f, %rbx + addq %rbx, %rdx + movl $0x13, %eax + imulq %rdx movq 0x40(%rsp), %r8 - movq (%r8), %rbx - movq 0x28(%rsp), %rbp - imulq %rbx, %rbp - movq (%r15), %rax - mulq %rbp - addq %rbx, %rax - movq %rdx, %r11 - movl $0x1, %r9d - 
movq %rdi, %rcx - decq %rcx - je edwards25519_scalarmuldouble_alt_zmontend -edwards25519_scalarmuldouble_alt_zmontloop: - adcq (%r8,%r9,8), %r11 - sbbq %rbx, %rbx - movq (%r15,%r9,8), %rax - mulq %rbp - subq %rbx, %rdx - addq %r11, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %rdx, %r11 - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_alt_zmontloop -edwards25519_scalarmuldouble_alt_zmontend: - adcq %rsi, %r11 - movq %r11, -0x8(%r8,%rdi,8) - sbbq %r11, %r11 - negq %r11 - movq %rdi, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_zcmploop: - movq (%r8,%r9,8), %rax - sbbq (%r15,%r9,8), %rax - incq %r9 - decq %rcx - jne edwards25519_scalarmuldouble_alt_zcmploop - sbbq $0x0, %r11 - sbbq %r11, %r11 - notq %r11 - xorq %rcx, %rcx - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_zcorrloop: - movq (%r8,%r9,8), %rax - movq (%r15,%r9,8), %rbx - andq %r11, %rbx + addq %rax, %r8 + movq %r8, 0x40(%rsp) + movq 0x48(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x48(%rsp) + movq 0x50(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, 0x50(%rsp) + adcq %rbx, %rcx + shlq $0x3f, %rax + addq %rax, %rcx + movq 0x58(%rsp), %rax + movq %rcx, 0x58(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx negq %rcx - sbbq %rbx, %rax - sbbq %rcx, %rcx - movq %rax, (%r8,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_zcorrloop - movq 0x30(%rsp), %r8 - leaq (%r8,%rdi,8), %r15 - xorq %r9, %r9 - xorq %r12, %r12 - xorq %r14, %r14 - xorq %rbp, %rbp - xorq %rsi, %rsi -edwards25519_scalarmuldouble_alt_crossloop: - movq (%r8,%r9,8), %rcx - movq (%rsp), %rax - mulq %rcx - addq %rax, %r14 - adcq $0x0, %rdx - movq %rdx, %r10 - movq 0x10(%rsp), %rax - mulq %rcx + mulq %r12 addq %rax, %rsi - adcq $0x0, %rdx - movq %rdx, %r11 - movq (%r15,%r9,8), %rcx - movq 0x8(%rsp), %rax - mulq %rcx - subq %r12, %rdx - subq %rax, %r14 - sbbq %rdx, %r10 - sbbq %r12, %r12 - movq %r14, (%r8,%r9,8) - movq %r10, %r14 - movq 0x18(%rsp), %rax - mulq %rcx - subq %rbp, %rdx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rdx, %rcx + shldq $0x1, %rsi, %rdx + sarq $0x3f, %rcx + movl $0x13, %eax + addq %rcx, %rdx + imulq %rdx + movq 0x60(%rsp), %r8 + addq %rax, %r8 + movq %r8, 0x60(%rsp) + movq 0x68(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, 0x68(%rsp) + movq 0x70(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, 0x70(%rsp) + adcq %rcx, %rsi + shlq $0x3f, %rax + addq %rax, %rsi + movq %rsi, 0x78(%rsp) +curve25519_x25519_midloop: + movq 0x98(%rsp), %rsi + movq (%rsp), %rdx + movq 0x20(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 subq %rax, %rsi - sbbq %rdx, %r11 - sbbq %rbp, %rbp - movq %rsi, (%r15,%r9,8) - movq %r11, %rsi - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_alt_crossloop - xorq %r9, %r9 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r12, %r14 - xorq %rbp, %rsi -edwards25519_scalarmuldouble_alt_optnegloop: - movq (%r8,%r9,8), %rax - xorq %r12, %rax - negq %r10 - adcq $0x0, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rax - xorq %rbp, %rax - negq %r11 - adcq $0x0, 
%rax - sbbq %r11, %r11 - movq %rax, (%r15,%r9,8) - incq %r9 - cmpq %r13, %r9 - jb edwards25519_scalarmuldouble_alt_optnegloop - subq %r10, %r14 - subq %r11, %rsi - movq %r13, %r9 -edwards25519_scalarmuldouble_alt_shiftloop: - movq -0x8(%r8,%r9,8), %rax - movq %rax, %r10 - shrdq $0x3a, %r14, %rax - movq %rax, -0x8(%r8,%r9,8) - movq %r10, %r14 - movq -0x8(%r15,%r9,8), %rax - movq %rax, %r11 - shrdq $0x3a, %rsi, %rax - movq %rax, -0x8(%r15,%r9,8) - movq %r11, %rsi - decq %r9 - jne edwards25519_scalarmuldouble_alt_shiftloop - notq %rbp - movq 0x48(%rsp), %rcx - movq 0x38(%rsp), %r8 - movq 0x40(%rsp), %r15 - movq %r12, %r10 - movq %rbp, %r11 - xorq %r9, %r9 -edwards25519_scalarmuldouble_alt_fliploop: - movq %rbp, %rdx - movq (%rcx,%r9,8), %rax - andq %rax, %rdx - andq %r12, %rax - movq (%r8,%r9,8), %rbx - xorq %r12, %rbx - negq %r10 - adcq %rbx, %rax - sbbq %r10, %r10 - movq %rax, (%r8,%r9,8) - movq (%r15,%r9,8), %rbx - xorq %rbp, %rbx - negq %r11 - adcq %rbx, %rdx - sbbq %r11, %r11 - movq %rdx, (%r15,%r9,8) - incq %r9 - cmpq %rdi, %r9 - jb edwards25519_scalarmuldouble_alt_fliploop - subq $0x3a, 0x20(%rsp) - ja edwards25519_scalarmuldouble_alt_outerloop + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, 
%r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xa0(%rsp) + movq %rbx, 0xa8(%rsp) + movq %rdi, 0xb0(%rsp) + movq %rcx, 0xb8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x20(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq 
%rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq 
%rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xa0(%rsp), %rax + imulq %r8, %rax + movq 0xb0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xa8(%rsp), %r8 + imulq 0xb8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xa0(%rsp), %rax + imulq %r10, %rax + movq 0xb0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xa8(%rsp), %r10 + imulq 0xb8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, 
%rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0x98(%rsp) + decq 0x90(%rsp) + jne curve25519_x25519_inverseloop + movq (%rsp), %rax + movq 
0x20(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x60(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x48(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x68(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x70(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x58(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x78(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r9, %rax + shldq $0x1, %r15, %rax + sarq $0x3f, %r9 + movl $0x13, %ebx + leaq 0x1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $0x3f, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + movq 0xc0(%rsp), %rdi + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) // Store result movq res, %rdi - leaq 96(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC(%rsp), %rsi + leaq TAB(%rsp), %rbp mul_p25519(x_0,x_1,x_2) movq res, %rdi addq $32, %rdi - leaq 128(%rsp), %rsi - leaq 224(%rsp), %rbp + leaq ACC+32(%rsp), %rsi + leaq TAB(%rsp), %rbp mul_p25519(x_0,x_1,x_2) // Restore stack and registers @@ -2645,14 +3606,6 @@ edwards25519_scalarmuldouble_alt_pepadd: // .section .rodata // **************************************************************************** -// The modulus p_25519 = 2^255 - 19, for the modular inverse - -edwards25519_scalarmuldouble_alt_p25519: - .quad 0xffffffffffffffed - .quad 0xffffffffffffffff - .quad 0xffffffffffffffff - .quad 0x7fffffffffffffff - // Precomputed table of multiples of generator for edwards25519 // all in precomputed extended-projective (y-x,x+y,2*d*x*y) triples. From b5e5b8c709eb8b7c35f92d2ab957727df3be7e6b Mon Sep 17 00:00:00 2001 From: John Harrison Date: Tue, 14 Nov 2023 13:47:29 -0800 Subject: [PATCH 07/24] Improve integer operation support in BOUNDER_RULE and BOUNDER_TAC In general, BOUNDER_RULE now directly handles operations over Z and N, assuming an outer real_of_int / real_of_num cast into R (this is also automated in the tactic form BOUNDER_TAC). In particular, this change can greatly improve bounds for terms involving integer or natural number division and remainder (DIV, div, MOD and rem) as well as cutoff subtraction over N. There is also now support for conditionals, though the condition is not used as extra context, simply being the basis for a case split. This update rolls in various trivial typographic fixes in comments. 
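To illustrate the kind of bound propagation this enables, here is a sketch of the standard interval facts over N that such a bounder can exploit once DIV, MOD and cutoff subtraction are in scope (these are ordinary arithmetic inequalities, not necessarily the exact theorems the rule applies internally). Given natural-number bounds l <= m <= u, l' <= m' <= u' and 1 <= a <= n:

    0 <= m MOD n <= n - 1
    m DIV n <= u DIV a
    l - u' <= m - m' <= u - l'        (with "-" denoting cutoff subtraction over N)

After the outer real_of_num cast these carry over directly to R, so for instance a subterm like &(x MOD 16) can be bounded above by &15 (a purely hypothetical example, not taken from the changed proofs).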
s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/ccefa2a7109a9a784e5fea00da1817832dd28f73 --- arm/curve25519/edwards25519_scalarmulbase.S | 2 +- arm/curve25519/edwards25519_scalarmulbase_alt.S | 2 +- arm/curve25519/edwards25519_scalarmuldouble.S | 2 +- arm/curve25519/edwards25519_scalarmuldouble_alt.S | 2 +- arm/generic/bignum_copy_row_from_table_8n_neon.S | 2 +- arm/p384/Makefile | 2 +- arm/p521/Makefile | 2 +- x86_att/curve25519/edwards25519_scalarmulbase.S | 2 +- x86_att/curve25519/edwards25519_scalarmulbase_alt.S | 2 +- x86_att/curve25519/edwards25519_scalarmuldouble.S | 2 +- x86_att/curve25519/edwards25519_scalarmuldouble_alt.S | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arm/curve25519/edwards25519_scalarmulbase.S b/arm/curve25519/edwards25519_scalarmulbase.S index 8c9d0f9193..89e98494ac 100644 --- a/arm/curve25519/edwards25519_scalarmulbase.S +++ b/arm/curve25519/edwards25519_scalarmulbase.S @@ -577,7 +577,7 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase): // (X,Y,Z,T), representing an affine point on the edwards25519 curve // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z). // In comments B means the standard basepoint (x,4/5) = -// (0x216....f25d51a,0x0x6666..666658). +// (0x216....f25d51a,0x6666..666658). // // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. diff --git a/arm/curve25519/edwards25519_scalarmulbase_alt.S b/arm/curve25519/edwards25519_scalarmulbase_alt.S index 03e5598f2c..e89d58b378 100644 --- a/arm/curve25519/edwards25519_scalarmulbase_alt.S +++ b/arm/curve25519/edwards25519_scalarmulbase_alt.S @@ -419,7 +419,7 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt): // (X,Y,Z,T), representing an affine point on the edwards25519 curve // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z). // In comments B means the standard basepoint (x,4/5) = -// (0x216....f25d51a,0x0x6666..666658). +// (0x216....f25d51a,0x6666..666658). // // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. diff --git a/arm/curve25519/edwards25519_scalarmuldouble.S b/arm/curve25519/edwards25519_scalarmuldouble.S index 00ea37eaaf..d6fc9121f9 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble.S +++ b/arm/curve25519/edwards25519_scalarmuldouble.S @@ -1514,7 +1514,7 @@ edwards25519_scalarmuldouble_loop: // form amounts to swapping the first two fields and negating the third. // The negation does not always fully reduce even mod 2^256-38 in the zero // case, instead giving -0 = 2^256-38. But that is fine since the result is -// always fed to a multipliction inside the "pepadd" function below that +// always fed to a multiplication inside the "pepadd" function below that // handles any 256-bit input. cmp cf, xzr diff --git a/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/arm/curve25519/edwards25519_scalarmuldouble_alt.S index ad05eae1fb..54cebef997 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/arm/curve25519/edwards25519_scalarmuldouble_alt.S @@ -1298,7 +1298,7 @@ edwards25519_scalarmuldouble_alt_loop: // form amounts to swapping the first two fields and negating the third. // The negation does not always fully reduce even mod 2^256-38 in the zero // case, instead giving -0 = 2^256-38. 
But that is fine since the result is -// always fed to a multipliction inside the "pepadd" function below that +// always fed to a multiplication inside the "pepadd" function below that // handles any 256-bit input. cmp cf, xzr diff --git a/arm/generic/bignum_copy_row_from_table_8n_neon.S b/arm/generic/bignum_copy_row_from_table_8n_neon.S index 80db20d6b6..e17ebceeff 100644 --- a/arm/generic/bignum_copy_row_from_table_8n_neon.S +++ b/arm/generic/bignum_copy_row_from_table_8n_neon.S @@ -3,7 +3,7 @@ // ---------------------------------------------------------------------------- // Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] -// into z[0..width-1]. width must be a mutiple of 8. +// into z[0..width-1]. width must be a multiple of 8. // This function is constant-time with respect to the value of `idx`. This is // achieved by reading the whole table and using the bit-masking to get the // `idx`-th row. diff --git a/arm/p384/Makefile b/arm/p384/Makefile index d3feb070c7..2390e53e44 100644 --- a/arm/p384/Makefile +++ b/arm/p384/Makefile @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 OR ISC ############################################################################# -# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise # use a cross-assembling version so that the code can still be assembled # and the proofs checked against the object files (though you won't be able # to run code without additional emulation infrastructure). The aarch64 diff --git a/arm/p521/Makefile b/arm/p521/Makefile index b8ad763c35..9121b81013 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 OR ISC ############################################################################# -# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise # use a cross-assembling version so that the code can still be assembled # and the proofs checked against the object files (though you won't be able # to run code without additional emulation infrastructure). The aarch64 diff --git a/x86_att/curve25519/edwards25519_scalarmulbase.S b/x86_att/curve25519/edwards25519_scalarmulbase.S index c44e31724c..950b8dc649 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase.S @@ -431,7 +431,7 @@ edwards25519_scalarmulbase_standard: // (X,Y,Z,T), representing an affine point on the edwards25519 curve // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z). // In comments B means the standard basepoint (x,4/5) = -// (0x216....f25d51a,0x0x6666..666658). +// (0x216....f25d51a,0x6666..666658). // // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. diff --git a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S index 00b91fe1aa..db7fa574b5 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S @@ -507,7 +507,7 @@ edwards25519_scalarmulbase_alt_standard: // (X,Y,Z,T), representing an affine point on the edwards25519 curve // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z). // In comments B means the standard basepoint (x,4/5) = -// (0x216....f25d51a,0x0x6666..666658). 
+// (0x216....f25d51a,0x6666..666658). // // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble.S b/x86_att/curve25519/edwards25519_scalarmuldouble.S index 35fd7f4ffc..91a27e8cf3 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -1528,7 +1528,7 @@ edwards25519_scalarmuldouble_loop: // form amounts to swapping the first two fields and negating the third. // The negation does not always fully reduce even mod 2^256-38 in the zero // case, instead giving -0 = 2^256-38. But that is fine since the result is -// always fed to a multipliction inside the "pepadd" function below that +// always fed to a multiplication inside the "pepadd" function below that // handles any 256-bit input. movq cf, %rdi diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index e17d10b47a..42380c036c 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -1645,7 +1645,7 @@ edwards25519_scalarmuldouble_alt_loop: // form amounts to swapping the first two fields and negating the third. // The negation does not always fully reduce even mod 2^256-38 in the zero // case, instead giving -0 = 2^256-38. But that is fine since the result is -// always fed to a multipliction inside the "pepadd" function below that +// always fed to a multiplication inside the "pepadd" function below that // handles any 256-bit input. movq cf, %rdi From 16015b4515fbc1368728a00706d573ebc8d5ecd5 Mon Sep 17 00:00:00 2001 From: Torben Hansen <50673096+torben-hansen@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:18:45 -0800 Subject: [PATCH 08/24] Avoid duplicate labels in ed25519 x86 implementation s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/e6ef86f7ebee4db5dccb351ff7ef7729de6dea42 --- x86_att/curve25519/edwards25519_scalarmuldouble.S | 8 ++++---- x86_att/curve25519/edwards25519_scalarmuldouble_alt.S | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble.S b/x86_att/curve25519/edwards25519_scalarmuldouble.S index 35fd7f4ffc..7e5fd2b41c 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -2072,8 +2072,8 @@ edwards25519_scalarmuldouble_loop: movq %rax, 0x78(%rsp) movq $0xa, 0x90(%rsp) movq $0x1, 0x98(%rsp) - jmp curve25519_x25519_midloop -curve25519_x25519_inverseloop: + jmp edwards25519_scalarmuldouble_midloop +edwards25519_scalarmuldouble_inverseloop: movq %r8, %r9 sarq $0x3f, %r9 xorq %r9, %r8 @@ -2364,7 +2364,7 @@ curve25519_x25519_inverseloop: shlq $0x3f, %rax addq %rax, %rsi movq %rsi, 0x78(%rsp) -curve25519_x25519_midloop: +edwards25519_scalarmuldouble_midloop: movq 0x98(%rsp), %rsi movq (%rsp), %rdx movq 0x20(%rsp), %rcx @@ -3265,7 +3265,7 @@ curve25519_x25519_midloop: leaq (%rax,%rdx), %r12 movq %rsi, 0x98(%rsp) decq 0x90(%rsp) - jne curve25519_x25519_inverseloop + jne edwards25519_scalarmuldouble_inverseloop movq (%rsp), %rax movq 0x20(%rsp), %rcx imulq %r8, %rax diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index e17d10b47a..4cd5d1e63f 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ 
b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -2189,8 +2189,8 @@ edwards25519_scalarmuldouble_alt_loop: movq %rax, 0x78(%rsp) movq $0xa, 0x90(%rsp) movq $0x1, 0x98(%rsp) - jmp curve25519_x25519_midloop -curve25519_x25519_inverseloop: + jmp edwards25519_scalarmuldouble_alt_midloop +edwards25519_scalarmuldouble_alt_inverseloop: movq %r8, %r9 sarq $0x3f, %r9 xorq %r9, %r8 @@ -2481,7 +2481,7 @@ curve25519_x25519_inverseloop: shlq $0x3f, %rax addq %rax, %rsi movq %rsi, 0x78(%rsp) -curve25519_x25519_midloop: +edwards25519_scalarmuldouble_alt_midloop: movq 0x98(%rsp), %rsi movq (%rsp), %rdx movq 0x20(%rsp), %rcx @@ -3382,7 +3382,7 @@ curve25519_x25519_midloop: leaq (%rax,%rdx), %r12 movq %rsi, 0x98(%rsp) decq 0x90(%rsp) - jne curve25519_x25519_inverseloop + jne edwards25519_scalarmuldouble_alt_inverseloop movq (%rsp), %rax movq 0x20(%rsp), %rcx imulq %r8, %rax From 0694a87f5711cf3db0f12b06474dff23982be6e1 Mon Sep 17 00:00:00 2001 From: Torben Hansen <50673096+torben-hansen@users.noreply.github.com> Date: Mon, 27 Nov 2023 10:50:22 -0800 Subject: [PATCH 09/24] Make parameter to ed25519 decode function const s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/d93670d3d7ddb23cd4059ac519038e7796cd3d45 --- arm/curve25519/edwards25519_decode.S | 2 +- arm/curve25519/edwards25519_decode_alt.S | 2 +- x86_att/curve25519/edwards25519_decode.S | 2 +- x86_att/curve25519/edwards25519_decode_alt.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arm/curve25519/edwards25519_decode.S b/arm/curve25519/edwards25519_decode.S index 9161768db7..653689be94 100644 --- a/arm/curve25519/edwards25519_decode.S +++ b/arm/curve25519/edwards25519_decode.S @@ -5,7 +5,7 @@ // Decode compressed 256-bit form of edwards25519 point // Input c[32] (bytes); output function return and z[8] // -// extern uint64_t edwards25519_decode(uint64_t z[static 8],uint8_t c[static 32]); +// extern uint64_t edwards25519_decode(uint64_t z[static 8], const uint8_t c[static 32]); // // This interprets the input byte string as a little-endian number // representing a point (x,y) on the edwards25519 curve, encoded as diff --git a/arm/curve25519/edwards25519_decode_alt.S b/arm/curve25519/edwards25519_decode_alt.S index c77a191744..a8e842f15a 100644 --- a/arm/curve25519/edwards25519_decode_alt.S +++ b/arm/curve25519/edwards25519_decode_alt.S @@ -5,7 +5,7 @@ // Decode compressed 256-bit form of edwards25519 point // Input c[32] (bytes); output function return and z[8] // -// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8],uint8_t c[static 32]); +// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8], const uint8_t c[static 32]); // // This interprets the input byte string as a little-endian number // representing a point (x,y) on the edwards25519 curve, encoded as diff --git a/x86_att/curve25519/edwards25519_decode.S b/x86_att/curve25519/edwards25519_decode.S index 05681925a3..24431ef564 100644 --- a/x86_att/curve25519/edwards25519_decode.S +++ b/x86_att/curve25519/edwards25519_decode.S @@ -5,7 +5,7 @@ // Decode compressed 256-bit form of edwards25519 point // Input c[32] (bytes); output function return and z[8] // -// extern uint64_t edwards25519_decode(uint64_t z[static 8],uint8_t c[static 32]); +// extern uint64_t edwards25519_decode(uint64_t z[static 8], const uint8_t c[static 32]); // // This interprets the input byte string as a little-endian number // representing a point (x,y) on the edwards25519 curve, encoded as diff --git a/x86_att/curve25519/edwards25519_decode_alt.S 
b/x86_att/curve25519/edwards25519_decode_alt.S index 570b2f9081..c7854380e1 100644 --- a/x86_att/curve25519/edwards25519_decode_alt.S +++ b/x86_att/curve25519/edwards25519_decode_alt.S @@ -5,7 +5,7 @@ // Decode compressed 256-bit form of edwards25519 point // Input c[32] (bytes); output function return and z[8] // -// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8],uint8_t c[static 32]); +// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8], const uint8_t c[static 32]); // // This interprets the input byte string as a little-endian number // representing a point (x,y) on the edwards25519 curve, encoded as From ce316eb71c952561c98a6c20e1095f17a02aeebd Mon Sep 17 00:00:00 2001 From: jargh <78765052+jargh@users.noreply.github.com> Date: Tue, 23 Jan 2024 12:20:41 -0800 Subject: [PATCH 10/24] Allow MIT-0 license as well as Apache-2.0 and ISC (#104) * Allow MIT-0 license as well as Apache-2.0 and ISC * Add appropriate year range to MIT-0 license s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/48fb153e097894a90f84defe913fd1a572cb7900 --- arm/curve25519/bignum_mod_n25519.S | 2 +- arm/curve25519/bignum_neg_p25519.S | 2 +- arm/curve25519/curve25519_x25519.S | 2 +- arm/curve25519/curve25519_x25519_alt.S | 2 +- arm/curve25519/curve25519_x25519_byte.S | 2 +- arm/curve25519/curve25519_x25519_byte_alt.S | 2 +- arm/curve25519/curve25519_x25519base.S | 2 +- arm/curve25519/curve25519_x25519base_alt.S | 2 +- arm/curve25519/curve25519_x25519base_byte.S | 2 +- arm/curve25519/curve25519_x25519base_byte_alt.S | 2 +- arm/curve25519/edwards25519_decode.S | 2 +- arm/curve25519/edwards25519_decode_alt.S | 2 +- arm/curve25519/edwards25519_encode.S | 2 +- arm/curve25519/edwards25519_scalarmulbase.S | 2 +- arm/curve25519/edwards25519_scalarmulbase_alt.S | 2 +- arm/curve25519/edwards25519_scalarmuldouble.S | 2 +- arm/curve25519/edwards25519_scalarmuldouble_alt.S | 2 +- arm/fastmul/bignum_emontredc_8n.S | 2 +- arm/fastmul/bignum_kmul_16_32.S | 2 +- arm/fastmul/bignum_kmul_32_64.S | 2 +- arm/fastmul/bignum_ksqr_16_32.S | 2 +- arm/fastmul/bignum_ksqr_32_64.S | 2 +- arm/generic/bignum_copy_row_from_table.S | 2 +- arm/generic/bignum_copy_row_from_table_8n_neon.S | 2 +- arm/generic/bignum_ge.S | 2 +- arm/generic/bignum_mul.S | 2 +- arm/generic/bignum_optsub.S | 2 +- arm/generic/bignum_sqr.S | 2 +- arm/p384/Makefile | 2 +- arm/p384/bignum_add_p384.S | 2 +- arm/p384/bignum_bigendian_6.S | 2 +- arm/p384/bignum_cmul_p384.S | 2 +- arm/p384/bignum_deamont_p384.S | 2 +- arm/p384/bignum_demont_p384.S | 2 +- arm/p384/bignum_double_p384.S | 2 +- arm/p384/bignum_half_p384.S | 2 +- arm/p384/bignum_littleendian_6.S | 2 +- arm/p384/bignum_mod_n384.S | 2 +- arm/p384/bignum_mod_n384_6.S | 2 +- arm/p384/bignum_mod_p384.S | 2 +- arm/p384/bignum_mod_p384_6.S | 2 +- arm/p384/bignum_montmul_p384.S | 2 +- arm/p384/bignum_montmul_p384_alt.S | 2 +- arm/p384/bignum_montsqr_p384.S | 2 +- arm/p384/bignum_montsqr_p384_alt.S | 2 +- arm/p384/bignum_mux_6.S | 2 +- arm/p384/bignum_neg_p384.S | 2 +- arm/p384/bignum_nonzero_6.S | 2 +- arm/p384/bignum_optneg_p384.S | 2 +- arm/p384/bignum_sub_p384.S | 2 +- arm/p384/bignum_tomont_p384.S | 2 +- arm/p384/bignum_triple_p384.S | 2 +- arm/p384/p384_montjadd.S | 2 +- arm/p384/p384_montjdouble.S | 2 +- arm/p384/p384_montjmixadd.S | 2 +- arm/p521/Makefile | 2 +- arm/p521/bignum_add_p521.S | 2 +- arm/p521/bignum_cmul_p521.S | 2 +- arm/p521/bignum_deamont_p521.S | 2 +- arm/p521/bignum_demont_p521.S | 2 +- arm/p521/bignum_double_p521.S | 2 +- 
arm/p521/bignum_fromlebytes_p521.S | 2 +- arm/p521/bignum_half_p521.S | 2 +- arm/p521/bignum_mod_n521_9.S | 2 +- arm/p521/bignum_mod_p521_9.S | 2 +- arm/p521/bignum_montmul_p521.S | 2 +- arm/p521/bignum_montmul_p521_alt.S | 2 +- arm/p521/bignum_montsqr_p521.S | 2 +- arm/p521/bignum_montsqr_p521_alt.S | 2 +- arm/p521/bignum_mul_p521.S | 2 +- arm/p521/bignum_mul_p521_alt.S | 2 +- arm/p521/bignum_neg_p521.S | 2 +- arm/p521/bignum_optneg_p521.S | 2 +- arm/p521/bignum_sqr_p521.S | 2 +- arm/p521/bignum_sqr_p521_alt.S | 2 +- arm/p521/bignum_sub_p521.S | 2 +- arm/p521/bignum_tolebytes_p521.S | 2 +- arm/p521/bignum_tomont_p521.S | 2 +- arm/p521/bignum_triple_p521.S | 2 +- arm/p521/p521_jadd.S | 2 +- arm/p521/p521_jdouble.S | 2 +- arm/p521/p521_jmixadd.S | 2 +- x86_att/curve25519/bignum_mod_n25519.S | 2 +- x86_att/curve25519/bignum_neg_p25519.S | 2 +- x86_att/curve25519/curve25519_x25519.S | 2 +- x86_att/curve25519/curve25519_x25519_alt.S | 2 +- x86_att/curve25519/curve25519_x25519base.S | 2 +- x86_att/curve25519/curve25519_x25519base_alt.S | 2 +- x86_att/curve25519/edwards25519_decode.S | 2 +- x86_att/curve25519/edwards25519_decode_alt.S | 2 +- x86_att/curve25519/edwards25519_encode.S | 2 +- x86_att/curve25519/edwards25519_scalarmulbase.S | 2 +- x86_att/curve25519/edwards25519_scalarmulbase_alt.S | 2 +- x86_att/curve25519/edwards25519_scalarmuldouble.S | 2 +- x86_att/curve25519/edwards25519_scalarmuldouble_alt.S | 2 +- x86_att/p384/bignum_add_p384.S | 2 +- x86_att/p384/bignum_bigendian_6.S | 2 +- x86_att/p384/bignum_cmul_p384.S | 2 +- x86_att/p384/bignum_cmul_p384_alt.S | 2 +- x86_att/p384/bignum_deamont_p384.S | 2 +- x86_att/p384/bignum_deamont_p384_alt.S | 2 +- x86_att/p384/bignum_demont_p384.S | 2 +- x86_att/p384/bignum_demont_p384_alt.S | 2 +- x86_att/p384/bignum_double_p384.S | 2 +- x86_att/p384/bignum_half_p384.S | 2 +- x86_att/p384/bignum_littleendian_6.S | 2 +- x86_att/p384/bignum_mod_n384.S | 2 +- x86_att/p384/bignum_mod_n384_6.S | 2 +- x86_att/p384/bignum_mod_n384_alt.S | 2 +- x86_att/p384/bignum_mod_p384.S | 2 +- x86_att/p384/bignum_mod_p384_6.S | 2 +- x86_att/p384/bignum_mod_p384_alt.S | 2 +- x86_att/p384/bignum_montmul_p384.S | 2 +- x86_att/p384/bignum_montmul_p384_alt.S | 2 +- x86_att/p384/bignum_montsqr_p384.S | 2 +- x86_att/p384/bignum_montsqr_p384_alt.S | 2 +- x86_att/p384/bignum_mux_6.S | 2 +- x86_att/p384/bignum_neg_p384.S | 2 +- x86_att/p384/bignum_nonzero_6.S | 2 +- x86_att/p384/bignum_optneg_p384.S | 2 +- x86_att/p384/bignum_sub_p384.S | 2 +- x86_att/p384/bignum_tomont_p384.S | 2 +- x86_att/p384/bignum_tomont_p384_alt.S | 2 +- x86_att/p384/bignum_triple_p384.S | 2 +- x86_att/p384/bignum_triple_p384_alt.S | 2 +- x86_att/p384/p384_montjadd.S | 2 +- x86_att/p384/p384_montjdouble.S | 2 +- x86_att/p384/p384_montjmixadd.S | 2 +- x86_att/p521/bignum_add_p521.S | 2 +- x86_att/p521/bignum_cmul_p521.S | 2 +- x86_att/p521/bignum_cmul_p521_alt.S | 2 +- x86_att/p521/bignum_deamont_p521.S | 2 +- x86_att/p521/bignum_demont_p521.S | 2 +- x86_att/p521/bignum_double_p521.S | 2 +- x86_att/p521/bignum_fromlebytes_p521.S | 2 +- x86_att/p521/bignum_half_p521.S | 2 +- x86_att/p521/bignum_mod_n521_9.S | 2 +- x86_att/p521/bignum_mod_n521_9_alt.S | 2 +- x86_att/p521/bignum_mod_p521_9.S | 2 +- x86_att/p521/bignum_montmul_p521.S | 2 +- x86_att/p521/bignum_montmul_p521_alt.S | 2 +- x86_att/p521/bignum_montsqr_p521.S | 2 +- x86_att/p521/bignum_montsqr_p521_alt.S | 2 +- x86_att/p521/bignum_mul_p521.S | 2 +- x86_att/p521/bignum_mul_p521_alt.S | 2 +- x86_att/p521/bignum_neg_p521.S | 2 +- 
x86_att/p521/bignum_optneg_p521.S | 2 +- x86_att/p521/bignum_sqr_p521.S | 2 +- x86_att/p521/bignum_sqr_p521_alt.S | 2 +- x86_att/p521/bignum_sub_p521.S | 2 +- x86_att/p521/bignum_tolebytes_p521.S | 2 +- x86_att/p521/bignum_tomont_p521.S | 2 +- x86_att/p521/bignum_triple_p521.S | 2 +- x86_att/p521/bignum_triple_p521_alt.S | 2 +- x86_att/p521/p521_jadd.S | 2 +- x86_att/p521/p521_jdouble.S | 2 +- x86_att/p521/p521_jmixadd.S | 2 +- 157 files changed, 157 insertions(+), 157 deletions(-) diff --git a/arm/curve25519/bignum_mod_n25519.S b/arm/curve25519/bignum_mod_n25519.S index 5a256ed133..3f8a94c9bb 100644 --- a/arm/curve25519/bignum_mod_n25519.S +++ b/arm/curve25519/bignum_mod_n25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo basepoint order, z := x mod n_25519 diff --git a/arm/curve25519/bignum_neg_p25519.S b/arm/curve25519/bignum_neg_p25519.S index 8466df43c1..e3e85b4ecf 100644 --- a/arm/curve25519/bignum_neg_p25519.S +++ b/arm/curve25519/bignum_neg_p25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S index 7514dac33a..5aaaaa0f5a 100644 --- a/arm/curve25519/curve25519_x25519.S +++ b/arm/curve25519/curve25519_x25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 diff --git a/arm/curve25519/curve25519_x25519_alt.S b/arm/curve25519/curve25519_x25519_alt.S index 261b82c90a..82de375b14 100644 --- a/arm/curve25519/curve25519_x25519_alt.S +++ b/arm/curve25519/curve25519_x25519_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index 7837118421..3e3c03371d 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 (byte array arguments) diff --git a/arm/curve25519/curve25519_x25519_byte_alt.S b/arm/curve25519/curve25519_x25519_byte_alt.S index 6523822d2c..790cb2b030 100644 --- a/arm/curve25519/curve25519_x25519_byte_alt.S +++ b/arm/curve25519/curve25519_x25519_byte_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 (byte array arguments) diff --git a/arm/curve25519/curve25519_x25519base.S b/arm/curve25519/curve25519_x25519base.S index b9c3b8e34a..ef46f7b169 100644 --- a/arm/curve25519/curve25519_x25519base.S +++ b/arm/curve25519/curve25519_x25519base.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 diff --git a/arm/curve25519/curve25519_x25519base_alt.S b/arm/curve25519/curve25519_x25519base_alt.S index 22de69f4c3..702fe6e88a 100644 --- a/arm/curve25519/curve25519_x25519base_alt.S +++ b/arm/curve25519/curve25519_x25519base_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 diff --git a/arm/curve25519/curve25519_x25519base_byte.S b/arm/curve25519/curve25519_x25519base_byte.S index aecc693c66..635729cb77 100644 --- a/arm/curve25519/curve25519_x25519base_byte.S +++ b/arm/curve25519/curve25519_x25519base_byte.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 (byte array arguments) diff --git a/arm/curve25519/curve25519_x25519base_byte_alt.S b/arm/curve25519/curve25519_x25519base_byte_alt.S index 9c9dca518c..39b6bfd172 100644 --- a/arm/curve25519/curve25519_x25519base_byte_alt.S +++ b/arm/curve25519/curve25519_x25519base_byte_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 (byte array arguments) diff --git a/arm/curve25519/edwards25519_decode.S b/arm/curve25519/edwards25519_decode.S index 653689be94..f565df90fd 100644 --- a/arm/curve25519/edwards25519_decode.S +++ b/arm/curve25519/edwards25519_decode.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Decode compressed 256-bit form of edwards25519 point diff --git a/arm/curve25519/edwards25519_decode_alt.S b/arm/curve25519/edwards25519_decode_alt.S index a8e842f15a..befacd2ff0 100644 --- a/arm/curve25519/edwards25519_decode_alt.S +++ b/arm/curve25519/edwards25519_decode_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Decode compressed 256-bit form of edwards25519 point diff --git a/arm/curve25519/edwards25519_encode.S b/arm/curve25519/edwards25519_encode.S index 4cf301a227..c0f2e3fc9e 100644 --- a/arm/curve25519/edwards25519_encode.S +++ b/arm/curve25519/edwards25519_encode.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Encode edwards25519 point into compressed form as 256-bit number diff --git a/arm/curve25519/edwards25519_scalarmulbase.S b/arm/curve25519/edwards25519_scalarmulbase.S index 89e98494ac..e00aa7e278 100644 --- a/arm/curve25519/edwards25519_scalarmulbase.S +++ b/arm/curve25519/edwards25519_scalarmulbase.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Scalar multiplication for the edwards25519 standard basepoint diff --git a/arm/curve25519/edwards25519_scalarmulbase_alt.S b/arm/curve25519/edwards25519_scalarmulbase_alt.S index e89d58b378..2ffc7799ed 100644 --- a/arm/curve25519/edwards25519_scalarmulbase_alt.S +++ b/arm/curve25519/edwards25519_scalarmulbase_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Scalar multiplication for the edwards25519 standard basepoint diff --git a/arm/curve25519/edwards25519_scalarmuldouble.S b/arm/curve25519/edwards25519_scalarmuldouble.S index d6fc9121f9..d8c6e21c6e 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble.S +++ b/arm/curve25519/edwards25519_scalarmuldouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double scalar multiplication for edwards25519, fresh and base point diff --git a/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/arm/curve25519/edwards25519_scalarmuldouble_alt.S index 54cebef997..9c3d6db2cb 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/arm/curve25519/edwards25519_scalarmuldouble_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double scalar multiplication for edwards25519, fresh and base point diff --git a/arm/fastmul/bignum_emontredc_8n.S b/arm/fastmul/bignum_emontredc_8n.S index 0876ddea8b..081f5de362 100644 --- a/arm/fastmul/bignum_emontredc_8n.S +++ b/arm/fastmul/bignum_emontredc_8n.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer diff --git a/arm/fastmul/bignum_kmul_16_32.S b/arm/fastmul/bignum_kmul_16_32.S index 2367b69891..e45dd487e1 100644 --- a/arm/fastmul/bignum_kmul_16_32.S +++ b/arm/fastmul/bignum_kmul_16_32.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply z := x * y diff --git a/arm/fastmul/bignum_kmul_32_64.S b/arm/fastmul/bignum_kmul_32_64.S index 467d298697..e45249462a 100644 --- a/arm/fastmul/bignum_kmul_32_64.S +++ b/arm/fastmul/bignum_kmul_32_64.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply z := x * y diff --git a/arm/fastmul/bignum_ksqr_16_32.S b/arm/fastmul/bignum_ksqr_16_32.S index bb62a9c0ca..7be2ac6c45 100644 --- a/arm/fastmul/bignum_ksqr_16_32.S +++ b/arm/fastmul/bignum_ksqr_16_32.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square, z := x^2 diff --git a/arm/fastmul/bignum_ksqr_32_64.S b/arm/fastmul/bignum_ksqr_32_64.S index fbd3c47bec..659e00a791 100644 --- a/arm/fastmul/bignum_ksqr_32_64.S +++ b/arm/fastmul/bignum_ksqr_32_64.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square, z := x^2 diff --git a/arm/generic/bignum_copy_row_from_table.S b/arm/generic/bignum_copy_row_from_table.S index ba3e48d061..514df68262 100644 --- a/arm/generic/bignum_copy_row_from_table.S +++ b/arm/generic/bignum_copy_row_from_table.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] diff --git a/arm/generic/bignum_copy_row_from_table_8n_neon.S b/arm/generic/bignum_copy_row_from_table_8n_neon.S index e17ebceeff..b065a70525 100644 --- a/arm/generic/bignum_copy_row_from_table_8n_neon.S +++ b/arm/generic/bignum_copy_row_from_table_8n_neon.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] diff --git a/arm/generic/bignum_ge.S b/arm/generic/bignum_ge.S index a646b47d43..5ba0b8eda9 100644 --- a/arm/generic/bignum_ge.S +++ b/arm/generic/bignum_ge.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Compare bignums, x >= y diff --git a/arm/generic/bignum_mul.S b/arm/generic/bignum_mul.S index 1da4bf9516..f02665c36b 100644 --- a/arm/generic/bignum_mul.S +++ b/arm/generic/bignum_mul.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply z := x * y diff --git a/arm/generic/bignum_optsub.S b/arm/generic/bignum_optsub.S index 285536ef74..e696198fc4 100644 --- a/arm/generic/bignum_optsub.S +++ b/arm/generic/bignum_optsub.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) diff --git a/arm/generic/bignum_sqr.S b/arm/generic/bignum_sqr.S index 1a75dbddbb..2305cce102 100644 --- a/arm/generic/bignum_sqr.S +++ b/arm/generic/bignum_sqr.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square z := x^2 diff --git a/arm/p384/Makefile b/arm/p384/Makefile index 2390e53e44..564b9dd93c 100644 --- a/arm/p384/Makefile +++ b/arm/p384/Makefile @@ -1,6 +1,6 @@ ############################################################################# # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 OR ISC +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 ############################################################################# # If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise diff --git a/arm/p384/bignum_add_p384.S b/arm/p384/bignum_add_p384.S index 00c8e81d31..ad7f2c6b7b 100644 --- a/arm/p384/bignum_add_p384.S +++ b/arm/p384/bignum_add_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced diff --git a/arm/p384/bignum_bigendian_6.S b/arm/p384/bignum_bigendian_6.S index 664ae845dd..cb103d691c 100644 --- a/arm/p384/bignum_bigendian_6.S +++ b/arm/p384/bignum_bigendian_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 6-digit (384-bit) bignum to/from big-endian form diff --git a/arm/p384/bignum_cmul_p384.S b/arm/p384/bignum_cmul_p384.S index b9570c7998..74f648b4c5 100644 --- a/arm/p384/bignum_cmul_p384.S +++ b/arm/p384/bignum_cmul_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming diff --git a/arm/p384/bignum_deamont_p384.S b/arm/p384/bignum_deamont_p384.S index 91ea265a97..1f84a4becf 100644 --- a/arm/p384/bignum_deamont_p384.S +++ b/arm/p384/bignum_deamont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 diff --git a/arm/p384/bignum_demont_p384.S b/arm/p384/bignum_demont_p384.S index c0dd331d64..1b09517288 100644 --- a/arm/p384/bignum_demont_p384.S +++ b/arm/p384/bignum_demont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced diff --git a/arm/p384/bignum_double_p384.S b/arm/p384/bignum_double_p384.S index fce40a0ff1..07b1a57f20 100644 --- a/arm/p384/bignum_double_p384.S +++ b/arm/p384/bignum_double_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced diff --git a/arm/p384/bignum_half_p384.S b/arm/p384/bignum_half_p384.S index e3a7ff0e77..c023542b1b 100644 --- a/arm/p384/bignum_half_p384.S +++ b/arm/p384/bignum_half_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced diff --git a/arm/p384/bignum_littleendian_6.S b/arm/p384/bignum_littleendian_6.S index 66b0424a51..f325456298 100644 --- a/arm/p384/bignum_littleendian_6.S +++ b/arm/p384/bignum_littleendian_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 6-digit (384-bit) bignum to/from little-endian form diff --git a/arm/p384/bignum_mod_n384.S b/arm/p384/bignum_mod_n384.S index e8de84d4cb..a91bb2c5b5 100644 --- a/arm/p384/bignum_mod_n384.S +++ b/arm/p384/bignum_mod_n384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/arm/p384/bignum_mod_n384_6.S b/arm/p384/bignum_mod_n384_6.S index c382e642ca..e79ad3fe85 100644 --- a/arm/p384/bignum_mod_n384_6.S +++ b/arm/p384/bignum_mod_n384_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/arm/p384/bignum_mod_p384.S b/arm/p384/bignum_mod_p384.S index c2ab35526f..cf7f1d6bbb 100644 --- a/arm/p384/bignum_mod_p384.S +++ b/arm/p384/bignum_mod_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/arm/p384/bignum_mod_p384_6.S b/arm/p384/bignum_mod_p384_6.S index a1ac615b1a..959dc86239 100644 --- a/arm/p384/bignum_mod_p384_6.S +++ b/arm/p384/bignum_mod_p384_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/arm/p384/bignum_montmul_p384.S b/arm/p384/bignum_montmul_p384.S index 554081f39e..05c3d1786a 100644 --- a/arm/p384/bignum_montmul_p384.S +++ b/arm/p384/bignum_montmul_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^384) mod p_384 diff --git a/arm/p384/bignum_montmul_p384_alt.S b/arm/p384/bignum_montmul_p384_alt.S index 2bd28cfffa..a6464f07cc 100644 --- a/arm/p384/bignum_montmul_p384_alt.S +++ b/arm/p384/bignum_montmul_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^384) mod p_384 diff --git a/arm/p384/bignum_montsqr_p384.S b/arm/p384/bignum_montsqr_p384.S index 1067bf1a78..fd55c1bf02 100644 --- a/arm/p384/bignum_montsqr_p384.S +++ b/arm/p384/bignum_montsqr_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^384) mod p_384 diff --git a/arm/p384/bignum_montsqr_p384_alt.S b/arm/p384/bignum_montsqr_p384_alt.S index e4fe2f7f5b..f49830d21e 100644 --- a/arm/p384/bignum_montsqr_p384_alt.S +++ b/arm/p384/bignum_montsqr_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^384) mod p_384 diff --git a/arm/p384/bignum_mux_6.S b/arm/p384/bignum_mux_6.S index b4c966609f..21d1769949 100644 --- a/arm/p384/bignum_mux_6.S +++ b/arm/p384/bignum_mux_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) diff --git a/arm/p384/bignum_neg_p384.S b/arm/p384/bignum_neg_p384.S index 24bdbb1b23..186d50e881 100644 --- a/arm/p384/bignum_neg_p384.S +++ b/arm/p384/bignum_neg_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_384, z := (-x) mod p_384, assuming x reduced diff --git a/arm/p384/bignum_nonzero_6.S b/arm/p384/bignum_nonzero_6.S index ae003186b8..b98fe9d863 100644 --- a/arm/p384/bignum_nonzero_6.S +++ b/arm/p384/bignum_nonzero_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // 384-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero diff --git a/arm/p384/bignum_optneg_p384.S b/arm/p384/bignum_optneg_p384.S index 7b5e704348..325fccbcf4 100644 --- a/arm/p384/bignum_optneg_p384.S +++ b/arm/p384/bignum_optneg_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or diff --git a/arm/p384/bignum_sub_p384.S b/arm/p384/bignum_sub_p384.S index bd7a9deeff..1e5085628b 100644 --- a/arm/p384/bignum_sub_p384.S +++ b/arm/p384/bignum_sub_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Subtract modulo p_384, z := (x - y) mod p_384 diff --git a/arm/p384/bignum_tomont_p384.S b/arm/p384/bignum_tomont_p384.S index efed55f8c0..c666f5e78f 100644 --- a/arm/p384/bignum_tomont_p384.S +++ b/arm/p384/bignum_tomont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^384 * x) mod p_384 diff --git a/arm/p384/bignum_triple_p384.S b/arm/p384/bignum_triple_p384.S index cc641a2eeb..d129b8712f 100644 --- a/arm/p384/bignum_triple_p384.S +++ b/arm/p384/bignum_triple_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_384, z := (3 * x) mod p_384 diff --git a/arm/p384/p384_montjadd.S b/arm/p384/p384_montjadd.S index 98f40b0a80..9c0e1ecb99 100644 --- a/arm/p384/p384_montjadd.S +++ b/arm/p384/p384_montjadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/arm/p384/p384_montjdouble.S b/arm/p384/p384_montjdouble.S index 5b4a609b59..7dfd9766f2 100644 --- a/arm/p384/p384_montjdouble.S +++ b/arm/p384/p384_montjdouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/arm/p384/p384_montjmixadd.S b/arm/p384/p384_montjmixadd.S index 0f5c24203f..1b0165ab8c 100644 --- a/arm/p384/p384_montjmixadd.S +++ b/arm/p384/p384_montjmixadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/arm/p521/Makefile b/arm/p521/Makefile index 9121b81013..ae0d4f8d70 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -1,6 +1,6 @@ ############################################################################# # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 OR ISC +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 ############################################################################# # If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise diff --git a/arm/p521/bignum_add_p521.S b/arm/p521/bignum_add_p521.S index d9d59bbd48..248db96ef2 100644 --- a/arm/p521/bignum_add_p521.S +++ b/arm/p521/bignum_add_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced diff --git a/arm/p521/bignum_cmul_p521.S b/arm/p521/bignum_cmul_p521.S index 0b657b8b73..00f9cf0be5 100644 --- a/arm/p521/bignum_cmul_p521.S +++ b/arm/p521/bignum_cmul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming diff --git a/arm/p521/bignum_deamont_p521.S b/arm/p521/bignum_deamont_p521.S index 442e5d4048..83849147f8 100644 --- a/arm/p521/bignum_deamont_p521.S +++ b/arm/p521/bignum_deamont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^576) mod p_521 diff --git a/arm/p521/bignum_demont_p521.S b/arm/p521/bignum_demont_p521.S index d3004ec580..1b48113e01 100644 --- a/arm/p521/bignum_demont_p521.S +++ b/arm/p521/bignum_demont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_double_p521.S b/arm/p521/bignum_double_p521.S index 8d0e291120..ecfdcf2f74 100644 --- a/arm/p521/bignum_double_p521.S +++ b/arm/p521/bignum_double_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_fromlebytes_p521.S b/arm/p521/bignum_fromlebytes_p521.S index 7a87ed3338..fd0d8ca362 100644 --- a/arm/p521/bignum_fromlebytes_p521.S +++ b/arm/p521/bignum_fromlebytes_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert little-endian bytes to 9-digit 528-bit bignum diff --git a/arm/p521/bignum_half_p521.S b/arm/p521/bignum_half_p521.S index 1f8da155ba..757156b266 100644 --- a/arm/p521/bignum_half_p521.S +++ b/arm/p521/bignum_half_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_mod_n521_9.S b/arm/p521/bignum_mod_n521_9.S index 65bc4f08bb..d680e5f1db 100644 --- a/arm/p521/bignum_mod_n521_9.S +++ b/arm/p521/bignum_mod_n521_9.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_521 diff --git a/arm/p521/bignum_mod_p521_9.S b/arm/p521/bignum_mod_p521_9.S index 874e9df091..56385905ac 100644 --- a/arm/p521/bignum_mod_p521_9.S +++ b/arm/p521/bignum_mod_p521_9.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_521 diff --git a/arm/p521/bignum_montmul_p521.S b/arm/p521/bignum_montmul_p521.S index c0ac8cf926..e1ea8dc0c2 100644 --- a/arm/p521/bignum_montmul_p521.S +++ b/arm/p521/bignum_montmul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^576) mod p_521 diff --git a/arm/p521/bignum_montmul_p521_alt.S b/arm/p521/bignum_montmul_p521_alt.S index 6b0afeac1d..8c302ce1f8 100644 --- a/arm/p521/bignum_montmul_p521_alt.S +++ b/arm/p521/bignum_montmul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^576) mod p_521 diff --git a/arm/p521/bignum_montsqr_p521.S b/arm/p521/bignum_montsqr_p521.S index 45e57a666e..2c8dbd789f 100644 --- a/arm/p521/bignum_montsqr_p521.S +++ b/arm/p521/bignum_montsqr_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^576) mod p_521 diff --git a/arm/p521/bignum_montsqr_p521_alt.S b/arm/p521/bignum_montsqr_p521_alt.S index 1ae774f0d3..1376cf8eb7 100644 --- a/arm/p521/bignum_montsqr_p521_alt.S +++ b/arm/p521/bignum_montsqr_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^576) mod p_521 diff --git a/arm/p521/bignum_mul_p521.S b/arm/p521/bignum_mul_p521.S index 12594faf9a..97859d6bbe 100644 --- a/arm/p521/bignum_mul_p521.S +++ b/arm/p521/bignum_mul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced diff --git a/arm/p521/bignum_mul_p521_alt.S b/arm/p521/bignum_mul_p521_alt.S index d0c2cdb0e6..ea39156aaa 100644 --- a/arm/p521/bignum_mul_p521_alt.S +++ b/arm/p521/bignum_mul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced diff --git a/arm/p521/bignum_neg_p521.S b/arm/p521/bignum_neg_p521.S index cdf7a9641c..488f3660b0 100644 --- a/arm/p521/bignum_neg_p521.S +++ b/arm/p521/bignum_neg_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_521, z := (-x) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_optneg_p521.S b/arm/p521/bignum_optneg_p521.S index 74fac18e5a..8c5dfda4db 100644 --- a/arm/p521/bignum_optneg_p521.S +++ b/arm/p521/bignum_optneg_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or diff --git a/arm/p521/bignum_sqr_p521.S b/arm/p521/bignum_sqr_p521.S index 23f8a3b9b2..404665258c 100644 --- a/arm/p521/bignum_sqr_p521.S +++ b/arm/p521/bignum_sqr_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_sqr_p521_alt.S b/arm/p521/bignum_sqr_p521_alt.S index 7837b23a3d..439dd2e7e6 100644 --- a/arm/p521/bignum_sqr_p521_alt.S +++ b/arm/p521/bignum_sqr_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_sub_p521.S b/arm/p521/bignum_sub_p521.S index 4cc4e830b5..8ff430d500 100644 --- a/arm/p521/bignum_sub_p521.S +++ b/arm/p521/bignum_sub_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Subtract modulo p_521, z := (x - y) mod p_521 diff --git a/arm/p521/bignum_tolebytes_p521.S b/arm/p521/bignum_tolebytes_p521.S index 403f8fbd64..b1c4b3eaf1 100644 --- a/arm/p521/bignum_tolebytes_p521.S +++ b/arm/p521/bignum_tolebytes_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 9-digit 528-bit bignum to little-endian bytes diff --git a/arm/p521/bignum_tomont_p521.S b/arm/p521/bignum_tomont_p521.S index 833c07b847..c94cd12ca0 100644 --- a/arm/p521/bignum_tomont_p521.S +++ b/arm/p521/bignum_tomont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^576 * x) mod p_521 diff --git a/arm/p521/bignum_triple_p521.S b/arm/p521/bignum_triple_p521.S index 7ce5d00915..961df99351 100644 --- a/arm/p521/bignum_triple_p521.S +++ b/arm/p521/bignum_triple_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced diff --git a/arm/p521/p521_jadd.S b/arm/p521/p521_jadd.S index 928d7ea6cc..1d6b196c8c 100644 --- a/arm/p521/p521_jadd.S +++ b/arm/p521/p521_jadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point addition on NIST curve P-521 in Jacobian coordinates diff --git a/arm/p521/p521_jdouble.S b/arm/p521/p521_jdouble.S index 6794e4cd92..100f6d3e87 100644 --- a/arm/p521/p521_jdouble.S +++ b/arm/p521/p521_jdouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point doubling on NIST curve P-521 in Jacobian coordinates diff --git a/arm/p521/p521_jmixadd.S b/arm/p521/p521_jmixadd.S index cd27d24eb8..c9b62a9aa1 100644 --- a/arm/p521/p521_jmixadd.S +++ b/arm/p521/p521_jmixadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point mixed addition on NIST curve P-521 in Jacobian coordinates diff --git a/x86_att/curve25519/bignum_mod_n25519.S b/x86_att/curve25519/bignum_mod_n25519.S index c45d99b541..52f8bfdd57 100644 --- a/x86_att/curve25519/bignum_mod_n25519.S +++ b/x86_att/curve25519/bignum_mod_n25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo basepoint order, z := x mod n_25519 diff --git a/x86_att/curve25519/bignum_neg_p25519.S b/x86_att/curve25519/bignum_neg_p25519.S index 02d01b1241..5e66073baf 100644 --- a/x86_att/curve25519/bignum_neg_p25519.S +++ b/x86_att/curve25519/bignum_neg_p25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced diff --git a/x86_att/curve25519/curve25519_x25519.S b/x86_att/curve25519/curve25519_x25519.S index b46c522b36..87e5e9cf62 100644 --- a/x86_att/curve25519/curve25519_x25519.S +++ b/x86_att/curve25519/curve25519_x25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 diff --git a/x86_att/curve25519/curve25519_x25519_alt.S b/x86_att/curve25519/curve25519_x25519_alt.S index dd644dbba9..4a63a55f11 100644 --- a/x86_att/curve25519/curve25519_x25519_alt.S +++ b/x86_att/curve25519/curve25519_x25519_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 diff --git a/x86_att/curve25519/curve25519_x25519base.S b/x86_att/curve25519/curve25519_x25519base.S index e450656861..dda3b1707b 100644 --- a/x86_att/curve25519/curve25519_x25519base.S +++ b/x86_att/curve25519/curve25519_x25519base.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 diff --git a/x86_att/curve25519/curve25519_x25519base_alt.S b/x86_att/curve25519/curve25519_x25519base_alt.S index b1275e2084..b6c82faba0 100644 --- a/x86_att/curve25519/curve25519_x25519base_alt.S +++ b/x86_att/curve25519/curve25519_x25519base_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 diff --git a/x86_att/curve25519/edwards25519_decode.S b/x86_att/curve25519/edwards25519_decode.S index 24431ef564..ae63e0dacb 100644 --- a/x86_att/curve25519/edwards25519_decode.S +++ b/x86_att/curve25519/edwards25519_decode.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Decode compressed 256-bit form of edwards25519 point diff --git a/x86_att/curve25519/edwards25519_decode_alt.S b/x86_att/curve25519/edwards25519_decode_alt.S index c7854380e1..8bfe721253 100644 --- a/x86_att/curve25519/edwards25519_decode_alt.S +++ b/x86_att/curve25519/edwards25519_decode_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Decode compressed 256-bit form of edwards25519 point diff --git a/x86_att/curve25519/edwards25519_encode.S b/x86_att/curve25519/edwards25519_encode.S index bdbaa47232..13b0102d09 100644 --- a/x86_att/curve25519/edwards25519_encode.S +++ b/x86_att/curve25519/edwards25519_encode.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Encode edwards25519 point into compressed form as 256-bit number diff --git a/x86_att/curve25519/edwards25519_scalarmulbase.S b/x86_att/curve25519/edwards25519_scalarmulbase.S index 950b8dc649..6b2a80c728 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Scalar multiplication for the edwards25519 standard basepoint diff --git a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S index db7fa574b5..4796e72189 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Scalar multiplication for the edwards25519 standard basepoint diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble.S b/x86_att/curve25519/edwards25519_scalarmuldouble.S index eabdcd461b..993c420e05 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double scalar multiplication for edwards25519, fresh and base point diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index b285d57ff5..e7c8f7a59d 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double scalar multiplication for edwards25519, fresh and base point diff --git a/x86_att/p384/bignum_add_p384.S b/x86_att/p384/bignum_add_p384.S index b0a3c9c517..94293e4e70 100644 --- a/x86_att/p384/bignum_add_p384.S +++ b/x86_att/p384/bignum_add_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced diff --git a/x86_att/p384/bignum_bigendian_6.S b/x86_att/p384/bignum_bigendian_6.S index 7fa59c536e..0a23e35659 100644 --- a/x86_att/p384/bignum_bigendian_6.S +++ b/x86_att/p384/bignum_bigendian_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 6-digit (384-bit) bignum to/from big-endian form diff --git a/x86_att/p384/bignum_cmul_p384.S b/x86_att/p384/bignum_cmul_p384.S index 6632a9ae7e..76f6795087 100644 --- a/x86_att/p384/bignum_cmul_p384.S +++ b/x86_att/p384/bignum_cmul_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming diff --git a/x86_att/p384/bignum_cmul_p384_alt.S b/x86_att/p384/bignum_cmul_p384_alt.S index c91629cd30..2e21e64615 100644 --- a/x86_att/p384/bignum_cmul_p384_alt.S +++ b/x86_att/p384/bignum_cmul_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming diff --git a/x86_att/p384/bignum_deamont_p384.S b/x86_att/p384/bignum_deamont_p384.S index 6b7daea25e..9edb4ab610 100644 --- a/x86_att/p384/bignum_deamont_p384.S +++ b/x86_att/p384/bignum_deamont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_deamont_p384_alt.S b/x86_att/p384/bignum_deamont_p384_alt.S index 918a104f63..c0e6096bdd 100644 --- a/x86_att/p384/bignum_deamont_p384_alt.S +++ b/x86_att/p384/bignum_deamont_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_demont_p384.S b/x86_att/p384/bignum_demont_p384.S index 3dc1d734c4..36a5ef0078 100644 --- a/x86_att/p384/bignum_demont_p384.S +++ b/x86_att/p384/bignum_demont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_demont_p384_alt.S b/x86_att/p384/bignum_demont_p384_alt.S index d2dca9c4f2..adccd962e7 100644 --- a/x86_att/p384/bignum_demont_p384_alt.S +++ b/x86_att/p384/bignum_demont_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_double_p384.S b/x86_att/p384/bignum_double_p384.S index c06b218889..7e0c35dab3 100644 --- a/x86_att/p384/bignum_double_p384.S +++ b/x86_att/p384/bignum_double_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_half_p384.S b/x86_att/p384/bignum_half_p384.S index 51afea03bb..a3e3954173 100644 --- a/x86_att/p384/bignum_half_p384.S +++ b/x86_att/p384/bignum_half_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_littleendian_6.S b/x86_att/p384/bignum_littleendian_6.S index a0eef1f00f..fe5744a86e 100644 --- a/x86_att/p384/bignum_littleendian_6.S +++ b/x86_att/p384/bignum_littleendian_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 6-digit (384-bit) bignum to/from little-endian form diff --git a/x86_att/p384/bignum_mod_n384.S b/x86_att/p384/bignum_mod_n384.S index 963873f72e..169a136ea3 100644 --- a/x86_att/p384/bignum_mod_n384.S +++ b/x86_att/p384/bignum_mod_n384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/x86_att/p384/bignum_mod_n384_6.S b/x86_att/p384/bignum_mod_n384_6.S index 273bce8b33..6b68c2a444 100644 --- a/x86_att/p384/bignum_mod_n384_6.S +++ b/x86_att/p384/bignum_mod_n384_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/x86_att/p384/bignum_mod_n384_alt.S b/x86_att/p384/bignum_mod_n384_alt.S index ffd9c9d1b9..92282a83a7 100644 --- a/x86_att/p384/bignum_mod_n384_alt.S +++ b/x86_att/p384/bignum_mod_n384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/x86_att/p384/bignum_mod_p384.S b/x86_att/p384/bignum_mod_p384.S index 10414fea41..c9caf41c83 100644 --- a/x86_att/p384/bignum_mod_p384.S +++ b/x86_att/p384/bignum_mod_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/x86_att/p384/bignum_mod_p384_6.S b/x86_att/p384/bignum_mod_p384_6.S index 08381a6c1e..7196a76f31 100644 --- a/x86_att/p384/bignum_mod_p384_6.S +++ b/x86_att/p384/bignum_mod_p384_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/x86_att/p384/bignum_mod_p384_alt.S b/x86_att/p384/bignum_mod_p384_alt.S index 689f1d340c..79da7842a6 100644 --- a/x86_att/p384/bignum_mod_p384_alt.S +++ b/x86_att/p384/bignum_mod_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/x86_att/p384/bignum_montmul_p384.S b/x86_att/p384/bignum_montmul_p384.S index 718991aac1..105efac610 100644 --- a/x86_att/p384/bignum_montmul_p384.S +++ b/x86_att/p384/bignum_montmul_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_montmul_p384_alt.S b/x86_att/p384/bignum_montmul_p384_alt.S index 3da172840e..5a8b4905d9 100644 --- a/x86_att/p384/bignum_montmul_p384_alt.S +++ b/x86_att/p384/bignum_montmul_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_montsqr_p384.S b/x86_att/p384/bignum_montsqr_p384.S index f8b4230b7e..0d0b36013a 100644 --- a/x86_att/p384/bignum_montsqr_p384.S +++ b/x86_att/p384/bignum_montsqr_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_montsqr_p384_alt.S b/x86_att/p384/bignum_montsqr_p384_alt.S index e04807766c..061ef6181d 100644 --- a/x86_att/p384/bignum_montsqr_p384_alt.S +++ b/x86_att/p384/bignum_montsqr_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_mux_6.S b/x86_att/p384/bignum_mux_6.S index 5277428379..cb4c2ca503 100644 --- a/x86_att/p384/bignum_mux_6.S +++ b/x86_att/p384/bignum_mux_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) diff --git a/x86_att/p384/bignum_neg_p384.S b/x86_att/p384/bignum_neg_p384.S index 51b0f41bb1..746c01286a 100644 --- a/x86_att/p384/bignum_neg_p384.S +++ b/x86_att/p384/bignum_neg_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_384, z := (-x) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_nonzero_6.S b/x86_att/p384/bignum_nonzero_6.S index 8e17207d4a..7fdb6bab06 100644 --- a/x86_att/p384/bignum_nonzero_6.S +++ b/x86_att/p384/bignum_nonzero_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // 384-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero diff --git a/x86_att/p384/bignum_optneg_p384.S b/x86_att/p384/bignum_optneg_p384.S index cee7be2f3c..0a8b247e5d 100644 --- a/x86_att/p384/bignum_optneg_p384.S +++ b/x86_att/p384/bignum_optneg_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or diff --git a/x86_att/p384/bignum_sub_p384.S b/x86_att/p384/bignum_sub_p384.S index 8d4ae986a2..5914f4ae9c 100644 --- a/x86_att/p384/bignum_sub_p384.S +++ b/x86_att/p384/bignum_sub_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Subtract modulo p_384, z := (x - y) mod p_384 diff --git a/x86_att/p384/bignum_tomont_p384.S b/x86_att/p384/bignum_tomont_p384.S index 70463c73a6..66503a2ec4 100644 --- a/x86_att/p384/bignum_tomont_p384.S +++ b/x86_att/p384/bignum_tomont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^384 * x) mod p_384 diff --git a/x86_att/p384/bignum_tomont_p384_alt.S b/x86_att/p384/bignum_tomont_p384_alt.S index 75ba90d7f7..725713d341 100644 --- a/x86_att/p384/bignum_tomont_p384_alt.S +++ b/x86_att/p384/bignum_tomont_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^384 * x) mod p_384 diff --git a/x86_att/p384/bignum_triple_p384.S b/x86_att/p384/bignum_triple_p384.S index 2d3ae66bf7..52b70f6bea 100644 --- a/x86_att/p384/bignum_triple_p384.S +++ b/x86_att/p384/bignum_triple_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_384, z := (3 * x) mod p_384 diff --git a/x86_att/p384/bignum_triple_p384_alt.S b/x86_att/p384/bignum_triple_p384_alt.S index 91efffbe1e..bdbf7e8f6d 100644 --- a/x86_att/p384/bignum_triple_p384_alt.S +++ b/x86_att/p384/bignum_triple_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_384, z := (3 * x) mod p_384 diff --git a/x86_att/p384/p384_montjadd.S b/x86_att/p384/p384_montjadd.S index 52b86b2063..27b58bfc14 100644 --- a/x86_att/p384/p384_montjadd.S +++ b/x86_att/p384/p384_montjadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/x86_att/p384/p384_montjdouble.S b/x86_att/p384/p384_montjdouble.S index 80e0b6cc88..b51d24f931 100644 --- a/x86_att/p384/p384_montjdouble.S +++ b/x86_att/p384/p384_montjdouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/x86_att/p384/p384_montjmixadd.S b/x86_att/p384/p384_montjmixadd.S index 8a8c17c1a0..0d456464b9 100644 --- a/x86_att/p384/p384_montjmixadd.S +++ b/x86_att/p384/p384_montjmixadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/x86_att/p521/bignum_add_p521.S b/x86_att/p521/bignum_add_p521.S index 849a740971..b046828d45 100644 --- a/x86_att/p521/bignum_add_p521.S +++ b/x86_att/p521/bignum_add_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced diff --git a/x86_att/p521/bignum_cmul_p521.S b/x86_att/p521/bignum_cmul_p521.S index 7898293c6a..fbfc3063fd 100644 --- a/x86_att/p521/bignum_cmul_p521.S +++ b/x86_att/p521/bignum_cmul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming diff --git a/x86_att/p521/bignum_cmul_p521_alt.S b/x86_att/p521/bignum_cmul_p521_alt.S index c5f79a8189..fd6986f232 100644 --- a/x86_att/p521/bignum_cmul_p521_alt.S +++ b/x86_att/p521/bignum_cmul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming diff --git a/x86_att/p521/bignum_deamont_p521.S b/x86_att/p521/bignum_deamont_p521.S index d916da1f95..099c0e33fc 100644 --- a/x86_att/p521/bignum_deamont_p521.S +++ b/x86_att/p521/bignum_deamont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_demont_p521.S b/x86_att/p521/bignum_demont_p521.S index 182360406a..ef83448b15 100644 --- a/x86_att/p521/bignum_demont_p521.S +++ b/x86_att/p521/bignum_demont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_double_p521.S b/x86_att/p521/bignum_double_p521.S index f3923d82ce..9322ec0b1a 100644 --- a/x86_att/p521/bignum_double_p521.S +++ b/x86_att/p521/bignum_double_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_fromlebytes_p521.S b/x86_att/p521/bignum_fromlebytes_p521.S index a5c9f491d9..6a80dce3c2 100644 --- a/x86_att/p521/bignum_fromlebytes_p521.S +++ b/x86_att/p521/bignum_fromlebytes_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert little-endian bytes to 9-digit 528-bit bignum diff --git a/x86_att/p521/bignum_half_p521.S b/x86_att/p521/bignum_half_p521.S index 9023beb032..ee8b91a325 100644 --- a/x86_att/p521/bignum_half_p521.S +++ b/x86_att/p521/bignum_half_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_mod_n521_9.S b/x86_att/p521/bignum_mod_n521_9.S index 9dcc73d15f..c7e33f88fd 100644 --- a/x86_att/p521/bignum_mod_n521_9.S +++ b/x86_att/p521/bignum_mod_n521_9.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_521 diff --git a/x86_att/p521/bignum_mod_n521_9_alt.S b/x86_att/p521/bignum_mod_n521_9_alt.S index 026a97e451..aeb314691a 100644 --- a/x86_att/p521/bignum_mod_n521_9_alt.S +++ b/x86_att/p521/bignum_mod_n521_9_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_521 diff --git a/x86_att/p521/bignum_mod_p521_9.S b/x86_att/p521/bignum_mod_p521_9.S index 0f2e4267f4..0d67aa3ee2 100644 --- a/x86_att/p521/bignum_mod_p521_9.S +++ b/x86_att/p521/bignum_mod_p521_9.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_521 diff --git a/x86_att/p521/bignum_montmul_p521.S b/x86_att/p521/bignum_montmul_p521.S index 3ee202d458..21d777a655 100644 --- a/x86_att/p521/bignum_montmul_p521.S +++ b/x86_att/p521/bignum_montmul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_montmul_p521_alt.S b/x86_att/p521/bignum_montmul_p521_alt.S index dcef877ffd..b3d0d7c2c6 100644 --- a/x86_att/p521/bignum_montmul_p521_alt.S +++ b/x86_att/p521/bignum_montmul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_montsqr_p521.S b/x86_att/p521/bignum_montsqr_p521.S index 91cb9c318d..ede53c627c 100644 --- a/x86_att/p521/bignum_montsqr_p521.S +++ b/x86_att/p521/bignum_montsqr_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_montsqr_p521_alt.S b/x86_att/p521/bignum_montsqr_p521_alt.S index ad071a453b..dccdc33ef5 100644 --- a/x86_att/p521/bignum_montsqr_p521_alt.S +++ b/x86_att/p521/bignum_montsqr_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_mul_p521.S b/x86_att/p521/bignum_mul_p521.S index 25073f9daf..f96e8417ab 100644 --- a/x86_att/p521/bignum_mul_p521.S +++ b/x86_att/p521/bignum_mul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced diff --git a/x86_att/p521/bignum_mul_p521_alt.S b/x86_att/p521/bignum_mul_p521_alt.S index 3224a86634..f87546928a 100644 --- a/x86_att/p521/bignum_mul_p521_alt.S +++ b/x86_att/p521/bignum_mul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced diff --git a/x86_att/p521/bignum_neg_p521.S b/x86_att/p521/bignum_neg_p521.S index 484c1fca56..9a130b0b30 100644 --- a/x86_att/p521/bignum_neg_p521.S +++ b/x86_att/p521/bignum_neg_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_521, z := (-x) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_optneg_p521.S b/x86_att/p521/bignum_optneg_p521.S index d2434adb4c..8f4c740b6b 100644 --- a/x86_att/p521/bignum_optneg_p521.S +++ b/x86_att/p521/bignum_optneg_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or diff --git a/x86_att/p521/bignum_sqr_p521.S b/x86_att/p521/bignum_sqr_p521.S index b9a718cf9b..4b4748f106 100644 --- a/x86_att/p521/bignum_sqr_p521.S +++ b/x86_att/p521/bignum_sqr_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_sqr_p521_alt.S b/x86_att/p521/bignum_sqr_p521_alt.S index 58f496e3f0..475d3d3c81 100644 --- a/x86_att/p521/bignum_sqr_p521_alt.S +++ b/x86_att/p521/bignum_sqr_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_sub_p521.S b/x86_att/p521/bignum_sub_p521.S index 99e0d96cd1..03db019833 100644 --- a/x86_att/p521/bignum_sub_p521.S +++ b/x86_att/p521/bignum_sub_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Subtract modulo p_521, z := (x - y) mod p_521 diff --git a/x86_att/p521/bignum_tolebytes_p521.S b/x86_att/p521/bignum_tolebytes_p521.S index c5ea2ed539..7f89172569 100644 --- a/x86_att/p521/bignum_tolebytes_p521.S +++ b/x86_att/p521/bignum_tolebytes_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 9-digit 528-bit bignum to little-endian bytes diff --git a/x86_att/p521/bignum_tomont_p521.S b/x86_att/p521/bignum_tomont_p521.S index a97beaccb1..39983c24ba 100644 --- a/x86_att/p521/bignum_tomont_p521.S +++ b/x86_att/p521/bignum_tomont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^576 * x) mod p_521 diff --git a/x86_att/p521/bignum_triple_p521.S b/x86_att/p521/bignum_triple_p521.S index 6703a9cb22..264481ef18 100644 --- a/x86_att/p521/bignum_triple_p521.S +++ b/x86_att/p521/bignum_triple_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_triple_p521_alt.S b/x86_att/p521/bignum_triple_p521_alt.S index 4598d9db87..ecd0798778 100644 --- a/x86_att/p521/bignum_triple_p521_alt.S +++ b/x86_att/p521/bignum_triple_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced diff --git a/x86_att/p521/p521_jadd.S b/x86_att/p521/p521_jadd.S index 256ba845c4..807a7c5472 100644 --- a/x86_att/p521/p521_jadd.S +++ b/x86_att/p521/p521_jadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point addition on NIST curve P-521 in Jacobian coordinates diff --git a/x86_att/p521/p521_jdouble.S b/x86_att/p521/p521_jdouble.S index fd2a57bbc9..22ccbebd43 100644 --- a/x86_att/p521/p521_jdouble.S +++ b/x86_att/p521/p521_jdouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point doubling on NIST curve P-521 in Jacobian coordinates diff --git a/x86_att/p521/p521_jmixadd.S b/x86_att/p521/p521_jmixadd.S index 7054905371..702b63f560 100644 --- a/x86_att/p521/p521_jmixadd.S +++ b/x86_att/p521/p521_jmixadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point mixed addition on NIST curve P-521 in Jacobian coordinates From e64899bec30a9a7fe47eb124641cca40df6d4f0a Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 8 Feb 2024 16:00:10 -0800 Subject: [PATCH 11/24] Switch non-alt ARM X25519 to unsaturated code following Lenngren This completely changes the implementation of ARM curve25519_x25519 and curve25519_x25519_byte (not the _alt forms, which remain faster on their target microarchitectures) to a base-25.5 unsaturated version with interleaved integer and SIMD operations, the inner loop closely following Emil Lenngren's implementation described in the paper https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf and available here: https://github.com/Emill/X25519-AArch64 A version of this code was generated by SLOTHY from the reorganized implementation by Abdulrahman, Becker, Kannwischer and Klein here: https://github.com/slothy-optimizer/slothy/blob/main/paper/clean/neon/X25519-AArch64-simple.s as described in the associated paper https://eprint.iacr.org/2022/1303.pdf with some additional annotations for use in the formal proof. The final modular inverse computation reverts to the usual saturated representation and s2n-bignum's divstep-based inverse function. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/fc0b9bf7fb558ea49718317eb0623184d60b6fd6 --- arm/curve25519/curve25519_x25519.S | 2312 +++++++++++++-------- arm/curve25519/curve25519_x25519_byte.S | 2440 ++++++++++++++--------- 2 files changed, 3039 insertions(+), 1713 deletions(-) diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S index 5aaaaa0f5a..28dd2f696a 100644 --- a/arm/curve25519/curve25519_x25519.S +++ b/arm/curve25519/curve25519_x25519.S @@ -1,6 +1,18 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +// ********************************************************************** +// This code is substantially derived from Emil Lenngren's implementation +// +// https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf +// https://github.com/Emill/X25519-AArch64 +// +// and the SLOTHY-based re-engineering of that code by Hanno Becker: +// +// https://eprint.iacr.org/2022/1303.pdf +// https://github.com/slothy-optimizer/slothy/tree/main/paper +// ********************************************************************** + // ---------------------------------------------------------------------------- // The x25519 function for curve25519 // Inputs scalar[4], point[4]; output res[4] @@ -26,833 +38,1309 @@ .text .balign 4 -// Size of individual field elements - -#define NUMSIZE 32 - -// Stable homes for the input result argument during the whole body -// and other variables that are only needed prior to the modular inverse. - -#define res x23 -#define i x20 -#define swap x21 - -// Pointers to result x coord to be written - -#define resx res, #0 - -// Pointer-offset pairs for temporaries on stack with some aliasing. - -#define scalar sp, #(0*NUMSIZE) +// Pointer-offset pairs for temporaries on stack -#define pointx sp, #(1*NUMSIZE) +#define scalar sp, #0 +#define pointx sp, #32 +#define mask1 sp, #72 +#define mask2 sp, #80 +#define tmpa sp, #88 +#define tmpb sp, #128 +#define xn sp, #128 +#define zn sp, #160 -#define zm sp, #(2*NUMSIZE) -#define sm sp, #(2*NUMSIZE) -#define dpro sp, #(2*NUMSIZE) - -#define sn sp, #(3*NUMSIZE) - -#define dm sp, #(4*NUMSIZE) - -#define zn sp, #(5*NUMSIZE) -#define dn sp, #(5*NUMSIZE) -#define e sp, #(5*NUMSIZE) - -#define dmsn sp, #(6*NUMSIZE) -#define p sp, #(6*NUMSIZE) - -#define xm sp, #(7*NUMSIZE) -#define dnsm sp, #(7*NUMSIZE) -#define spro sp, #(7*NUMSIZE) - -#define d sp, #(8*NUMSIZE) - -#define xn sp, #(9*NUMSIZE) -#define s sp, #(9*NUMSIZE) +#define res sp, #192 +#define i sp, #200 +#define swap sp, #208 // Total size to reserve on the stack -#define NSPACE (10*NUMSIZE) - -// Macro wrapping up the basic field operation bignum_mul_p25519, only -// trivially different from a pure function call to that subroutine. 
- -#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, 
x14, #31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ - stp x9, x10, [P0+16] - -// A version of multiplication that only guarantees output < 2 * p_25519. -// This basically skips the +1 and final correction in quotient estimation. - -#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - 
adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; \ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ - stp x9, x10, [P0+16] - -// Squaring just giving a result < 2 * p_25519, which is done by -// basically skipping the +1 in the quotient estimate and the final -// optional correction. - -#define sqr_4(P0,P1) \ - ldp x10, x11, [P1]; \ - ldp x12, x13, [P1+16]; \ - umull x2, w10, w10; \ - lsr x14, x10, #32; \ - umull x3, w14, w14; \ - umull x14, w10, w14; \ - adds x2, x2, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x3, x3, x14; \ - umull x4, w11, w11; \ - lsr x14, x11, #32; \ - umull x5, w14, w14; \ - umull x14, w11, w14; \ - mul x15, x10, x11; \ - umulh x16, x10, x11; \ - adds x4, x4, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x5, x5, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x5, x5, xzr; \ - adds x3, x3, x15; \ - adcs x4, x4, x16; \ - adc x5, x5, xzr; \ - umull x6, w12, w12; \ - lsr x14, x12, #32; \ - umull x7, w14, w14; \ - umull x14, w12, w14; \ - adds x6, x6, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x7, x7, x14; \ - umull x8, w13, w13; \ - lsr x14, x13, #32; \ - umull x9, w14, w14; \ - umull x14, w13, w14; \ - mul x15, x12, x13; \ - umulh x16, x12, x13; \ - adds x8, x8, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x9, x9, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x9, x9, xzr; \ - adds x7, x7, x15; \ - adcs x8, x8, x16; \ - adc x9, x9, xzr; \ - subs x10, x10, x12; \ - sbcs x11, x11, x13; \ - csetm x16, cc; \ - eor x10, x10, x16; \ - subs x10, x10, x16; \ - eor x11, x11, x16; \ - sbc x11, x11, x16; \ - adds x6, x6, x4; \ - adcs x7, x7, x5; \ - adcs x8, x8, xzr; \ - adc x9, x9, xzr; \ - umull x12, w10, w10; \ - lsr x5, x10, #32; \ - umull x13, w5, w5; \ - umull x5, w10, w5; \ - adds x12, x12, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x13, x13, x5; \ - umull x15, w11, w11; \ - lsr x5, x11, #32; \ - umull x14, w5, w5; \ - umull x5, w11, w5; \ - mul x4, x10, x11; \ - umulh x16, x10, x11; \ - adds x15, x15, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x14, x14, x5; \ - adds x4, x4, x4; \ - adcs x16, x16, x16; \ - adc x14, x14, xzr; \ - adds x13, x13, x4; \ - adcs x15, x15, x16; \ - adc x14, x14, xzr; \ - adds x4, x2, x6; \ - adcs x5, x3, x7; \ - adcs x6, x6, x8; \ - adcs x7, x7, x9; \ - csetm x16, cc; \ - subs x4, x4, x12; \ - sbcs x5, x5, x13; \ - sbcs x6, x6, x15; \ - sbcs x7, x7, x14; \ - adcs x8, x8, x16; \ - adc x9, x9, x16; \ - mov x10, #0x26; \ - umull x12, w6, w10; \ - add x12, x12, w2, uxtw; \ - lsr x2, x2, #32; \ - lsr x6, x6, #32; \ - umaddl x6, w6, w10, x2; \ - mov x2, x12; \ - umull x12, w7, w10; \ - add x12, x12, w3, uxtw; \ - lsr x3, x3, 
#32; \ - lsr x7, x7, #32; \ - umaddl x7, w7, w10, x3; \ - mov x3, x12; \ - umull x12, w8, w10; \ - add x12, x12, w4, uxtw; \ - lsr x4, x4, #32; \ - lsr x8, x8, #32; \ - umaddl x8, w8, w10, x4; \ - mov x4, x12; \ - umull x12, w9, w10; \ - add x12, x12, w5, uxtw; \ - lsr x5, x5, #32; \ - lsr x9, x9, #32; \ - umaddl x9, w9, w10, x5; \ - mov x5, x12; \ - lsr x13, x9, #31; \ - mov x11, #0x13; \ - umull x11, w11, w13; \ - add x2, x2, x11; \ - adds x2, x2, x6, lsl #32; \ - extr x10, x7, x6, #32; \ - adcs x3, x3, x10; \ - extr x10, x8, x7, #32; \ - adcs x4, x4, x10; \ - extr x10, x9, x8, #32; \ - lsl x11, x13, #63; \ - eor x5, x5, x11; \ - adc x5, x5, x10; \ - stp x2, x3, [P0]; \ - stp x4, x5, [P0+16] - -// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. -// This only ensures that the result fits in 4 digits, not that it is reduced -// even w.r.t. double modulus. The result is always correct modulo provided -// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided -// at least one of them is reduced double modulo. - -#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16] - -// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 - -#define sub_twice4(p0,p1,p2) \ - ldp x5, x6, [p1]; \ - ldp x4, x3, [p2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [p1+16]; \ - ldp x4, x3, [p2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [p0]; \ - stp x7, x8, [p0+16] - -// Combined z = c * x + y with reduction only < 2 * p_25519 -// where c is initially in the X1 register. It is assumed -// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a -// high mul in the final part. - -#define cmadd_4(p0,p2,p3) \ - ldp x7, x8, [p2]; \ - ldp x9, x10, [p2+16]; \ - mul x3, x1, x7; \ - mul x4, x1, x8; \ - mul x5, x1, x9; \ - mul x6, x1, x10; \ - umulh x7, x1, x7; \ - umulh x8, x1, x8; \ - umulh x9, x1, x9; \ - umulh x10, x1, x10; \ - adds x4, x4, x7; \ - adcs x5, x5, x8; \ - adcs x6, x6, x9; \ - adc x10, x10, xzr; \ - ldp x7, x8, [p3]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x7, x8, [p3+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - adc x10, x10, xzr; \ - cmn x6, x6; \ - bic x6, x6, #0x8000000000000000; \ - adc x8, x10, x10; \ - mov x9, #19; \ - mul x7, x8, x9; \ - adds x3, x3, x7; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [p0]; \ - stp x5, x6, [p0+16] - -// Multiplex: z := if NZ then x else y - -#define mux_4(p0,p1,p2) \ - ldp x0, x1, [p1]; \ - ldp x2, x3, [p2]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0]; \ - ldp x0, x1, [p1+16]; \ - ldp x2, x3, [p2+16]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0+16] +#define NSPACE 224 +#define regsave sp, #NSPACE S2N_BN_SYMBOL(curve25519_x25519): -// Save regs and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - sub sp, sp, #NSPACE +// Save registers and make additional room NSPACE for temporaries. 
+// We only need to save the low 64-bits of the Q8...Q15 registers +// according to the ABI, so we use a save of the D8...D15 forms. + + sub sp, sp, #NSPACE+160 + stp d8, d9, [regsave+0] + stp d10, d11, [regsave+16] + stp d12, d13, [regsave+32] + stp d14, d15, [regsave+48] + stp x19, x20, [regsave+64] + stp x21, x22, [regsave+80] + stp x23, x24, [regsave+96] + stp x25, x26, [regsave+112] + stp x27, x28, [regsave+128] + stp x29, x30, [regsave+144] // Move the output pointer to a stable place - mov res, x0 + str x0, [res] -// Copy the inputs to the local variables with minimal mangling: -// -// - The scalar is in principle turned into 01xxx...xxx000 but -// in the structure below the special handling of these bits is -// explicit in the main computation; the scalar is just copied. -// -// - The point x coord is reduced mod 2^255 by masking off the -// top bit. In the main loop we only need reduction < 2 * p_25519. +// Copy the scalar to the corresponding local variable while +// mangling it. In principle it becomes 01xxx...xxx000 where +// the xxx are the corresponding bits of the original input +// scalar. We actually don't bother forcing the MSB to zero, +// but rather start the main loop below at 254 instead of 255. ldp x10, x11, [x1] + bic x10, x10, #7 stp x10, x11, [scalar] ldp x12, x13, [x1, #16] + orr x13, x13, #0x4000000000000000 stp x12, x13, [scalar+16] - ldp x10, x11, [x2] - stp x10, x11, [pointx] - ldp x12, x13, [x2, #16] - and x13, x13, #0x7fffffffffffffff +// Discard the MSB of the point X coordinate (this is in +// accordance with the RFC, mod 2^255, *not* 2^255-19). +// Then recode it into the unsaturated base 25.5 form. + + ldp x0, x1, [x2] + ldp x2, x3, [x2, #16] + + lsr x12, x0, #51 + lsr x17, x2, #51 + orr x12, x12, x1, lsl #13 + orr x17, x17, x3, lsl #13 + ubfx x8, x3, #12, #26 + ubfx x9, x3, #38, #25 + ubfx x11, x0, #26, #25 + ubfx x13, x1, #13, #25 + lsr x14, x1, #38 + ubfx x16, x2, #25, #26 + and x10, x0, #0x3ffffff + and x12, x12, #0x3ffffff + and x15, x2, #0x1ffffff + and x17, x17, #0x1ffffff + orr x10, x10, x11, lsl #32 + orr x11, x12, x13, lsl #32 + orr x12, x14, x15, lsl #32 + orr x13, x16, x17, lsl #32 + orr x14, x8, x9, lsl #32 + + stp x10, x11, [pointx+0] stp x12, x13, [pointx+16] + str x14, [pointx+32] + +// Initialize (X2,Z2) = (1,0), the identity (projective point at infinity) + + mov x1, #1 + mov v0.d[0], x1 + mov v2.d[0], xzr + mov v4.d[0], xzr + mov v6.d[0], xzr + mov v8.d[0], xzr + + mov v1.d[0], xzr + mov v3.d[0], xzr + mov v5.d[0], xzr + mov v7.d[0], xzr + mov v9.d[0], xzr + +// Initialize (X3,Z3) = (X,1), projective representation of X + + mov v10.d[0], x10 + mov v12.d[0], x11 + mov v14.d[0], x12 + mov v16.d[0], x13 + mov v18.d[0], x14 + + mov v11.d[0], x1 + mov v13.d[0], xzr + mov v15.d[0], xzr + mov v17.d[0], xzr + mov v19.d[0], xzr + +// Set up some constants used repeatedly in the main loop: +// +// Q31 = 0x1300000013 (two 32-bit copies of 19) +// Q30 = 0x3ffffff0000000003ffffff (two 64-bit copies of 2^26-1) +// Q29 = mask1 = (0x07ffffc,0x07fffffe) +// Q28 = mask2 = (0x07ffffb4,0x07fffffe) -// Initialize with explicit doubling in order to handle set bit 254. -// Set swap = 1 and (xm,zm) = (x,1) then double as (xn,zn) = 2 * (x,1). -// We use the fact that the point x coordinate is still in registers. -// Since zm = 1 we could do the doubling with an operation count of -// 2 * S + M instead of 2 * S + 2 * M, but it doesn't seem worth -// the slight complication arising from a different linear combination. 
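For readers of this hunk: the lsr/ubfx/orr block added above splits the four 64-bit words of the point X coordinate into ten digits of alternating 26 and 25 bits, so that the value equals sum_i l[i] * 2^ceil(25.5*i); pairs of digits (2i, 2i+1) are then packed into a single 64-bit register before the five stores to pointx. A minimal C sketch of the same split (helper name invented, not part of the s2n-bignum sources; it assumes bit 255 has already been discarded):

    #include <stdint.h>

    // Sketch only: split w[0..3] (little-endian 64-bit words, bit 255 clear)
    // into ten base-25.5 digits; even digits hold 26 bits, odd digits 25 bits.
    static void recode_base25p5(uint32_t l[10], const uint64_t w[4])
    {
        l[0] = (uint32_t)(w[0] & 0x3ffffff);                          // bits   0..25
        l[1] = (uint32_t)((w[0] >> 26) & 0x1ffffff);                  // bits  26..50
        l[2] = (uint32_t)(((w[0] >> 51) | (w[1] << 13)) & 0x3ffffff); // bits  51..76
        l[3] = (uint32_t)((w[1] >> 13) & 0x1ffffff);                  // bits  77..101
        l[4] = (uint32_t)(w[1] >> 38);                                // bits 102..127
        l[5] = (uint32_t)(w[2] & 0x1ffffff);                          // bits 128..152
        l[6] = (uint32_t)((w[2] >> 25) & 0x3ffffff);                  // bits 153..178
        l[7] = (uint32_t)(((w[2] >> 51) | (w[3] << 13)) & 0x1ffffff); // bits 179..203
        l[8] = (uint32_t)((w[3] >> 12) & 0x3ffffff);                  // bits 204..229
        l[9] = (uint32_t)((w[3] >> 38) & 0x1ffffff);                  // bits 230..254
    }

Keeping every digit several bits short of 32 is what lets the Neon code below accumulate 32x32-bit digit products in 64-bit lanes without overflow.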
- - mov swap, #1 - stp x10, x11, [xm] - stp x12, x13, [xm+16] - stp swap, xzr, [zm] - stp xzr, xzr, [zm+16] - - sub_twice4(d,xm,zm) - add_twice4(s,xm,zm) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - -// The main loop over unmodified bits from i = 253, ..., i = 3 (inclusive). -// This is a classic Montgomery ladder, with the main coordinates only -// reduced mod 2 * p_25519, some intermediate results even more loosely. + mov w0, #19 + add x0, x0, x0, lsl #32 + mov v31.d[0], x0 + mov v31.d[1], xzr - mov i, #253 + mov x0, #(1<<26)-1 + mov v30.d[0], x0 + mov v30.d[1], x0 -curve25519_x25519_scalarloop: + mov x0, #0x07fffffe07fffffe + sub x1, x0, #0xfe-0xb4 + sub x0, x0, #2 -// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn + stp x0, x1, [mask1] + ldp d29, d28, [mask1] - sub_twice4(dm,xm,zm) - add_twice4(sn,xn,zn) - sub_twice4(dn,xn,zn) - add_twice4(sm,xm,zm) +// The main loop over (modified) bits from i = 254, ..., i = 0 (inclusive); +// we explicitly skip bit 255 because it should be forced to zero initially. +// This is a classic Montgomery ladder using a "swap" variable. +// It's assumed x0 = i at the start of the loop, but that is volatile and +// needs to be reloaded from memory at the end of the loop. -// ADDING: dmsn = dm * sn -// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + str xzr, [swap] + mov x0, #254 + str x0, [i] - mul_4(dmsn,sn,dm) +curve25519_x25519_scalarloop: - lsr x0, i, #6 - ldr x2, [sp, x0, lsl #3] // Exploiting scalar = sp exactly - lsr x2, x2, i + lsr x1, x0, #6 + ldr x2, [sp, x1, lsl #3] // Exploiting scalar = sp exactly + lsr x2, x2, x0 and x2, x2, #1 - cmp swap, x2 - mov swap, x2 - - mux_4(d,dm,dn) - mux_4(s,sm,sn) - -// ADDING: dnsm = sm * dn - - mul_4(dnsm,sm,dn) - -// DOUBLING: d = (xt - zt)^2 - - sqr_4(d,d) - -// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 -// DOUBLING: s = (xt + zt)^2 - - sub_twice4(dpro,dmsn,dnsm) - sqr_4(s,s) - add_twice4(spro,dmsn,dnsm) - sqr_4(dpro,dpro) - -// DOUBLING: p = 4 * xt * zt = s - d - - sub_twice4(p,s,d) - -// ADDING: xm' = (dmsn + dnsm)^2 - - sqr_4(xm,spro) - -// DOUBLING: e = 121666 * p + d - - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - -// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d - - mul_4(xn,s,d) - -// ADDING: zm' = x * (dmsn - dnsm)^2 - - mul_4(zm,dpro,pointx) - -// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) -// = p * (d + 121666 * p) - - mul_4(zn,p,e) - -// Loop down as far as 3 (inclusive) - - sub i, i, #1 - cmp i, #3 + ldr x0, [swap] + cmp x0, x2 + str x2, [swap] + +// The following inner loop code is derived closely following Lenngren's +// implementation available at "https://github.com/Emill/X25519-AArch64". +// In particular, the basic dataflow and the organization between integer +// and SIMD units is identical, with only a few minor changes to some +// individual instructions (for miscellaneous reasons). The scheduling +// was redone from scratch by SLOTHY starting from Hanno Becker's +// un-interleaved form and using the same scripts as in Becker et al's +// paper. +// +// The intermediate value annotations were added to provide data that +// is used in the formal proof, indicating which lines assign specific +// digits of the various intermediate results (mainly of field +// operations, sometimes other transformations). 
The names used for +// the intermediate results are similar but not identical to those in +// the abstract Algorithm 1 description in Lenngren's paper. Almost +// all equations are to be interpreted as field operations, i.e. as +// arithmetic modulo 2^255-19, not simple numeric equalities. +// +// b = x2 - z2 +// d = x3 - z3 +// a = x2 + z2 +// c = x3 + z3 +// f = if flip then c else a +// g = if flip then d else b +// aa = f^2 +// bb = g^2 +// bbalt = bb (change of representation) +// e = aa - bb +// bce = bbalt + 121666 * e +// z4 = bce * e +// bc = b * c +// ad = a * d +// t1 = ad + bc +// t2 = ad - bc +// x5 = t1^2 +// t3 = t2^2 +// x4 = aa * bb +// z5 = x * t3 +// +// Then the main variables are updated for the next iteration as +// +// (x2',z2') = (x4,z4) +// (x3',z3') = (x5,z5) + + add v22.2S, v2.2S, v3.2S // ubignum_of_qreglist 1 // INTERMEDIATE a + sub v21.2S, v28.2S, v1.2S + add v25.2S, v0.2S, v1.2S // ubignum_of_qreglist 0 // INTERMEDIATE a + sub v24.2S, v29.2S, v3.2S + add v3.2S, v18.2S, v19.2S // ubignum_of_qreglist 4 // INTERMEDIATE c + add v0.2S, v0.2S, v21.2S // ubignum_of_qreglist 0 // INTERMEDIATE b + sub v20.2S, v29.2S, v15.2S + sub v1.2S, v29.2S, v5.2S + sub v26.2S, v28.2S, v11.2S + sub v21.2S, v29.2S, v19.2S + add v19.2S, v10.2S, v11.2S // ubignum_of_qreglist 0 // INTERMEDIATE c + add v11.2S, v14.2S, v20.2S // ubignum_of_qreglist 2 // INTERMEDIATE d + add v21.2S, v18.2S, v21.2S // ubignum_of_qreglist 4 // INTERMEDIATE d + sub v20.2S, v29.2S, v17.2S + add v18.2S, v2.2S, v24.2S // ubignum_of_qreglist 1 // INTERMEDIATE b + add v14.2S, v14.2S, v15.2S // ubignum_of_qreglist 2 // INTERMEDIATE c + add v15.2S, v16.2S, v17.2S // ubignum_of_qreglist 3 // INTERMEDIATE c + add v2.2S, v16.2S, v20.2S // ubignum_of_qreglist 3 // INTERMEDIATE d + add v24.2S, v12.2S, v13.2S // ubignum_of_qreglist 1 // INTERMEDIATE c + add v26.2S, v10.2S, v26.2S // ubignum_of_qreglist 0 // INTERMEDIATE d + sub v10.2S, v29.2S, v13.2S + sub v13.2S, v29.2S, v7.2S + add v23.2S, v6.2S, v7.2S // ubignum_of_qreglist 3 // INTERMEDIATE a + sub v7.2S, v29.2S, v9.2S + add v27.2S, v12.2S, v10.2S // ubignum_of_qreglist 1 // INTERMEDIATE d + fcsel d20, d22, d24, eq // ubignum_of_qreglist 1 // INTERMEDIATE f + add v28.2S, v4.2S, v5.2S // ubignum_of_qreglist 2 // INTERMEDIATE a + fcsel d12, d23, d15, eq // ubignum_of_qreglist 3 // INTERMEDIATE f + add v7.2S, v8.2S, v7.2S // ubignum_of_qreglist 4 // INTERMEDIATE b + fcsel d16, d25, d19, eq // ubignum_of_qreglist 0 // INTERMEDIATE f + mov x0, v20.d[0] + fcsel d5, d28, d14, eq // ubignum_of_qreglist 2 // INTERMEDIATE f + mov x21, v12.d[0] + fcsel d29, d7, d21, eq // ubignum_of_qreglist 4 // INTERMEDIATE g + mov x5, v16.d[0] + lsr x26, x0, #32 + add x29, x21, x21 + umull x15, w5, w29 + add v13.2S, v6.2S, v13.2S // ubignum_of_qreglist 3 // INTERMEDIATE b + add x12, x26, x26 + mov x30, v5.d[0] + fcsel d10, d18, d27, eq // ubignum_of_qreglist 1 // INTERMEDIATE g + lsr x11, x5, #32 + lsr x10, x30, #32 + trn2 v20.2S, v21.2S, v3.2S + add v9.2S, v8.2S, v9.2S // ubignum_of_qreglist 4 // INTERMEDIATE a + add x14, x11, x11 + trn2 v6.2S, v2.2S, v15.2S + trn1 v12.2S, v25.2S, v0.2S + add v1.2S, v4.2S, v1.2S // ubignum_of_qreglist 2 // INTERMEDIATE b + trn1 v16.2S, v23.2S, v13.2S + fcsel d8, d13, d2, eq // ubignum_of_qreglist 3 // INTERMEDIATE g + trn2 v17.2S, v27.2S, v24.2S + str d29, [tmpb+32] + add x17, x10, x10 + trn2 v4.2S, v28.2S, v1.2S + trn1 v5.2S, v28.2S, v1.2S + trn1 v28.2S, v2.2S, v15.2S + trn1 v2.2S, v22.2S, v18.2S + fcsel d29, d0, d26, eq // ubignum_of_qreglist 0 // 
INTERMEDIATE g + trn2 v15.2S, v22.2S, v18.2S + umull v22.2D, v12.2S, v20.2S + umull x22, w30, w17 + stp d29, d10, [tmpb+0] + trn2 v10.2S, v23.2S, v13.2S + trn2 v23.2S, v11.2S, v14.2S + trn1 v13.2S, v27.2S, v24.2S + fcsel d27, d1, d11, eq // ubignum_of_qreglist 2 // INTERMEDIATE g + trn1 v14.2S, v11.2S, v14.2S + umlal v22.2D, v2.2S, v6.2S + umull x25, w30, w30 + umlal v22.2D, v5.2S, v23.2S + add x3, x30, x30 + umlal v22.2D, v16.2S, v17.2S + add w30, w21, w21, lsl #1; + stp d27, d8, [tmpb+16] + add w30, w30, w21, lsl #4 + trn1 v11.2S, v26.2S, v19.2S + trn2 v8.2S, v26.2S, v19.2S + trn2 v19.2S, v25.2S, v0.2S + mul v29.2S, v20.2S, v31.2S + ldr x20, [tmpb+24] + umull v25.2D, v19.2S, v6.2S + add x1, x0, x0 + umull v27.2D, v19.2S, v23.2S + umull x9, w5, w1 + umull v0.2D, v12.2S, v23.2S + lsr x24, x20, #32 + mul v20.2S, v23.2S, v31.2S + lsr x16, x21, #32 + umlal v25.2D, v15.2S, v23.2S + umaddl x13, w11, w14, x9 + umlal v25.2D, v4.2S, v17.2S + umaddl x9, w14, w17, x15 + umull v24.2D, v12.2S, v6.2S + add w2, w16, w16, lsl #1; + fcsel d26, d9, d3, eq // ubignum_of_qreglist 4 // INTERMEDIATE f + add w2, w2, w16, lsl #4 + trn1 v18.2S, v21.2S, v3.2S + umull v3.2D, v19.2S, v29.2S + umull x28, w5, w3 + mul v1.2S, v6.2S, v31.2S + umull x8, w5, w5 + umlal v24.2D, v2.2S, v23.2S + umaddl x13, w21, w30, x13 + mul v23.2S, v17.2S, v31.2S + umaddl x27, w14, w12, x28 + trn2 v6.2S, v9.2S, v7.2S + mov x6, v26.d[0] + umlal v3.2D, v15.2S, v1.2S + add x16, x16, x16 + umlal v3.2D, v4.2S, v20.2S + lsr x4, x6, #32 + umlal v3.2D, v10.2S, v23.2S + add x7, x6, x6 + umull v26.2D, v19.2S, v8.2S + add x23, x4, x4 + umaddl x28, w5, w23, x22 + trn1 v7.2S, v9.2S, v7.2S + umlal v27.2D, v15.2S, v17.2S + add w15, w4, w4, lsl #1; + umlal v27.2D, v4.2S, v8.2S + add w15, w15, w4, lsl #4 + add w22, w10, w10, lsl #1; + umlal v24.2D, v5.2S, v17.2S + add w22, w22, w10, lsl #4 + umaddl x10, w11, w7, x28 + umlal v25.2D, v10.2S, v8.2S + umull x21, w5, w16 + umlal v25.2D, v6.2S, v29.2S + umaddl x23, w15, w23, x25 + umlal v27.2D, v10.2S, v29.2S + umull x19, w5, w12 + umlal v27.2D, v6.2S, v1.2S + umaddl x25, w11, w29, x21 + umlal v0.2D, v2.2S, v17.2S + umaddl x28, w0, w3, x9 + shl v21.2D, v25.2D, #1 + umaddl x4, w11, w1, x19 + umaddl x21, w2, w29, x4 + mul v25.2S, v8.2S, v31.2S + umlal v24.2D, v16.2S, v8.2S + umaddl x19, w0, w17, x25 + umlal v24.2D, v7.2S, v29.2S + umull x25, w5, w17 + umlal v24.2D, v19.2S, v28.2S + umaddl x4, w0, w16, x10 + umull v9.2D, v12.2S, v8.2S + umaddl x23, w5, w7, x23 + umlal v21.2D, v12.2S, v18.2S + add w10, w6, w6, lsl #1; + shl v27.2D, v27.2D, #1 + add w10, w10, w6, lsl #4 + umaddl x28, w26, w12, x28 + umlal v26.2D, v15.2S, v29.2S + umaddl x9, w14, w16, x23 + umlal v9.2D, v2.2S, v29.2S + umaddl x22, w22, w17, x8 + umlal v21.2D, v2.2S, v28.2S + umaddl x28, w6, w10, x28 + umaddl x27, w0, w0, x27 + add x8, x14, x14 + umlal v0.2D, v5.2S, v8.2S + umull x5, w5, w14 + umlal v9.2D, v5.2S, v1.2S + umaddl x14, w0, w29, x9 + umlal v26.2D, v4.2S, v1.2S + umaddl x6, w2, w16, x27 + umlal v22.2D, v7.2S, v8.2S + umaddl x5, w30, w17, x5 + umaddl x5, w2, w3, x5 + add x23, x17, x17 + umlal v27.2D, v12.2S, v28.2S + umaddl x13, w2, w23, x13 + umlal v26.2D, v10.2S, v20.2S + add x9, x12, x12 + umlal v9.2D, v16.2S, v20.2S + umaddl x27, w10, w29, x6 + umlal v0.2D, v16.2S, v29.2S + umaddl x6, w11, w3, x25 + umlal v22.2D, v19.2S, v18.2S + umaddl x19, w26, w3, x19 + mul v18.2S, v18.2S, v31.2S + umaddl x23, w15, w23, x27 + umlal v3.2D, v6.2S, v25.2S + umaddl x0, w0, w12, x6 + umlal v0.2D, v7.2S, v1.2S + add x11, x16, x16 + umlal v9.2D, v7.2S, 
v23.2S + umaddl x6, w12, w17, x14 + umlal v9.2D, v19.2S, v11.2S + umaddl x25, w26, w29, x4 + umlal v9.2D, v15.2S, v18.2S + umaddl x14, w10, w3, x13 + umull v25.2D, v12.2S, v17.2S + umaddl x27, w10, w16, x0 + umlal v26.2D, v6.2S, v23.2S + add x0, x25, x6, lsr #26 + mul v23.2S, v28.2S, v31.2S + umaddl x12, w10, w12, x5 + shl v3.2D, v3.2D, #1 + add x16, x22, x0, lsr #25 + umlal v21.2D, v5.2S, v14.2S + bic x22, x0, #0x1ffffff + umlal v3.2D, v12.2S, v11.2S + add x26, x16, x22, lsr #24 + umlal v3.2D, v2.2S, v18.2S + umaddl x16, w10, w17, x21 + umlal v3.2D, v5.2S, v23.2S + add x22, x26, x22, lsr #21 + umlal v9.2D, v4.2S, v23.2S + umaddl x5, w15, w29, x27 + umull v17.2D, v19.2S, v17.2S + umaddl x17, w30, w3, x22 + umlal v25.2D, v2.2S, v8.2S + umaddl x25, w15, w3, x16 + umlal v25.2D, v5.2S, v29.2S + umaddl x26, w15, w7, x19 + umlal v0.2D, v19.2S, v14.2S + umaddl x17, w2, w9, x17 + umlal v17.2D, v15.2S, v8.2S + ldr x19, [tmpb+0] + umlal v17.2D, v4.2S, v29.2S + ldr x7, [tmpb+8] + shl v29.2D, v26.2D, #1 + umaddl x13, w10, w1, x17 + umlal v0.2D, v15.2S, v13.2S + lsr x2, x19, #32 + umlal v29.2D, v12.2S, v13.2S + umaddl x27, w15, w1, x12 + umlal v29.2D, v2.2S, v11.2S + umaddl x30, w15, w8, x13 + umlal v29.2D, v5.2S, v18.2S + add x4, x7, x7 + umlal v29.2D, v16.2S, v23.2S + umaddl x29, w15, w9, x14 + umlal v0.2D, v4.2S, v11.2S + add x17, x27, x30, lsr #26 + umlal v0.2D, v10.2S, v18.2S + umaddl x16, w15, w11, x28 + umlal v0.2D, v6.2S, v23.2S + add x1, x29, x17, lsr #25 + umlal v25.2D, v16.2S, v1.2S + umull x11, w19, w4 + ldr x8, [tmpb+32] + mul v26.2S, v14.2S, v31.2S + umlal v17.2D, v10.2S, v1.2S + ldr x15, [tmpb+16] + umlal v17.2D, v6.2S, v20.2S + and x9, x30, #0x3ffffff + bfi x9, x17, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE aa + add x17, x2, x2 + lsr x10, x15, #32 + add x27, x25, x1, lsr #26 + umlal v25.2D, v7.2S, v20.2S + add x13, x10, x10 + umlal v25.2D, v19.2S, v13.2S + add x29, x23, x27, lsr #25 + umlal v25.2D, v15.2S, v11.2S + lsr x30, x8, #32 + umlal v25.2D, v4.2S, v18.2S + add x23, x5, x29, lsr #26 + umlal v25.2D, v10.2S, v23.2S + and x14, x29, #0x3ffffff + umlal v25.2D, v6.2S, v26.2S + add x5, x16, x23, lsr #25 + shl v8.2D, v17.2D, #1 + umaddl x12, w2, w17, x11 + and x29, x5, #0x3ffffff + umull x21, w19, w19 + umlal v29.2D, v7.2S, v26.2S + add w16, w10, w10, lsl #1; + umlal v3.2D, v16.2S, v26.2S + add w16, w16, w10, lsl #4 + bfi x14, x23, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE aa + add w10, w24, w24, lsl #1; + add x22, x26, x5, lsr #26 + add w10, w10, w24, lsl #4 + umlal v8.2D, v12.2S, v14.2S + umaddl x25, w16, w13, x21 + umlal v8.2D, v2.2S, v13.2S + bfi x29, x22, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE aa + umlal v8.2D, v5.2S, v11.2S + add x26, x24, x24 + umlal v8.2D, v16.2S, v18.2S + stp x14, x29, [tmpa+16] + umlal v8.2D, v7.2S, v23.2S + add w24, w30, w30, lsl #1; + usra v25.2D, v29.2D, #26 + add w24, w24, w30, lsl #4 + umull x29, w15, w15 + umlal v27.2D, v2.2S, v14.2S + umull x3, w15, w13 + umlal v27.2D, v5.2S, v13.2S + add x21, x20, x20 + umlal v24.2D, v15.2S, v14.2S + umull x5, w19, w21 + umlal v24.2D, v4.2S, v13.2S + and x11, x1, #0x3ffffff + usra v8.2D, v25.2D, #25 + and x1, x0, #0x1ffffff + umlal v27.2D, v16.2S, v11.2S + umaddl x23, w17, w13, x5 + umlal v27.2D, v7.2S, v18.2S + add x5, x30, x30 + usra v0.2D, v8.2D, #26 + add x0, x15, x15 + umlal v24.2D, v10.2S, v11.2S + umaddl x23, w7, w0, x23 + umlal v24.2D, v6.2S, v18.2S + lsr x30, x7, #32 + usra v27.2D, v0.2D, #25 + add x16, x30, x30 + and v20.16B, v8.16B, v30.16B // ubignum_of_hreglist 4 + 
ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad + umaddl x15, w30, w16, x23 + ushr v23.2D, v30.2D, #1 + add w23, w8, w8, lsl #1; + usra v24.2D, v27.2D, #26 + add w23, w23, w8, lsl #4 + umaddl x14, w19, w5, x3 + and v8.16B, v27.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = bc|ad + add x28, x8, x8 + and v27.16B, v0.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad + umaddl x8, w8, w23, x15 + and v5.16B, v24.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad + umaddl x3, w2, w28, x14 + umlal v22.2D, v15.2S, v28.2S + bfi x11, x27, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE aa + uzp1 v5.4S, v8.4S, v5.4S + umaddl x14, w24, w5, x29 + umaddl x5, w19, w28, x14 + ldr d18, [mask1] + mov v18.d[1], v18.d[0] + umaddl x15, w7, w26, x3 + mul v12.2S, v13.2S, v31.2S + umlal v21.2D, v16.2S, v13.2S + stp x9, x11, [tmpa+0] + umlal v21.2D, v7.2S, v11.2S + umaddl x29, w17, w26, x5 + umlal v22.2D, v4.2S, v14.2S + add w14, w20, w20, lsl #1; + umlal v22.2D, v10.2S, v13.2S + add w14, w14, w20, lsl #4 + umull x3, w19, w0 + umlal v22.2D, v6.2S, v11.2S + umaddl x29, w7, w21, x29 + usra v21.2D, v24.2D, #25 + umaddl x11, w20, w14, x12 + and v0.16B, v25.16B, v23.16B + umaddl x5, w30, w21, x15 + and v14.16B, v29.16B, v30.16B + umaddl x12, w16, w13, x29 + usra v22.2D, v21.2D, #26 + umaddl x29, w17, w16, x3 + umlal v3.2D, v7.2S, v12.2S + add x9, x26, x26 + and v1.16B, v21.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = bc|ad + add x27, x5, x12, lsr #26 + bic v8.16B, v22.16B, v23.16B + umaddl x29, w7, w7, x29 + and v17.16B, v22.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad + add x5, x25, x27, lsr #25 + usra v3.2D, v8.2D, #25 + umaddl x25, w24, w9, x8 + umlal v9.2D, v10.2S, v26.2S + add x8, x13, x13 + trn1 v22.4S, v1.4S, v17.4S + umaddl x11, w10, w8, x11 + usra v3.2D, v8.2D, #24 + umull x20, w19, w16 + add v26.2S, v22.2S, v18.2S + ldr d28, [mask2] + umlal v9.2D, v6.2S, v12.2S + umaddl x3, w23, w0, x11 + usra v3.2D, v8.2D, #21 + umaddl x29, w10, w26, x29 + uzp1 v11.4S, v20.4S, v27.4S + umaddl x20, w2, w4, x20 + umaddl x9, w10, w21, x20 + mov v17.d[0], v22.d[1] + usra v9.2D, v3.2D, #26 + umull x15, w19, w13 + and v7.16B, v3.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad + add x11, x16, x16 + uzp2 v1.4S, v11.4S, v5.4S + umaddl x20, w23, w13, x9 + and v8.16B, v9.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad + umaddl x9, w2, w0, x15 + usra v14.2D, v9.2D, #25 + and x6, x6, #0x3ffffff + uzp1 v7.4S, v7.4S, v8.4S + umaddl x29, w23, w21, x29 + uzp1 v27.4S, v11.4S, v5.4S + umull x15, w19, w26 + usra v0.2D, v14.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad + add x6, x6, x22, lsr #25 + and v3.16B, v14.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad + bic x22, x27, #0x1ffffff + sub v2.2S, v26.2S, v17.2S + add v9.2S, v22.2S, v17.2S + uzp1 v14.4S, v3.4S, v0.4S + umaddl x2, w2, w21, x15 + add v5.4S, v27.4S, v18.4S + add x5, x5, x22, lsr #24 + zip1 v22.2S, v2.2S, v9.2S // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 + mov v18.b[0], v28.b[0] + uzp1 v8.4S, v7.4S, v14.4S + add x22, x5, x22, lsr #21 + uzp2 v3.4S, v7.4S, v14.4S + umaddl x5, w7, w16, x9 + add v25.4S, v8.4S, v18.4S + umaddl x15, w14, w0, x22 + add v12.4S, v27.4S, v1.4S + add x9, x17, 
x17 + sub v14.4S, v5.4S, v1.4S + umull x19, w19, w17 + sub v18.4S, v25.4S, v3.4S + ldr x22, [tmpa+8] + add v20.4S, v8.4S, v3.4S + umaddl x15, w10, w11, x15 + zip1 v16.4S, v14.4S, v12.4S // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 + umaddl x14, w14, w13, x19 + zip2 v14.4S, v14.4S, v12.4S // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 + and x17, x27, #0x1ffffff + zip2 v0.4S, v18.4S, v20.4S // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 + umaddl x15, w23, w4, x15 + zip1 v1.4S, v18.4S, v20.4S // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 + umaddl x10, w10, w0, x14 + zip2 v5.2S, v2.2S, v9.2S // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 + shl v24.2S, v0.2S, #1 + mov v19.d[0], v1.d[1] // ubignum_of_h32reglist 1 + ubignum_of_l32reglist 1 // INTERMEDIATE H|L = t1|t2 + shl v26.2S, v22.2S, #1 + shl v17.2S, v16.2S, #1 + mov v15.d[0], v0.d[1] // ubignum_of_h32reglist 3 + ubignum_of_l32reglist 3 // INTERMEDIATE H|L = t1|t2 + shl v7.2S, v5.2S, #1 + shl v18.2S, v19.2S, #1 + umull v11.2D, v1.2S, v24.2S + umaddl x19, w23, w16, x10 + umull v6.2D, v1.2S, v17.2S + umaddl x10, w7, w13, x2 + mov v4.d[0], v16.d[1] // ubignum_of_h32reglist 5 + ubignum_of_l32reglist 5 // INTERMEDIATE H|L = t1|t2 + mov v10.d[0], v14.d[1] // ubignum_of_h32reglist 7 + ubignum_of_l32reglist 7 // INTERMEDIATE H|L = t1|t2 + umull v9.2D, v1.2S, v26.2S + ldr x13, [tmpa+0] + shl v28.2S, v15.2S, #1 + shl v3.2S, v10.2S, #1 + ldr x14, [tmpa+16] + mul v12.2S, v10.2S, v31.2S + umull v25.2D, v1.2S, v7.2S + ldr x2, [tmpa+24] + umlal v6.2D, v18.2S, v28.2S + umaddl x27, w30, w0, x10 + umaddl x16, w24, w0, x20 + shl v13.2S, v14.2S, #1 + umaddl x5, w23, w26, x5 + mul v2.2S, v22.2S, v31.2S + umull v21.2D, v1.2S, v13.2S + umaddl x23, w24, w8, x29 + umlal v11.2D, v18.2S, v19.2S + mov x10, #0x07fffffe07fffffe + sub x10, x10, #2 + umaddl x26, w24, w21, x5 + mul v29.2S, v14.2S, v31.2S + umlal v25.2D, v19.2S, v26.2S + add x7, x1, x6, lsr #26 + mul v20.2S, v4.2S, v31.2S + and x6, x6, #0x3ffffff + shl v8.2S, v18.2S, #1 + shl v4.2S, v4.2S, #1 + umlal v11.2D, v29.2S, v14.2S + bfi x6, x7, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE aa + umlal v25.2D, v0.2S, v3.2S + umaddl x0, w24, w4, x19 + umlal v25.2D, v15.2S, v13.2S + str x6, [tmpa+32] + umlal v21.2D, v18.2S, v4.2S + umaddl x8, w24, w11, x3 + umlal v21.2D, v0.2S, v17.2S + ldr x30, [tmpa+32] + mul v14.2S, v5.2S, v31.2S + add x2, x2, x10 + shl v5.2S, v28.2S, #1 + shl v27.2S, v4.2S, #1 + umlal v6.2D, v0.2S, v0.2S + umaddl x11, w24, w9, x15 + umlal v6.2D, v12.2S, v3.2S + add x4, x30, x10 + umlal v11.2D, v14.2S, v5.2S + add x3, x22, x10 + umlal v11.2D, v2.2S, v17.2S + add x6, x0, x11, lsr #26 + umlal v11.2D, v12.2S, v27.2S + add x14, x14, x10 + umlal v6.2D, v14.2S, v27.2S + add x8, x8, x6, lsr #25 + umlal v6.2D, v2.2S, v13.2S + movk x10, #0xffb4 + umlal v25.2D, v16.2S, v4.2S + add x29, x16, x8, lsr #26 + umull v27.2D, v1.2S, v3.2S + and x11, x11, #0x3ffffff + umlal v9.2D, v18.2S, v3.2S + add x19, x13, x10 + umlal v9.2D, v0.2S, v13.2S + and x5, x8, #0x3ffffff + umlal v9.2D, v28.2S, v4.2S + bfi x11, x6, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE bb + umlal v9.2D, v16.2S, v16.2S + umaddl x30, w24, w28, x27 + umlal v9.2D, v14.2S, v7.2S + sub x13, x19, x11 + umull v10.2D, v1.2S, v18.2S + add x7, x23, x29, lsr #25 + umlal v21.2D, v28.2S, v15.2S + lsr x16, x13, #32 // ubignum_of_wreglist 1 + ubignum_of_wreglist 0 // INTERMEDIATE e + 
umlal v21.2D, v2.2S, v22.2S + add x0, x26, x7, lsr #26 + usra v25.2D, v9.2D, #26 + and x20, x7, #0x3ffffff + umull v22.2D, v1.2S, v1.2S + add x8, x25, x0, lsr #25 + umull v7.2D, v1.2S, v28.2S + and x1, x29, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bbalt + bic v18.16B, v25.16B, v23.16B + and x19, x8, #0x3ffffff + and v16.16B, v9.16B, v30.16B + and x7, x12, #0x3ffffff + usra v22.2D, v18.2D, #25 + add x10, x30, x8, lsr #26 + umlal v7.2D, v19.2S, v24.2S + bfi x5, x29, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE bb + and v9.16B, v25.16B, v23.16B + add x27, x7, x10, lsr #25 + usra v22.2D, v18.2D, #24 + mov x21, #60833 + lsl x21, x21, #1 + add x15, x17, x27, lsr #26 + shl v25.2S, v3.2S, #1 + umlal v7.2D, v14.2S, v17.2S + and x29, x27, #0x3ffffff + usra v22.2D, v18.2D, #21 + bfi x29, x15, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE bb // ***SOURCE*** ubignum_of_xreglist 9 // INTERMEDIATE bbalt + umlal v10.2D, v14.2S, v24.2S + and x17, x6, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bbalt + umlal v10.2D, v2.2S, v28.2S + sub x6, x3, x5 + umlal v10.2D, v12.2S, v17.2S + umaddl x25, w16, w21, x17 + umlal v10.2D, v29.2S, v4.2S + mov w12, w5 // ubignum_of_xreglist 2 // INTERMEDIATE bbalt + umlal v22.2D, v20.2S, v4.2S + lsr x26, x6, #32 // ubignum_of_wreglist 3 + ubignum_of_wreglist 2 // INTERMEDIATE e + umlal v22.2D, v14.2S, v8.2S + and x24, x0, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bbalt + umlal v22.2D, v2.2S, v24.2S + stp x11, x5, [tmpb+0] + umlal v22.2D, v12.2S, v5.2S + bfi x20, x0, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE bb + umlal v22.2D, v29.2S, v17.2S + umaddl x12, w6, w21, x12 + umull v18.2D, v1.2S, v4.2S + bfi x19, x10, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE bb + umlal v7.2D, v2.2S, v4.2S + sub x7, x14, x20 + umlal v27.2D, v19.2S, v13.2S + mov w8, w20 // ubignum_of_xreglist 4 // INTERMEDIATE bbalt + usra v10.2D, v22.2D, #26 + lsr x14, x7, #32 // ubignum_of_wreglist 5 + ubignum_of_wreglist 4 // INTERMEDIATE e + umlal v18.2D, v19.2S, v17.2S + and x28, x10, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bbalt + umlal v7.2D, v12.2S, v13.2S + sub x5, x2, x19 + usra v11.2D, v10.2D, #25 + mov w2, w19 // ubignum_of_xreglist 6 // INTERMEDIATE bbalt + umlal v27.2D, v0.2S, v4.2S + umlal v21.2D, v14.2S, v25.2S + sub x23, x4, x29 + usra v7.2D, v11.2D, #26 + mov w0, w29 // ubignum_of_xreglist 8 // INTERMEDIATE bbalt + umlal v18.2D, v0.2S, v28.2S + lsr x22, x23, #32 // ubignum_of_wreglist 9 + ubignum_of_wreglist 8 // INTERMEDIATE e + umlal v27.2D, v15.2S, v17.2S + str x29, [tmpb+32] + usra v6.2D, v7.2D, #25 + mov w17, w11 // ubignum_of_xreglist 0 // INTERMEDIATE bbalt + and v0.16B, v22.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 + umaddl x27, w26, w21, x1 + umlal v18.2D, v14.2S, v13.2S + umaddl x30, w23, w21, x0 + umlal v18.2D, v2.2S, v3.2S + lsr x10, x5, #32 // ubignum_of_wreglist 7 + ubignum_of_wreglist 6 // INTERMEDIATE e + and v4.16B, v6.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 + and v1.16B, v10.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x5|t3 + umaddl x4, w14, w21, x24 + ldr x0, [tmpa+0] + mov v0.s[1], w0 + lsr x0, x0, #32 + mov v1.s[1], w0 + umaddl x9, w7, w21, x8 + usra v18.2D, v6.2D, #26 + umaddl x24, w10, w21, x28 + and v3.16B, v7.16B, v23.16B // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 + umaddl x8, w22, w21, x15 + umlal v27.2D, v14.2S, v26.2S + umaddl x15, w13, 
w21, x17 + usra v21.2D, v18.2D, #25 + stp x20, x19, [tmpb+16] + and v2.16B, v11.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 + lsr x29, x8, #25 + ldr x3, [tmpb+0] + mov v10.s[1], w3 + lsr x3, x3, #32 + mov v11.s[1], w3 + add x17, x15, x29 + usra v27.2D, v21.2D, #26 + add x28, x17, x29, lsl #1 + and v6.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 + and x20, x8, #0x1ffffff + and v5.16B, v18.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 + add x17, x28, x29, lsl #4 + and v7.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 + ldr x3, [tmpb+8] + mov v22.s[1], w3 + lsr x3, x3, #32 + mov v23.s[1], w3 + add x29, x25, x17, lsr #26 + ldr x15, [pointx+0] + mov v10.s[0], w15 + lsr x15, x15, #32 + mov v11.s[0], w15 + and x11, x17, #0x3ffffff // ubignum_of_xreglist 0 // INTERMEDIATE bce + usra v16.2D, v27.2D, #25 + add x8, x12, x29, lsr #25 + ldr x3, [tmpb+16] + mov v14.s[1], w3 + lsr x3, x3, #32 + mov v15.s[1], w3 + and x12, x29, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bce + ldr x15, [pointx+8] + mov v22.s[0], w15 + lsr x15, x15, #32 + mov v23.s[0], w15 + add x28, x27, x8, lsr #26 + and v8.16B, v16.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + umull x1, w12, w10 + ldr x3, [tmpb+24] + mov v17.s[1], w3 + lsr x3, x3, #32 + mov v18.s[1], w3 + add x25, x9, x28, lsr #25 + ldr x15, [pointx+16] + mov v14.s[0], w15 + lsr x15, x15, #32 + mov v15.s[0], w15 + umaddl x19, w5, w21, x2 + usra v9.2D, v16.2D, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + add x2, x4, x25, lsr #26 + ldr x3, [tmpb+32] + mov v24.s[1], w3 + lsr x3, x3, #32 + mov v25.s[1], w3 + umull x3, w12, w23 + ldr x15, [pointx+24] + mov v17.s[0], w15 + lsr x15, x15, #32 + mov v18.s[0], w15 + add x29, x19, x2, lsr #25 + umull v26.2D, v0.2S, v23.2S + and x21, x28, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bce + ldr x0, [tmpa+8] + mov v2.s[1], w0 + lsr x0, x0, #32 + mov v3.s[1], w0 + umaddl x27, w21, w5, x3 + ldr x15, [pointx+32] + mov v24.s[0], w15 + lsr x15, x15, #32 + mov v25.s[0], w15 + add x17, x24, x29, lsr #26 + umull v29.2D, v1.2S, v18.2S + and x15, x8, #0x3ffffff // ubignum_of_xreglist 2 // INTERMEDIATE bce + umull v20.2D, v0.2S, v15.2S + add x19, x30, x17, lsr #25 + and x3, x17, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bce + mul v12.2S, v25.2S, v31.2S + ldr x0, [tmpa+16] + mov v4.s[1], w0 + lsr x0, x0, #32 + mov v5.s[1], w0 + add x4, x20, x19, lsr #26 // ubignum_of_xreglist 9 // INTERMEDIATE bce + umlal v26.2D, v2.2S, v11.2S + add w28, w3, w3, lsl #1; + umlal v20.2D, v2.2S, v23.2S + add w28, w28, w3, lsl #4 + umull x8, w12, w5 + ldr x0, [tmpa+24] + mov v6.s[1], w0 + lsr x0, x0, #32 + mov v7.s[1], w0 + and x30, x25, #0x3ffffff // ubignum_of_xreglist 4 // INTERMEDIATE bce + mul v16.2S, v18.2S, v31.2S + add w17, w4, w4, lsl #1; + umull v21.2D, v1.2S, v15.2S + add w17, w17, w4, lsl #4 + umaddl x25, w21, w7, x8 + umlal v20.2D, v4.2S, v11.2S + add w8, w21, w21, lsl #1; + ldr x0, [tmpa+32] + add w8, w8, w21, lsl #4 + mov v8.s[1], w0 + lsr x0, x0, #32 + mov v9.s[1], w0 + and x2, x2, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bce + umlal v29.2D, v3.2S, v15.2S + umaddl x24, w2, w6, x25 + umull v13.2D, v0.2S, v25.2S + umaddl x25, w2, w7, x27 + umaddl x0, w3, w6, x25 + mul v19.2S, v15.2S, v31.2S + umull v27.2D, v0.2S, v18.2S + umaddl x20, w3, w13, 
x24 + umlal v20.2D, v6.2S, v12.2S + umaddl x24, w21, w14, x1 + umlal v13.2D, v2.2S, v18.2S + umaddl x9, w4, w13, x0 + umull v25.2D, v0.2S, v11.2S + umaddl x20, w17, w23, x20 + umlal v27.2D, v2.2S, v15.2S + umaddl x0, w2, w26, x24 + umull v28.2D, v1.2S, v11.2S + umull x24, w17, w5 + umlal v29.2D, v5.2S, v23.2S + umaddl x9, w11, w22, x9 + umlal v13.2D, v4.2S, v15.2S + umaddl x27, w3, w16, x0 + umlal v27.2D, v4.2S, v23.2S + umull x0, w17, w14 + umlal v27.2D, v6.2S, v11.2S + umull x4, w12, w14 + umlal v27.2D, v8.2S, v12.2S + umaddl x25, w11, w10, x20 + umlal v27.2D, v1.2S, v17.2S + umaddl x0, w28, w10, x0 + umlal v13.2D, v6.2S, v23.2S + umull x3, w17, w6 + umlal v13.2D, v8.2S, v11.2S + umaddl x1, w21, w26, x4 + umlal v20.2D, v8.2S, v16.2S + umaddl x4, w2, w13, x24 + umlal v28.2D, v3.2S, v12.2S + umaddl x20, w28, w7, x3 + umlal v29.2D, v7.2S, v11.2S + and x3, x19, #0x3ffffff // ubignum_of_xreglist 9 // INTERMEDIATE bce + umlal v29.2D, v9.2S, v12.2S + umaddl x19, w17, w22, x27 + add w27, w2, w2, lsl #1; + mul v18.2S, v24.2S, v31.2S + add w27, w27, w2, lsl #4 + umlal v21.2D, v3.2S, v23.2S + umull x24, w17, w7 + umlal v13.2D, v1.2S, v24.2S + add x19, x19, x19 + shl v29.2D, v29.2D, #1 + umaddl x1, w2, w16, x1 + umull v15.2D, v1.2S, v23.2S + umaddl x0, w27, w22, x0 + umlal v29.2D, v0.2S, v24.2S + umaddl x2, w28, w5, x24 + mul v24.2S, v23.2S, v31.2S + umaddl x4, w28, w23, x4 + umlal v21.2D, v5.2S, v11.2S + umaddl x24, w27, w5, x20 + umlal v20.2D, v1.2S, v14.2S + umaddl x20, w11, w23, x19 + umlal v26.2D, v4.2S, v12.2S + umaddl x19, w27, w23, x2 + umlal v26.2D, v6.2S, v16.2S + umaddl x2, w21, w6, x4 + umlal v29.2D, v2.2S, v17.2S + umaddl x24, w8, w23, x24 + umlal v15.2D, v3.2S, v11.2S + umaddl x0, w21, w16, x0 + umaddl x4, w21, w13, x19 + mul v23.2S, v11.2S, v31.2S + umlal v20.2D, v3.2S, v22.2S + umaddl x2, w12, w7, x2 + umlal v20.2D, v5.2S, v10.2S + umaddl x19, w12, w26, x0 + umlal v29.2D, v4.2S, v14.2S + umaddl x0, w12, w13, x24 + umlal v26.2D, v8.2S, v19.2S + umaddl x20, w15, w5, x20 + umlal v26.2D, v1.2S, v22.2S + umaddl x21, w15, w10, x9 + umlal v26.2D, v3.2S, v10.2S + and x9, x29, #0x3ffffff // ubignum_of_xreglist 6 // INTERMEDIATE bce + umlal v29.2D, v6.2S, v22.2S + umaddl x20, w30, w7, x20 + umaddl x1, w28, w22, x1 + add x24, x19, x19 + umull v11.2D, v1.2S, v12.2S + add w19, w3, w3, lsl #1; + umlal v26.2D, v5.2S, v18.2S + add w19, w19, w3, lsl #4 + umaddl x20, w9, w6, x20 + umlal v29.2D, v8.2S, v10.2S + add w29, w9, w9, lsl #1; + umlal v13.2D, v3.2S, v17.2S + add w29, w29, w9, lsl #4 + umaddl x2, w19, w10, x2 + umlal v11.2D, v3.2S, v16.2S + umaddl x21, w30, w14, x21 + umlal v11.2D, v5.2S, v19.2S + umaddl x20, w3, w13, x20 + umlal v11.2D, v7.2S, v24.2S + umaddl x2, w29, w22, x2 + umlal v11.2D, v9.2S, v23.2S + umaddl x21, w9, w26, x21 + ushr v23.2D, v30.2D, #1 + umaddl x1, w17, w10, x1 + umlal v13.2D, v5.2S, v14.2S + umaddl x24, w19, w5, x24 + umlal v27.2D, v3.2S, v14.2S + umaddl x21, w3, w16, x21 + shl v11.2D, v11.2D, #1 + add w3, w30, w30, lsl #1; + umlal v28.2D, v5.2S, v16.2S + add w3, w3, w30, lsl #4 + umaddl x24, w29, w23, x24 + umlal v28.2D, v7.2S, v19.2S + add x1, x1, x1 + umlal v28.2D, v9.2S, v24.2S + umaddl x1, w11, w5, x1 + umlal v15.2D, v5.2S, v12.2S + umaddl x24, w30, w13, x24 + umlal v15.2D, v7.2S, v16.2S + umaddl x25, w15, w14, x25 + umlal v15.2D, v9.2S, v19.2S + umaddl x1, w15, w7, x1 + shl v28.2D, v28.2D, #1 + umaddl x24, w15, w6, x24 + umlal v21.2D, v7.2S, v12.2S + umaddl x2, w30, w16, x2 + umlal v21.2D, v9.2S, v16.2S + umaddl x25, w30, w26, x25 + shl v15.2D, v15.2D, #1 + umaddl 
x30, w30, w6, x1 + umlal v28.2D, v0.2S, v22.2S + umaddl x1, w15, w26, x2 + umlal v28.2D, v2.2S, v10.2S + umaddl x2, w9, w16, x25 + shl v21.2D, v21.2D, #1 + umaddl x24, w11, w7, x24 + umlal v15.2D, v0.2S, v14.2S + umaddl x1, w11, w14, x1 + umlal v21.2D, v0.2S, v17.2S + umaddl x25, w9, w13, x30 + umlal v28.2D, v4.2S, v18.2S + umaddl x0, w19, w26, x0 + umlal v25.2D, v2.2S, v12.2S + add x1, x1, x24, lsr #26 + umlal v25.2D, v4.2S, v16.2S + umaddl x30, w19, w22, x2 + umlal v21.2D, v2.2S, v14.2S + umaddl x4, w12, w6, x4 + mul v14.2S, v14.2S, v31.2S + umaddl x25, w19, w23, x25 + and x2, x1, #0x1ffffff + mul v16.2S, v17.2S, v31.2S + umlal v25.2D, v6.2S, v19.2S + umaddl x9, w19, w14, x4 + umlal v13.2D, v7.2S, v22.2S + add x25, x25, x1, lsr #25 + umlal v21.2D, v4.2S, v22.2S + umaddl x0, w29, w14, x0 + umlal v26.2D, v7.2S, v16.2S + add x30, x30, x25, lsr #26 + umlal v26.2D, v9.2S, v14.2S + add w1, w15, w15, lsl #1; + umlal v28.2D, v6.2S, v16.2S + add w1, w1, w15, lsl #4 + add x4, x20, x30, lsr #25 + umlal v28.2D, v8.2S, v14.2S + and x25, x25, #0x3ffffff + umlal v15.2D, v2.2S, v22.2S + add x21, x21, x4, lsr #26 + umlal v11.2D, v0.2S, v10.2S + bfi x25, x30, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE z4 + umlal v11.2D, v2.2S, v18.2S + bic x30, x21, #0x3ffffff + usra v26.2D, v28.2D, #26 + lsr x20, x30, #26 + umlal v15.2D, v4.2S, v10.2S + add x20, x20, x30, lsr #25 + umlal v15.2D, v6.2S, v18.2S + umaddl x9, w29, w10, x9 + umlal v15.2D, v8.2S, v16.2S + add x30, x20, x30, lsr #22 + umlal v27.2D, v5.2S, v22.2S + umull x20, w17, w26 + umlal v20.2D, v7.2S, v18.2S + umaddl x30, w17, w16, x30 + umlal v20.2D, v9.2S, v16.2S + umaddl x17, w3, w10, x0 + usra v15.2D, v26.2D, #25 + umaddl x0, w28, w14, x20 + umlal v27.2D, v7.2S, v10.2S + umaddl x20, w28, w26, x30 + umlal v27.2D, v9.2S, v18.2S + add w28, w12, w12, lsl #1; + usra v20.2D, v15.2D, #26 + add w28, w28, w12, lsl #4 + umaddl x30, w27, w10, x0 + and v17.16B, v15.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5 + umaddl x27, w27, w14, x20 + umaddl x0, w8, w10, x27 + mul v12.2S, v22.2S, v31.2S + and v15.16B, v20.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5 + umaddl x14, w3, w22, x9 + umlal v21.2D, v6.2S, v10.2S + umaddl x27, w8, w22, x30 + trn1 v15.4S, v17.4S, v15.4S // FINAL z3 + umaddl x10, w28, w22, x0 + umlal v11.2D, v4.2S, v16.2S + umaddl x30, w15, w16, x14 + and v26.16B, v26.16B, v23.16B + umaddl x28, w12, w16, x27 + umlal v21.2D, v8.2S, v18.2S + add x10, x10, x10 + umlal v25.2D, v8.2S, v24.2S + umaddl x20, w19, w6, x10 + umlal v25.2D, v1.2S, v10.2S + add x28, x28, x28 + umlal v25.2D, v3.2S, v18.2S + umaddl x28, w19, w7, x28 + usra v21.2D, v20.2D, #25 + umaddl x0, w29, w7, x20 + umlal v11.2D, v6.2S, v14.2S + umaddl x10, w11, w26, x30 + umlal v13.2D, v9.2S, v10.2S + umaddl x19, w29, w5, x28 + usra v27.2D, v21.2D, #26 + umaddl x0, w3, w5, x0 + umlal v25.2D, v5.2S, v16.2S + umaddl x20, w1, w22, x17 + and v20.16B, v28.16B, v30.16B + umaddl x29, w3, w23, x19 + usra v29.2D, v27.2D, #25 + umaddl x3, w1, w23, x0 + and v27.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5 + umlal v11.2D, v8.2S, v12.2S + umaddl x12, w15, w13, x29 + usra v13.2D, v29.2D, #26 + umaddl x7, w11, w13, x3 + trn1 v6.4S, v6.4S, v7.4S + umaddl x17, w11, w16, x20 + umlal v25.2D, v7.2S, v14.2S + and x23, x4, #0x3ffffff + bic v19.16B, v13.16B, v23.16B + umaddl x19, w11, w6, x12 + and v28.16B, v13.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 
// INTERMEDIATE H|L = x4|z5 + add x3, x17, x7, lsr #26 + usra v11.2D, v19.2D, #25 + trn1 v2.4S, v2.4S, v3.4S + add x17, x19, x3, lsr #25 + and v13.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5 + and x5, x7, #0x3ffffff + usra v11.2D, v19.2D, #24 + add x7, x10, x17, lsr #26 + trn1 v0.4S, v0.4S, v1.4S + and x19, x24, #0x3ffffff + and v21.16B, v29.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5 + add x29, x19, x7, lsr #25 + usra v11.2D, v19.2D, #21 + bfi x5, x3, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE z4 + trn1 v17.4S, v13.4S, v27.4S // FINAL z3 + add x19, x2, x29, lsr #26 + trn1 v19.4S, v21.4S, v28.4S // FINAL z3 + and x3, x29, #0x3ffffff + mov v16.d[0], v6.d[1] // FINAL x3 + mov v6.d[0], v17.d[1] // FINAL x2 + trn1 v8.4S, v8.4S, v9.4S + bfi x3, x19, #32, #26 // ubignum_of_preglist 2 // INTERMEDIATE z4 + and v21.16B, v11.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5 + bfi x23, x21, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE z4 + mov v18.d[0], v8.d[1] // FINAL x3 + mov v8.d[0], v19.d[1] // FINAL x2 + umlal v25.2D, v9.2S, v12.2S + mov v9.d[0], x23 // FINAL z2 + mov v7.d[0], x25 // FINAL z2 + ldr d29, [mask1] + mov v12.d[0], v2.d[1] // FINAL x3 + trn1 v4.4S, v4.4S, v5.4S + and x17, x17, #0x3ffffff + usra v25.2D, v11.2D, #26 + mov v10.d[0], v0.d[1] // FINAL x3 + mov v14.d[0], v4.d[1] // FINAL x3 + mov v4.d[0], v15.d[1] // FINAL x2 + usra v20.2D, v25.2D, #25 + and v27.16B, v25.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 + bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 + mov v5.d[0], x3 // depth 86 + mov v1.d[0], x5 // FINAL z2 + usra v26.2D, v20.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 + and v28.16B, v20.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 + trn1 v11.4S, v21.4S, v27.4S // FINAL z3 + trn1 v13.4S, v28.4S, v26.4S // FINAL z3 + mov v0.d[0], v11.d[1] // FINAL x2 + mov v3.d[0], x17 // FINAL z2 + mov v2.d[0], v13.d[1] // FINAL x2 + ldr d28, [mask2] + + ldr x0, [i] + subs x0, x0, #1 + str x0, [i] bcs curve25519_x25519_scalarloop -// Multiplex directly into (xn,zn) then do three pure doubling steps; -// this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. - - cmp swap, xzr - mux_4(xn,xm,xn) - mux_4(zn,zm,zn) - - sub_twice4(d,xn,zn) - add_twice4(s,xn,zn) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - - sub_twice4(d,xn,zn) - add_twice4(s,xn,zn) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - - sub_twice4(d,xn,zn) - add_twice4(s,xn,zn) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_p25519(zn,p,e) - -// The projective result of the scalar multiplication is now (xn,zn). -// Prepare to call the modular inverse function to get xm = 1/zn +// Repack X2 into the saturated representation as 256-bit value xn. +// This does not fully normalize mod 2^255-19 but stays within 256 bits. 
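As a reading aid for the repacking code below (and the analogous Z2 repacking that follows it), here is a hedged C sketch of the same conversion; the function name repack_to_saturated and the limb array l are illustrative only and not part of s2n-bignum. Ten unsaturated limbs in base 25.5 (even-indexed limbs nominally 26 bits, odd-indexed 25 bits, limb i weighted 2^(26*ceil(i/2) + 25*floor(i/2))) are first paired into five words spaced 51 bits apart and then folded into four 64-bit words, using the same shifts and single-bit carries as the adds/adcs chain in the assembly.

#include <stdint.h>

/* Illustrative sketch, not part of the patch: repack ten base-25.5
   limbs l[0..9] (weights 2^0, 2^26, 2^51, ..., 2^230) into four
   saturated 64-bit words z[0..3]. */
static void repack_to_saturated(const uint32_t l[10], uint64_t z[4])
{
    /* Pair adjacent limbs: p_i = l[2i] + l[2i+1]*2^26 has weight 2^(51*i). */
    uint64_t p0 = (uint64_t)l[0] + ((uint64_t)l[1] << 26);
    uint64_t p1 = (uint64_t)l[2] + ((uint64_t)l[3] << 26);
    uint64_t p2 = (uint64_t)l[4] + ((uint64_t)l[5] << 26);
    uint64_t p3 = (uint64_t)l[6] + ((uint64_t)l[7] << 26);
    uint64_t p4 = (uint64_t)l[8] + ((uint64_t)l[9] << 26);

    unsigned __int128 acc;
    acc  = (unsigned __int128)p0 + (p1 << 51);           /* low 13 bits of p1 */
    z[0] = (uint64_t)acc;
    acc  = (acc >> 64) + (p1 >> 13) + (p2 << 38);        /* low 26 bits of p2 */
    z[1] = (uint64_t)acc;
    acc  = (acc >> 64) + (p2 >> 26) + (p3 << 25);        /* low 39 bits of p3 */
    z[2] = (uint64_t)acc;
    z[3] = (uint64_t)(acc >> 64) + (p3 >> 39) + (p4 << 12);
}

For Z2, whose top limb can occupy a full 26 bits, the code additionally clears bit 25 of that limb and adds 19 times that bit into the bottom word, using 2^255 = 19 (mod 2^255-19), so that the packed value still fits in 256 bits.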
+ + mov w0, v0.s[0] + mov w1, v0.s[1] + mov w2, v2.s[0] + mov w3, v2.s[1] + mov w4, v4.s[0] + mov w5, v4.s[1] + mov w6, v6.s[0] + mov w7, v6.s[1] + mov w8, v8.s[0] + mov w9, v8.s[1] + + add x0, x0, x1, lsl #26 + add x1, x2, x3, lsl #26 + add x2, x4, x5, lsl #26 + add x3, x6, x7, lsl #26 + add x4, x8, x9, lsl #26 + + adds x0, x0, x1, lsl #51 + lsr x6, x1, #13 + lsl x7, x2, #38 + adcs x1, x6, x7 + lsr x8, x2, #26 + lsl x9, x3, #25 + adcs x2, x8, x9 + lsr x10, x3, #39 + lsl x11, x4, #12 + adc x3, x10, x11 + stp x0, x1, [xn] + stp x2, x3, [xn+16] - add x0, xm +// Repack Z2 into the saturated representation as 256-bit value zn. +// This does not fully normalize mod 2^255-19. However since Z2, +// unlike X2, was not repacked (within the last multiplication) in +// right-to-left order, its top digit can be any 26-bit value, on +// the face of it. To make sure we don't overflow 256 bits here +// we remove b = 25th bit of the 9th digit (now scaled by 2^230 +// giving bit 25 a final weighting of 2^255) and add 19 * b to +// to the bottom of the sum here to compensate mod 2^255-19. + + mov w0, v1.s[0] + mov w1, v1.s[1] + mov w2, v3.s[0] + mov w3, v3.s[1] + mov w4, v5.s[0] + mov w5, v5.s[1] + mov w6, v7.s[0] + mov w7, v7.s[1] + mov w8, v9.s[0] + mov w9, v9.s[1] + + mov w10, #19 + add x0, x0, x1, lsl #26 + tst x9, #0x2000000 + add x1, x2, x3, lsl #26 + csel x10, x10, xzr, ne + add x2, x4, x5, lsl #26 + and x9, x9, #0x1FFFFFF + add x3, x6, x7, lsl #26 + add x0, x0, x10 + add x4, x8, x9, lsl #26 + + adds x0, x0, x1, lsl #51 + lsr x6, x1, #13 + lsl x7, x2, #38 + adcs x1, x6, x7 + lsr x8, x2, #26 + lsl x9, x3, #25 + adcs x2, x8, x9 + lsr x10, x3, #39 + lsl x11, x4, #12 + adc x3, x10, x11 + stp x0, x1, [zn] + stp x2, x3, [zn+16] + +// Because the lowest bit (indeed, the three lowest bits) of the scalar +// were forced to zero, we know that the projective result of the scalar +// multiplication was in (X2,Z2) and is now (xn,zn) in saturated form. +// Prepare to call the modular inverse function to get zn' = 1/zn. + + add x0, zn add x1, zn // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -860,7 +1348,7 @@ curve25519_x25519_scalarloop: // and reclaiming room on the stack. For more details and explanations see // "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 128 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn, and zn. mov x20, x0 mov x10, #0xffffffffffffffed @@ -1891,36 +2379,210 @@ curve25519_x25519_invmidloop: stp x0, x1, [x4] stp x2, x5, [x4, #16] -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - ldp x0, x1, [zn] - ldp x2, x3, [zn+16] - orr x0, x0, x1 - orr x2, x2, x3 - orr x4, x0, x2 - cmp x4, xzr - ldp x0, x1, [xn] - csel x0, x0, xzr, ne - csel x1, x1, xzr, ne - ldp x2, x3, [xn+16] - stp x0, x1, [xn] - csel x2, x2, xzr, ne - csel x3, x3, xzr, ne - stp x2, x3, [xn+16] - // Now the result is xn * (1/zn), fully reduced modulo p. 
- - mul_p25519(resx,xn,xm) - -// Restore stack and registers - - add sp, sp, #NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. The multiplication below is just an inlined +// version of bignum_mul_p25519 except for the detailed +// addressing of inputs and outputs + + ldr x17, [res] + + ldp x3, x4, [xn] + ldp x5, x6, [zn] + umull x7, w3, w5 + lsr x0, x3, #32 + umull x15, w0, w5 + lsr x16, x5, #32 + umull x8, w16, w0 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x16, cc + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, cc + cinv x16, x16, cc + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #0x1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [xn+16] + ldp x5, x6, [zn+16] + umull x11, w3, w5 + lsr x0, x3, #32 + umull x15, w0, w5 + lsr x16, x5, #32 + umull x12, w16, w0 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x16, cc + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, cc + cinv x16, x16, cc + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + adc x14, x14, xzr + cmn x16, #0x1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [xn+16] + ldp x15, x16, [xn] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, cc + ldp x15, x0, [zn] + subs x5, x15, x5 + sbcs x6, x0, x6 + csetm x0, cc + eor x3, x3, x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x0 + subs x5, x5, x0 + eor x6, x6, x0 + sbc x6, x6, x0 + eor x16, x0, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x0, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x9, cc + adds x15, x15, x0 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, cc + cinv x9, x9, cc + mul x5, x4, x6 + umulh x6, x4, x6 + adds x0, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #0x1 + eor x5, x5, x9 + adcs x0, x5, x0 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #0x1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x0, x0, x16 + adcs x10, x0, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, x16 + adc x14, x14, x16 + mov x3, #0x26 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov 
x10, x4 + lsr x0, x14, #31 + mov x5, #0x13 + umaddl x5, w5, w0, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x0, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #0x13 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [x17] + stp x9, x10, [x17, #16] + +// Restore stack and registers (this will zero the tops of Q8...Q15). + + ldp d8, d9, [regsave+0] + ldp d10, d11, [regsave+16] + ldp d12, d13, [regsave+32] + ldp d14, d15, [regsave+48] + ldp x19, x20, [regsave+64] + ldp x21, x22, [regsave+80] + ldp x23, x24, [regsave+96] + ldp x25, x26, [regsave+112] + ldp x27, x28, [regsave+128] + ldp x29, x30, [regsave+144] + add sp, sp, #NSPACE+160 ret #if defined(__linux__) && defined(__ELF__) diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index 3e3c03371d..e6c891284d 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -1,6 +1,18 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +// ********************************************************************** +// This code is substantially derived from Emil Lenngren's implementation +// +// https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf +// https://github.com/Emill/X25519-AArch64 +// +// and the SLOTHY-based re-engineering of that code by Hanno Becker: +// +// https://eprint.iacr.org/2022/1303.pdf +// https://github.com/slothy-optimizer/slothy/tree/main/paper +// ********************************************************************** + // ---------------------------------------------------------------------------- // The x25519 function for curve25519 (byte array arguments) // Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes) @@ -26,671 +38,53 @@ .text .balign 4 -// Size of individual field elements - -#define NUMSIZE 32 - -// Stable homes for the input result argument during the whole body -// and other variables that are only needed prior to the modular inverse. - -#define res x23 -#define i x20 -#define swap x21 - -// Pointers to result x coord to be written - -#define resx res, #0 - -// Pointer-offset pairs for temporaries on stack with some aliasing. - -#define scalar sp, #(0*NUMSIZE) - -#define pointx sp, #(1*NUMSIZE) +// Pointer-offset pairs for temporaries on stack -#define zm sp, #(2*NUMSIZE) -#define sm sp, #(2*NUMSIZE) -#define dpro sp, #(2*NUMSIZE) +#define scalar sp, #0 +#define pointx sp, #32 +#define mask1 sp, #72 +#define mask2 sp, #80 +#define tmpa sp, #88 +#define tmpb sp, #128 +#define xn sp, #128 +#define zn sp, #160 -#define sn sp, #(3*NUMSIZE) - -#define dm sp, #(4*NUMSIZE) - -#define zn sp, #(5*NUMSIZE) -#define dn sp, #(5*NUMSIZE) -#define e sp, #(5*NUMSIZE) - -#define dmsn sp, #(6*NUMSIZE) -#define p sp, #(6*NUMSIZE) - -#define xm sp, #(7*NUMSIZE) -#define dnsm sp, #(7*NUMSIZE) -#define spro sp, #(7*NUMSIZE) - -#define d sp, #(8*NUMSIZE) - -#define xn sp, #(9*NUMSIZE) -#define s sp, #(9*NUMSIZE) +#define res sp, #192 +#define i sp, #200 +#define swap sp, #208 // Total size to reserve on the stack -#define NSPACE (10*NUMSIZE) - -// Macro wrapping up the basic field operation bignum_mul_p25519, only -// trivially different from a pure function call to that subroutine. 
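For orientation, both the mul_p25519 macro defined (and now removed) below and the inlined multiplication added to the scalar code above reduce a double-length product the same way. Writing the 512-bit product as h * 2^256 + l, the key congruences, in the plain-text notation the comments already use, are:

    p_25519 = 2^255 - 19
    2^256 = 2 * p_25519 + 38,  so  2^256 = 38 (mod p_25519)
    2^255 =     p_25519 + 19,  so  2^255 = 19 (mod p_25519)
    hence  h * 2^256 + l = l + 38 * h (mod p_25519)

In outline, this is why the reduction multiplies the high half by 38 (the mov x3, #0x26 above), folds the remaining bit at weight 2^255 back in via 19 (mov x5, #0x13), and finishes by conditionally subtracting 19 and masking off bit 255, which together amount to subtracting p_25519 when the top bit is set.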
- -#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, 
x14, #31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ - stp x9, x10, [P0+16] - -// A version of multiplication that only guarantees output < 2 * p_25519. -// This basically skips the +1 and final correction in quotient estimation. - -#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - 
adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; \ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ - stp x9, x10, [P0+16] - -// Squaring just giving a result < 2 * p_25519, which is done by -// basically skipping the +1 in the quotient estimate and the final -// optional correction. - -#define sqr_4(P0,P1) \ - ldp x10, x11, [P1]; \ - ldp x12, x13, [P1+16]; \ - umull x2, w10, w10; \ - lsr x14, x10, #32; \ - umull x3, w14, w14; \ - umull x14, w10, w14; \ - adds x2, x2, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x3, x3, x14; \ - umull x4, w11, w11; \ - lsr x14, x11, #32; \ - umull x5, w14, w14; \ - umull x14, w11, w14; \ - mul x15, x10, x11; \ - umulh x16, x10, x11; \ - adds x4, x4, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x5, x5, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x5, x5, xzr; \ - adds x3, x3, x15; \ - adcs x4, x4, x16; \ - adc x5, x5, xzr; \ - umull x6, w12, w12; \ - lsr x14, x12, #32; \ - umull x7, w14, w14; \ - umull x14, w12, w14; \ - adds x6, x6, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x7, x7, x14; \ - umull x8, w13, w13; \ - lsr x14, x13, #32; \ - umull x9, w14, w14; \ - umull x14, w13, w14; \ - mul x15, x12, x13; \ - umulh x16, x12, x13; \ - adds x8, x8, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x9, x9, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x9, x9, xzr; \ - adds x7, x7, x15; \ - adcs x8, x8, x16; \ - adc x9, x9, xzr; \ - subs x10, x10, x12; \ - sbcs x11, x11, x13; \ - csetm x16, cc; \ - eor x10, x10, x16; \ - subs x10, x10, x16; \ - eor x11, x11, x16; \ - sbc x11, x11, x16; \ - adds x6, x6, x4; \ - adcs x7, x7, x5; \ - adcs x8, x8, xzr; \ - adc x9, x9, xzr; \ - umull x12, w10, w10; \ - lsr x5, x10, #32; \ - umull x13, w5, w5; \ - umull x5, w10, w5; \ - adds x12, x12, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x13, x13, x5; \ - umull x15, w11, w11; \ - lsr x5, x11, #32; \ - umull x14, w5, w5; \ - umull x5, w11, w5; \ - mul x4, x10, x11; \ - umulh x16, x10, x11; \ - adds x15, x15, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x14, x14, x5; \ - adds x4, x4, x4; \ - adcs x16, x16, x16; \ - adc x14, x14, xzr; \ - adds x13, x13, x4; \ - adcs x15, x15, x16; \ - adc x14, x14, xzr; \ - adds x4, x2, x6; \ - adcs x5, x3, x7; \ - adcs x6, x6, x8; \ - adcs x7, x7, x9; \ - csetm x16, cc; \ - subs x4, x4, x12; \ - sbcs x5, x5, x13; \ - sbcs x6, x6, x15; \ - sbcs x7, x7, x14; \ - adcs x8, x8, x16; \ - adc x9, x9, x16; \ - mov x10, #0x26; \ - umull x12, w6, w10; \ - add x12, x12, w2, uxtw; \ - lsr x2, x2, #32; \ - lsr x6, x6, #32; \ - umaddl x6, w6, w10, x2; \ - mov x2, x12; \ - umull x12, w7, w10; \ - add x12, x12, w3, uxtw; \ - lsr x3, x3, 
#32; \ - lsr x7, x7, #32; \ - umaddl x7, w7, w10, x3; \ - mov x3, x12; \ - umull x12, w8, w10; \ - add x12, x12, w4, uxtw; \ - lsr x4, x4, #32; \ - lsr x8, x8, #32; \ - umaddl x8, w8, w10, x4; \ - mov x4, x12; \ - umull x12, w9, w10; \ - add x12, x12, w5, uxtw; \ - lsr x5, x5, #32; \ - lsr x9, x9, #32; \ - umaddl x9, w9, w10, x5; \ - mov x5, x12; \ - lsr x13, x9, #31; \ - mov x11, #0x13; \ - umull x11, w11, w13; \ - add x2, x2, x11; \ - adds x2, x2, x6, lsl #32; \ - extr x10, x7, x6, #32; \ - adcs x3, x3, x10; \ - extr x10, x8, x7, #32; \ - adcs x4, x4, x10; \ - extr x10, x9, x8, #32; \ - lsl x11, x13, #63; \ - eor x5, x5, x11; \ - adc x5, x5, x10; \ - stp x2, x3, [P0]; \ - stp x4, x5, [P0+16] - -// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. -// This only ensures that the result fits in 4 digits, not that it is reduced -// even w.r.t. double modulus. The result is always correct modulo provided -// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided -// at least one of them is reduced double modulo. - -#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16] - -// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 - -#define sub_twice4(p0,p1,p2) \ - ldp x5, x6, [p1]; \ - ldp x4, x3, [p2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [p1+16]; \ - ldp x4, x3, [p2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [p0]; \ - stp x7, x8, [p0+16] - -// Combined z = c * x + y with reduction only < 2 * p_25519 -// where c is initially in the X1 register. It is assumed -// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a -// high mul in the final part. - -#define cmadd_4(p0,p2,p3) \ - ldp x7, x8, [p2]; \ - ldp x9, x10, [p2+16]; \ - mul x3, x1, x7; \ - mul x4, x1, x8; \ - mul x5, x1, x9; \ - mul x6, x1, x10; \ - umulh x7, x1, x7; \ - umulh x8, x1, x8; \ - umulh x9, x1, x9; \ - umulh x10, x1, x10; \ - adds x4, x4, x7; \ - adcs x5, x5, x8; \ - adcs x6, x6, x9; \ - adc x10, x10, xzr; \ - ldp x7, x8, [p3]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x7, x8, [p3+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - adc x10, x10, xzr; \ - cmn x6, x6; \ - bic x6, x6, #0x8000000000000000; \ - adc x8, x10, x10; \ - mov x9, #19; \ - mul x7, x8, x9; \ - adds x3, x3, x7; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [p0]; \ - stp x5, x6, [p0+16] - -// Multiplex: z := if NZ then x else y - -#define mux_4(p0,p1,p2) \ - ldp x0, x1, [p1]; \ - ldp x2, x3, [p2]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0]; \ - ldp x0, x1, [p1+16]; \ - ldp x2, x3, [p2+16]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0+16] +#define NSPACE 224 +#define regsave sp, #NSPACE S2N_BN_SYMBOL(curve25519_x25519_byte): -// Save regs and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - sub sp, sp, #NSPACE +// Save registers and make additional room NSPACE for temporaries. 
+// We only need to save the low 64-bits of the Q8...Q15 registers +// according to the ABI, so we use a save of the D8...D15 forms. + + sub sp, sp, #NSPACE+160 + stp d8, d9, [regsave+0] + stp d10, d11, [regsave+16] + stp d12, d13, [regsave+32] + stp d14, d15, [regsave+48] + stp x19, x20, [regsave+64] + stp x21, x22, [regsave+80] + stp x23, x24, [regsave+96] + stp x25, x26, [regsave+112] + stp x27, x28, [regsave+128] + stp x29, x30, [regsave+144] // Move the output pointer to a stable place - mov res, x0 + str x0, [res] -// Copy the inputs to the local variables with minimal mangling: -// -// - The scalar is in principle turned into 01xxx...xxx000 but -// in the structure below the special handling of these bits is -// explicit in the main computation; the scalar is just copied. -// -// - The point x coord is reduced mod 2^255 by masking off the -// top bit. In the main loop we only need reduction < 2 * p_25519. +// Copy the scalar to the corresponding local variable while +// mangling it. In principle it becomes 01xxx...xxx000 where +// the xxx are the corresponding bits of the original input +// scalar. We actually don't bother forcing the MSB to zero, +// but rather start the main loop below at 254 instead of 255. ldrb w10, [x1] ldrb w0, [x1, #1] @@ -722,6 +116,7 @@ S2N_BN_SYMBOL(curve25519_x25519_byte): orr x11, x11, x0, lsl #48 ldrb w0, [x1, #15] orr x11, x11, x0, lsl #56 + bic x10, x10, #7 stp x10, x11, [scalar] ldrb w12, [x1, #16] @@ -754,223 +149,1316 @@ S2N_BN_SYMBOL(curve25519_x25519_byte): orr x13, x13, x0, lsl #48 ldrb w0, [x1, #31] orr x13, x13, x0, lsl #56 + orr x13, x13, #0x4000000000000000 stp x12, x13, [scalar+16] - ldrb w10, [x2] +// Discard the MSB of the point X coordinate (this is in +// accordance with the RFC, mod 2^255, *not* 2^255-19). +// Then recode it into the unsaturated base 25.5 form. 
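Before the point X coordinate is loaded below, one remark on the scalar mangling just performed: it matches the usual RFC 7748 "clamping" of the scalar except that bit 255 is left untouched, since the ladder below simply starts at bit 254. A hedged byte-level C sketch, with an illustrative function name, is:

#include <stdint.h>

/* Illustrative sketch, not part of the patch: RFC 7748 clamping of a
   32-byte little-endian scalar k. */
static void clamp_scalar25519(uint8_t k[32])
{
    k[0]  &= 0xf8;   /* clear the three lowest bits       */
    k[31] &= 0x7f;   /* clear bit 255 (the asm skips this) */
    k[31] |= 0x40;   /* set bit 254                        */
}

The bic x10, x10, #7 above corresponds to k[0] &= 0xf8, and the orr x13, x13, #0x4000000000000000 to k[31] |= 0x40; the recoding of the X coordinate that follows is the inverse direction of the base-25.5 repacking sketched earlier.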
+ + ldrb w4, [x2] ldrb w0, [x2, #1] - orr x10, x10, x0, lsl #8 + orr x4, x4, x0, lsl #8 ldrb w0, [x2, #2] - orr x10, x10, x0, lsl #16 + orr x4, x4, x0, lsl #16 ldrb w0, [x2, #3] - orr x10, x10, x0, lsl #24 + orr x4, x4, x0, lsl #24 ldrb w0, [x2, #4] - orr x10, x10, x0, lsl #32 + orr x4, x4, x0, lsl #32 ldrb w0, [x2, #5] - orr x10, x10, x0, lsl #40 + orr x4, x4, x0, lsl #40 ldrb w0, [x2, #6] - orr x10, x10, x0, lsl #48 + orr x4, x4, x0, lsl #48 ldrb w0, [x2, #7] - orr x10, x10, x0, lsl #56 - ldrb w11, [x2, #8] + orr x4, x4, x0, lsl #56 + ldrb w5, [x2, #8] ldrb w0, [x2, #9] - orr x11, x11, x0, lsl #8 + orr x5, x5, x0, lsl #8 ldrb w0, [x2, #10] - orr x11, x11, x0, lsl #16 + orr x5, x5, x0, lsl #16 ldrb w0, [x2, #11] - orr x11, x11, x0, lsl #24 + orr x5, x5, x0, lsl #24 ldrb w0, [x2, #12] - orr x11, x11, x0, lsl #32 + orr x5, x5, x0, lsl #32 ldrb w0, [x2, #13] - orr x11, x11, x0, lsl #40 + orr x5, x5, x0, lsl #40 ldrb w0, [x2, #14] - orr x11, x11, x0, lsl #48 + orr x5, x5, x0, lsl #48 ldrb w0, [x2, #15] - orr x11, x11, x0, lsl #56 - stp x10, x11, [pointx] + orr x5, x5, x0, lsl #56 - ldrb w12, [x2, #16] + ldrb w6, [x2, #16] ldrb w0, [x2, #17] - orr x12, x12, x0, lsl #8 + orr x6, x6, x0, lsl #8 ldrb w0, [x2, #18] - orr x12, x12, x0, lsl #16 + orr x6, x6, x0, lsl #16 ldrb w0, [x2, #19] - orr x12, x12, x0, lsl #24 + orr x6, x6, x0, lsl #24 ldrb w0, [x2, #20] - orr x12, x12, x0, lsl #32 + orr x6, x6, x0, lsl #32 ldrb w0, [x2, #21] - orr x12, x12, x0, lsl #40 + orr x6, x6, x0, lsl #40 ldrb w0, [x2, #22] - orr x12, x12, x0, lsl #48 + orr x6, x6, x0, lsl #48 ldrb w0, [x2, #23] - orr x12, x12, x0, lsl #56 - ldrb w13, [x2, #24] + orr x6, x6, x0, lsl #56 + ldrb w7, [x2, #24] ldrb w0, [x2, #25] - orr x13, x13, x0, lsl #8 + orr x7, x7, x0, lsl #8 ldrb w0, [x2, #26] - orr x13, x13, x0, lsl #16 + orr x7, x7, x0, lsl #16 ldrb w0, [x2, #27] - orr x13, x13, x0, lsl #24 + orr x7, x7, x0, lsl #24 ldrb w0, [x2, #28] - orr x13, x13, x0, lsl #32 + orr x7, x7, x0, lsl #32 ldrb w0, [x2, #29] - orr x13, x13, x0, lsl #40 + orr x7, x7, x0, lsl #40 ldrb w0, [x2, #30] - orr x13, x13, x0, lsl #48 + orr x7, x7, x0, lsl #48 ldrb w0, [x2, #31] - orr x13, x13, x0, lsl #56 - and x13, x13, #0x7fffffffffffffff + orr x7, x7, x0, lsl #56 + + lsr x12, x4, #51 + lsr x17, x6, #51 + orr x12, x12, x5, lsl #13 + orr x17, x17, x7, lsl #13 + ubfx x8, x7, #12, #26 + ubfx x9, x7, #38, #25 + ubfx x11, x4, #26, #25 + ubfx x13, x5, #13, #25 + lsr x14, x5, #38 + ubfx x16, x6, #25, #26 + and x10, x4, #0x3ffffff + and x12, x12, #0x3ffffff + and x15, x6, #0x1ffffff + and x17, x17, #0x1ffffff + orr x10, x10, x11, lsl #32 + orr x11, x12, x13, lsl #32 + orr x12, x14, x15, lsl #32 + orr x13, x16, x17, lsl #32 + orr x14, x8, x9, lsl #32 + + stp x10, x11, [pointx+0] stp x12, x13, [pointx+16] + str x14, [pointx+32] + +// Initialize (X2,Z2) = (1,0), the identity (projective point at infinity) + + mov x1, #1 + mov v0.d[0], x1 + mov v2.d[0], xzr + mov v4.d[0], xzr + mov v6.d[0], xzr + mov v8.d[0], xzr + + mov v1.d[0], xzr + mov v3.d[0], xzr + mov v5.d[0], xzr + mov v7.d[0], xzr + mov v9.d[0], xzr + +// Initialize (X3,Z3) = (X,1), projective representation of X + + mov v10.d[0], x10 + mov v12.d[0], x11 + mov v14.d[0], x12 + mov v16.d[0], x13 + mov v18.d[0], x14 + + mov v11.d[0], x1 + mov v13.d[0], xzr + mov v15.d[0], xzr + mov v17.d[0], xzr + mov v19.d[0], xzr + +// Set up some constants used repeatedly in the main loop: +// +// Q31 = 0x1300000013 (two 32-bit copies of 19) +// Q30 = 0x3ffffff0000000003ffffff (two 64-bit copies of 2^26-1) +// Q29 = mask1 = 
(0x07ffffc,0x07fffffe) +// Q28 = mask2 = (0x07ffffb4,0x07fffffe) -// Initialize with explicit doubling in order to handle set bit 254. -// Set swap = 1 and (xm,zm) = (x,1) then double as (xn,zn) = 2 * (x,1). -// We use the fact that the point x coordinate is still in registers. -// Since zm = 1 we could do the doubling with an operation count of -// 2 * S + M instead of 2 * S + 2 * M, but it doesn't seem worth -// the slight complication arising from a different linear combination. - - mov swap, #1 - stp x10, x11, [xm] - stp x12, x13, [xm+16] - stp swap, xzr, [zm] - stp xzr, xzr, [zm+16] - - sub_twice4(d,xm,zm) - add_twice4(s,xm,zm) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - -// The main loop over unmodified bits from i = 253, ..., i = 3 (inclusive). -// This is a classic Montgomery ladder, with the main coordinates only -// reduced mod 2 * p_25519, some intermediate results even more loosely. - - mov i, #253 - -curve25519_x25519_byte_scalarloop: - -// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn - - sub_twice4(dm,xm,zm) - add_twice4(sn,xn,zn) - sub_twice4(dn,xn,zn) - add_twice4(sm,xm,zm) - -// ADDING: dmsn = dm * sn -// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) - - mul_4(dmsn,sn,dm) - - lsr x0, i, #6 - ldr x2, [sp, x0, lsl #3] // Exploiting scalar = sp exactly - lsr x2, x2, i - and x2, x2, #1 - - cmp swap, x2 - mov swap, x2 - - mux_4(d,dm,dn) - mux_4(s,sm,sn) - -// ADDING: dnsm = sm * dn - - mul_4(dnsm,sm,dn) - -// DOUBLING: d = (xt - zt)^2 - - sqr_4(d,d) - -// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 -// DOUBLING: s = (xt + zt)^2 - - sub_twice4(dpro,dmsn,dnsm) - sqr_4(s,s) - add_twice4(spro,dmsn,dnsm) - sqr_4(dpro,dpro) - -// DOUBLING: p = 4 * xt * zt = s - d - - sub_twice4(p,s,d) - -// ADDING: xm' = (dmsn + dnsm)^2 - - sqr_4(xm,spro) - -// DOUBLING: e = 121666 * p + d - - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) + mov w0, #19 + add x0, x0, x0, lsl #32 + mov v31.d[0], x0 + mov v31.d[1], xzr -// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + mov x0, #(1<<26)-1 + mov v30.d[0], x0 + mov v30.d[1], x0 - mul_4(xn,s,d) + mov x0, #0x07fffffe07fffffe + sub x1, x0, #0xfe-0xb4 + sub x0, x0, #2 -// ADDING: zm' = x * (dmsn - dnsm)^2 + stp x0, x1, [mask1] + ldp d29, d28, [mask1] - mul_4(zm,dpro,pointx) +// The main loop over (modified) bits from i = 254, ..., i = 0 (inclusive); +// we explicitly skip bit 255 because it should be forced to zero initially. +// This is a classic Montgomery ladder using a "swap" variable. +// It's assumed x0 = i at the start of the loop, but that is volatile and +// needs to be reloaded from memory at the end of the loop. -// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) -// = p * (d + 121666 * p) + str xzr, [swap] + mov x0, #254 + str x0, [i] - mul_4(zn,p,e) +curve25519_x25519_byte_scalarloop: -// Loop down as far as 3 (inclusive) + lsr x1, x0, #6 + ldr x2, [sp, x1, lsl #3] // Exploiting scalar = sp exactly + lsr x2, x2, x0 + and x2, x2, #1 - sub i, i, #1 - cmp i, #3 + ldr x0, [swap] + cmp x0, x2 + str x2, [swap] + +// The following inner loop code is derived closely following Lenngren's +// implementation available at "https://github.com/Emill/X25519-AArch64". +// In particular, the basic dataflow and the organization between integer +// and SIMD units is identical, with only a few minor changes to some +// individual instructions (for miscellaneous reasons). 
The scheduling +// was redone from scratch by SLOTHY starting from Hanno Becker's +// un-interleaved form and using the same scripts as in Becker et al's +// paper. +// +// The intermediate value annotations were added to provide data that +// is used in the formal proof, indicating which lines assign specific +// digits of the various intermediate results (mainly of field +// operations, sometimes other transformations). The names used for +// the intermediate results are similar but not identical to those in +// the abstract Algorithm 1 description in Lenngren's paper. Almost +// all equations are to be interpreted as field operations, i.e. as +// arithmetic modulo 2^255-19, not simple numeric equalities. +// +// b = x2 - z2 +// d = x3 - z3 +// a = x2 + z2 +// c = x3 + z3 +// f = if flip then c else a +// g = if flip then d else b +// aa = f^2 +// bb = g^2 +// bbalt = bb (change of representation) +// e = aa - bb +// bce = bbalt + 121666 * e +// z4 = bce * e +// bc = b * c +// ad = a * d +// t1 = ad + bc +// t2 = ad - bc +// x5 = t1^2 +// t3 = t2^2 +// x4 = aa * bb +// z5 = x * t3 +// +// Then the main variables are updated for the next iteration as +// +// (x2',z2') = (x4,z4) +// (x3',z3') = (x5,z5) + + add v22.2S, v2.2S, v3.2S // ubignum_of_qreglist 1 // INTERMEDIATE a + sub v21.2S, v28.2S, v1.2S + add v25.2S, v0.2S, v1.2S // ubignum_of_qreglist 0 // INTERMEDIATE a + sub v24.2S, v29.2S, v3.2S + add v3.2S, v18.2S, v19.2S // ubignum_of_qreglist 4 // INTERMEDIATE c + add v0.2S, v0.2S, v21.2S // ubignum_of_qreglist 0 // INTERMEDIATE b + sub v20.2S, v29.2S, v15.2S + sub v1.2S, v29.2S, v5.2S + sub v26.2S, v28.2S, v11.2S + sub v21.2S, v29.2S, v19.2S + add v19.2S, v10.2S, v11.2S // ubignum_of_qreglist 0 // INTERMEDIATE c + add v11.2S, v14.2S, v20.2S // ubignum_of_qreglist 2 // INTERMEDIATE d + add v21.2S, v18.2S, v21.2S // ubignum_of_qreglist 4 // INTERMEDIATE d + sub v20.2S, v29.2S, v17.2S + add v18.2S, v2.2S, v24.2S // ubignum_of_qreglist 1 // INTERMEDIATE b + add v14.2S, v14.2S, v15.2S // ubignum_of_qreglist 2 // INTERMEDIATE c + add v15.2S, v16.2S, v17.2S // ubignum_of_qreglist 3 // INTERMEDIATE c + add v2.2S, v16.2S, v20.2S // ubignum_of_qreglist 3 // INTERMEDIATE d + add v24.2S, v12.2S, v13.2S // ubignum_of_qreglist 1 // INTERMEDIATE c + add v26.2S, v10.2S, v26.2S // ubignum_of_qreglist 0 // INTERMEDIATE d + sub v10.2S, v29.2S, v13.2S + sub v13.2S, v29.2S, v7.2S + add v23.2S, v6.2S, v7.2S // ubignum_of_qreglist 3 // INTERMEDIATE a + sub v7.2S, v29.2S, v9.2S + add v27.2S, v12.2S, v10.2S // ubignum_of_qreglist 1 // INTERMEDIATE d + fcsel d20, d22, d24, eq // ubignum_of_qreglist 1 // INTERMEDIATE f + add v28.2S, v4.2S, v5.2S // ubignum_of_qreglist 2 // INTERMEDIATE a + fcsel d12, d23, d15, eq // ubignum_of_qreglist 3 // INTERMEDIATE f + add v7.2S, v8.2S, v7.2S // ubignum_of_qreglist 4 // INTERMEDIATE b + fcsel d16, d25, d19, eq // ubignum_of_qreglist 0 // INTERMEDIATE f + mov x0, v20.d[0] + fcsel d5, d28, d14, eq // ubignum_of_qreglist 2 // INTERMEDIATE f + mov x21, v12.d[0] + fcsel d29, d7, d21, eq // ubignum_of_qreglist 4 // INTERMEDIATE g + mov x5, v16.d[0] + lsr x26, x0, #32 + add x29, x21, x21 + umull x15, w5, w29 + add v13.2S, v6.2S, v13.2S // ubignum_of_qreglist 3 // INTERMEDIATE b + add x12, x26, x26 + mov x30, v5.d[0] + fcsel d10, d18, d27, eq // ubignum_of_qreglist 1 // INTERMEDIATE g + lsr x11, x5, #32 + lsr x10, x30, #32 + trn2 v20.2S, v21.2S, v3.2S + add v9.2S, v8.2S, v9.2S // ubignum_of_qreglist 4 // INTERMEDIATE a + add x14, x11, x11 + trn2 v6.2S, v2.2S, v15.2S + trn1 v12.2S, 
v25.2S, v0.2S + add v1.2S, v4.2S, v1.2S // ubignum_of_qreglist 2 // INTERMEDIATE b + trn1 v16.2S, v23.2S, v13.2S + fcsel d8, d13, d2, eq // ubignum_of_qreglist 3 // INTERMEDIATE g + trn2 v17.2S, v27.2S, v24.2S + str d29, [tmpb+32] + add x17, x10, x10 + trn2 v4.2S, v28.2S, v1.2S + trn1 v5.2S, v28.2S, v1.2S + trn1 v28.2S, v2.2S, v15.2S + trn1 v2.2S, v22.2S, v18.2S + fcsel d29, d0, d26, eq // ubignum_of_qreglist 0 // INTERMEDIATE g + trn2 v15.2S, v22.2S, v18.2S + umull v22.2D, v12.2S, v20.2S + umull x22, w30, w17 + stp d29, d10, [tmpb+0] + trn2 v10.2S, v23.2S, v13.2S + trn2 v23.2S, v11.2S, v14.2S + trn1 v13.2S, v27.2S, v24.2S + fcsel d27, d1, d11, eq // ubignum_of_qreglist 2 // INTERMEDIATE g + trn1 v14.2S, v11.2S, v14.2S + umlal v22.2D, v2.2S, v6.2S + umull x25, w30, w30 + umlal v22.2D, v5.2S, v23.2S + add x3, x30, x30 + umlal v22.2D, v16.2S, v17.2S + add w30, w21, w21, lsl #1; + stp d27, d8, [tmpb+16] + add w30, w30, w21, lsl #4 + trn1 v11.2S, v26.2S, v19.2S + trn2 v8.2S, v26.2S, v19.2S + trn2 v19.2S, v25.2S, v0.2S + mul v29.2S, v20.2S, v31.2S + ldr x20, [tmpb+24] + umull v25.2D, v19.2S, v6.2S + add x1, x0, x0 + umull v27.2D, v19.2S, v23.2S + umull x9, w5, w1 + umull v0.2D, v12.2S, v23.2S + lsr x24, x20, #32 + mul v20.2S, v23.2S, v31.2S + lsr x16, x21, #32 + umlal v25.2D, v15.2S, v23.2S + umaddl x13, w11, w14, x9 + umlal v25.2D, v4.2S, v17.2S + umaddl x9, w14, w17, x15 + umull v24.2D, v12.2S, v6.2S + add w2, w16, w16, lsl #1; + fcsel d26, d9, d3, eq // ubignum_of_qreglist 4 // INTERMEDIATE f + add w2, w2, w16, lsl #4 + trn1 v18.2S, v21.2S, v3.2S + umull v3.2D, v19.2S, v29.2S + umull x28, w5, w3 + mul v1.2S, v6.2S, v31.2S + umull x8, w5, w5 + umlal v24.2D, v2.2S, v23.2S + umaddl x13, w21, w30, x13 + mul v23.2S, v17.2S, v31.2S + umaddl x27, w14, w12, x28 + trn2 v6.2S, v9.2S, v7.2S + mov x6, v26.d[0] + umlal v3.2D, v15.2S, v1.2S + add x16, x16, x16 + umlal v3.2D, v4.2S, v20.2S + lsr x4, x6, #32 + umlal v3.2D, v10.2S, v23.2S + add x7, x6, x6 + umull v26.2D, v19.2S, v8.2S + add x23, x4, x4 + umaddl x28, w5, w23, x22 + trn1 v7.2S, v9.2S, v7.2S + umlal v27.2D, v15.2S, v17.2S + add w15, w4, w4, lsl #1; + umlal v27.2D, v4.2S, v8.2S + add w15, w15, w4, lsl #4 + add w22, w10, w10, lsl #1; + umlal v24.2D, v5.2S, v17.2S + add w22, w22, w10, lsl #4 + umaddl x10, w11, w7, x28 + umlal v25.2D, v10.2S, v8.2S + umull x21, w5, w16 + umlal v25.2D, v6.2S, v29.2S + umaddl x23, w15, w23, x25 + umlal v27.2D, v10.2S, v29.2S + umull x19, w5, w12 + umlal v27.2D, v6.2S, v1.2S + umaddl x25, w11, w29, x21 + umlal v0.2D, v2.2S, v17.2S + umaddl x28, w0, w3, x9 + shl v21.2D, v25.2D, #1 + umaddl x4, w11, w1, x19 + umaddl x21, w2, w29, x4 + mul v25.2S, v8.2S, v31.2S + umlal v24.2D, v16.2S, v8.2S + umaddl x19, w0, w17, x25 + umlal v24.2D, v7.2S, v29.2S + umull x25, w5, w17 + umlal v24.2D, v19.2S, v28.2S + umaddl x4, w0, w16, x10 + umull v9.2D, v12.2S, v8.2S + umaddl x23, w5, w7, x23 + umlal v21.2D, v12.2S, v18.2S + add w10, w6, w6, lsl #1; + shl v27.2D, v27.2D, #1 + add w10, w10, w6, lsl #4 + umaddl x28, w26, w12, x28 + umlal v26.2D, v15.2S, v29.2S + umaddl x9, w14, w16, x23 + umlal v9.2D, v2.2S, v29.2S + umaddl x22, w22, w17, x8 + umlal v21.2D, v2.2S, v28.2S + umaddl x28, w6, w10, x28 + umaddl x27, w0, w0, x27 + add x8, x14, x14 + umlal v0.2D, v5.2S, v8.2S + umull x5, w5, w14 + umlal v9.2D, v5.2S, v1.2S + umaddl x14, w0, w29, x9 + umlal v26.2D, v4.2S, v1.2S + umaddl x6, w2, w16, x27 + umlal v22.2D, v7.2S, v8.2S + umaddl x5, w30, w17, x5 + umaddl x5, w2, w3, x5 + add x23, x17, x17 + umlal v27.2D, v12.2S, v28.2S + umaddl x13, 
w2, w23, x13 + umlal v26.2D, v10.2S, v20.2S + add x9, x12, x12 + umlal v9.2D, v16.2S, v20.2S + umaddl x27, w10, w29, x6 + umlal v0.2D, v16.2S, v29.2S + umaddl x6, w11, w3, x25 + umlal v22.2D, v19.2S, v18.2S + umaddl x19, w26, w3, x19 + mul v18.2S, v18.2S, v31.2S + umaddl x23, w15, w23, x27 + umlal v3.2D, v6.2S, v25.2S + umaddl x0, w0, w12, x6 + umlal v0.2D, v7.2S, v1.2S + add x11, x16, x16 + umlal v9.2D, v7.2S, v23.2S + umaddl x6, w12, w17, x14 + umlal v9.2D, v19.2S, v11.2S + umaddl x25, w26, w29, x4 + umlal v9.2D, v15.2S, v18.2S + umaddl x14, w10, w3, x13 + umull v25.2D, v12.2S, v17.2S + umaddl x27, w10, w16, x0 + umlal v26.2D, v6.2S, v23.2S + add x0, x25, x6, lsr #26 + mul v23.2S, v28.2S, v31.2S + umaddl x12, w10, w12, x5 + shl v3.2D, v3.2D, #1 + add x16, x22, x0, lsr #25 + umlal v21.2D, v5.2S, v14.2S + bic x22, x0, #0x1ffffff + umlal v3.2D, v12.2S, v11.2S + add x26, x16, x22, lsr #24 + umlal v3.2D, v2.2S, v18.2S + umaddl x16, w10, w17, x21 + umlal v3.2D, v5.2S, v23.2S + add x22, x26, x22, lsr #21 + umlal v9.2D, v4.2S, v23.2S + umaddl x5, w15, w29, x27 + umull v17.2D, v19.2S, v17.2S + umaddl x17, w30, w3, x22 + umlal v25.2D, v2.2S, v8.2S + umaddl x25, w15, w3, x16 + umlal v25.2D, v5.2S, v29.2S + umaddl x26, w15, w7, x19 + umlal v0.2D, v19.2S, v14.2S + umaddl x17, w2, w9, x17 + umlal v17.2D, v15.2S, v8.2S + ldr x19, [tmpb+0] + umlal v17.2D, v4.2S, v29.2S + ldr x7, [tmpb+8] + shl v29.2D, v26.2D, #1 + umaddl x13, w10, w1, x17 + umlal v0.2D, v15.2S, v13.2S + lsr x2, x19, #32 + umlal v29.2D, v12.2S, v13.2S + umaddl x27, w15, w1, x12 + umlal v29.2D, v2.2S, v11.2S + umaddl x30, w15, w8, x13 + umlal v29.2D, v5.2S, v18.2S + add x4, x7, x7 + umlal v29.2D, v16.2S, v23.2S + umaddl x29, w15, w9, x14 + umlal v0.2D, v4.2S, v11.2S + add x17, x27, x30, lsr #26 + umlal v0.2D, v10.2S, v18.2S + umaddl x16, w15, w11, x28 + umlal v0.2D, v6.2S, v23.2S + add x1, x29, x17, lsr #25 + umlal v25.2D, v16.2S, v1.2S + umull x11, w19, w4 + ldr x8, [tmpb+32] + mul v26.2S, v14.2S, v31.2S + umlal v17.2D, v10.2S, v1.2S + ldr x15, [tmpb+16] + umlal v17.2D, v6.2S, v20.2S + and x9, x30, #0x3ffffff + bfi x9, x17, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE aa + add x17, x2, x2 + lsr x10, x15, #32 + add x27, x25, x1, lsr #26 + umlal v25.2D, v7.2S, v20.2S + add x13, x10, x10 + umlal v25.2D, v19.2S, v13.2S + add x29, x23, x27, lsr #25 + umlal v25.2D, v15.2S, v11.2S + lsr x30, x8, #32 + umlal v25.2D, v4.2S, v18.2S + add x23, x5, x29, lsr #26 + umlal v25.2D, v10.2S, v23.2S + and x14, x29, #0x3ffffff + umlal v25.2D, v6.2S, v26.2S + add x5, x16, x23, lsr #25 + shl v8.2D, v17.2D, #1 + umaddl x12, w2, w17, x11 + and x29, x5, #0x3ffffff + umull x21, w19, w19 + umlal v29.2D, v7.2S, v26.2S + add w16, w10, w10, lsl #1; + umlal v3.2D, v16.2S, v26.2S + add w16, w16, w10, lsl #4 + bfi x14, x23, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE aa + add w10, w24, w24, lsl #1; + add x22, x26, x5, lsr #26 + add w10, w10, w24, lsl #4 + umlal v8.2D, v12.2S, v14.2S + umaddl x25, w16, w13, x21 + umlal v8.2D, v2.2S, v13.2S + bfi x29, x22, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE aa + umlal v8.2D, v5.2S, v11.2S + add x26, x24, x24 + umlal v8.2D, v16.2S, v18.2S + stp x14, x29, [tmpa+16] + umlal v8.2D, v7.2S, v23.2S + add w24, w30, w30, lsl #1; + usra v25.2D, v29.2D, #26 + add w24, w24, w30, lsl #4 + umull x29, w15, w15 + umlal v27.2D, v2.2S, v14.2S + umull x3, w15, w13 + umlal v27.2D, v5.2S, v13.2S + add x21, x20, x20 + umlal v24.2D, v15.2S, v14.2S + umull x5, w19, w21 + umlal v24.2D, v4.2S, v13.2S + and x11, x1, #0x3ffffff + usra v8.2D, 
v25.2D, #25 + and x1, x0, #0x1ffffff + umlal v27.2D, v16.2S, v11.2S + umaddl x23, w17, w13, x5 + umlal v27.2D, v7.2S, v18.2S + add x5, x30, x30 + usra v0.2D, v8.2D, #26 + add x0, x15, x15 + umlal v24.2D, v10.2S, v11.2S + umaddl x23, w7, w0, x23 + umlal v24.2D, v6.2S, v18.2S + lsr x30, x7, #32 + usra v27.2D, v0.2D, #25 + add x16, x30, x30 + and v20.16B, v8.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad + umaddl x15, w30, w16, x23 + ushr v23.2D, v30.2D, #1 + add w23, w8, w8, lsl #1; + usra v24.2D, v27.2D, #26 + add w23, w23, w8, lsl #4 + umaddl x14, w19, w5, x3 + and v8.16B, v27.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = bc|ad + add x28, x8, x8 + and v27.16B, v0.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad + umaddl x8, w8, w23, x15 + and v5.16B, v24.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad + umaddl x3, w2, w28, x14 + umlal v22.2D, v15.2S, v28.2S + bfi x11, x27, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE aa + uzp1 v5.4S, v8.4S, v5.4S + umaddl x14, w24, w5, x29 + umaddl x5, w19, w28, x14 + ldr d18, [mask1] + mov v18.d[1], v18.d[0] + umaddl x15, w7, w26, x3 + mul v12.2S, v13.2S, v31.2S + umlal v21.2D, v16.2S, v13.2S + stp x9, x11, [tmpa+0] + umlal v21.2D, v7.2S, v11.2S + umaddl x29, w17, w26, x5 + umlal v22.2D, v4.2S, v14.2S + add w14, w20, w20, lsl #1; + umlal v22.2D, v10.2S, v13.2S + add w14, w14, w20, lsl #4 + umull x3, w19, w0 + umlal v22.2D, v6.2S, v11.2S + umaddl x29, w7, w21, x29 + usra v21.2D, v24.2D, #25 + umaddl x11, w20, w14, x12 + and v0.16B, v25.16B, v23.16B + umaddl x5, w30, w21, x15 + and v14.16B, v29.16B, v30.16B + umaddl x12, w16, w13, x29 + usra v22.2D, v21.2D, #26 + umaddl x29, w17, w16, x3 + umlal v3.2D, v7.2S, v12.2S + add x9, x26, x26 + and v1.16B, v21.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = bc|ad + add x27, x5, x12, lsr #26 + bic v8.16B, v22.16B, v23.16B + umaddl x29, w7, w7, x29 + and v17.16B, v22.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad + add x5, x25, x27, lsr #25 + usra v3.2D, v8.2D, #25 + umaddl x25, w24, w9, x8 + umlal v9.2D, v10.2S, v26.2S + add x8, x13, x13 + trn1 v22.4S, v1.4S, v17.4S + umaddl x11, w10, w8, x11 + usra v3.2D, v8.2D, #24 + umull x20, w19, w16 + add v26.2S, v22.2S, v18.2S + ldr d28, [mask2] + umlal v9.2D, v6.2S, v12.2S + umaddl x3, w23, w0, x11 + usra v3.2D, v8.2D, #21 + umaddl x29, w10, w26, x29 + uzp1 v11.4S, v20.4S, v27.4S + umaddl x20, w2, w4, x20 + umaddl x9, w10, w21, x20 + mov v17.d[0], v22.d[1] + usra v9.2D, v3.2D, #26 + umull x15, w19, w13 + and v7.16B, v3.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad + add x11, x16, x16 + uzp2 v1.4S, v11.4S, v5.4S + umaddl x20, w23, w13, x9 + and v8.16B, v9.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad + umaddl x9, w2, w0, x15 + usra v14.2D, v9.2D, #25 + and x6, x6, #0x3ffffff + uzp1 v7.4S, v7.4S, v8.4S + umaddl x29, w23, w21, x29 + uzp1 v27.4S, v11.4S, v5.4S + umull x15, w19, w26 + usra v0.2D, v14.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad + add x6, x6, x22, lsr #25 + and v3.16B, v14.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad + bic x22, x27, #0x1ffffff + sub v2.2S, v26.2S, v17.2S + add v9.2S, v22.2S, v17.2S + uzp1 v14.4S, v3.4S, v0.4S + umaddl x2, w2, w21, x15 
+ add v5.4S, v27.4S, v18.4S + add x5, x5, x22, lsr #24 + zip1 v22.2S, v2.2S, v9.2S // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 + mov v18.b[0], v28.b[0] + uzp1 v8.4S, v7.4S, v14.4S + add x22, x5, x22, lsr #21 + uzp2 v3.4S, v7.4S, v14.4S + umaddl x5, w7, w16, x9 + add v25.4S, v8.4S, v18.4S + umaddl x15, w14, w0, x22 + add v12.4S, v27.4S, v1.4S + add x9, x17, x17 + sub v14.4S, v5.4S, v1.4S + umull x19, w19, w17 + sub v18.4S, v25.4S, v3.4S + ldr x22, [tmpa+8] + add v20.4S, v8.4S, v3.4S + umaddl x15, w10, w11, x15 + zip1 v16.4S, v14.4S, v12.4S // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 + umaddl x14, w14, w13, x19 + zip2 v14.4S, v14.4S, v12.4S // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 + and x17, x27, #0x1ffffff + zip2 v0.4S, v18.4S, v20.4S // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 + umaddl x15, w23, w4, x15 + zip1 v1.4S, v18.4S, v20.4S // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 + umaddl x10, w10, w0, x14 + zip2 v5.2S, v2.2S, v9.2S // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 + shl v24.2S, v0.2S, #1 + mov v19.d[0], v1.d[1] // ubignum_of_h32reglist 1 + ubignum_of_l32reglist 1 // INTERMEDIATE H|L = t1|t2 + shl v26.2S, v22.2S, #1 + shl v17.2S, v16.2S, #1 + mov v15.d[0], v0.d[1] // ubignum_of_h32reglist 3 + ubignum_of_l32reglist 3 // INTERMEDIATE H|L = t1|t2 + shl v7.2S, v5.2S, #1 + shl v18.2S, v19.2S, #1 + umull v11.2D, v1.2S, v24.2S + umaddl x19, w23, w16, x10 + umull v6.2D, v1.2S, v17.2S + umaddl x10, w7, w13, x2 + mov v4.d[0], v16.d[1] // ubignum_of_h32reglist 5 + ubignum_of_l32reglist 5 // INTERMEDIATE H|L = t1|t2 + mov v10.d[0], v14.d[1] // ubignum_of_h32reglist 7 + ubignum_of_l32reglist 7 // INTERMEDIATE H|L = t1|t2 + umull v9.2D, v1.2S, v26.2S + ldr x13, [tmpa+0] + shl v28.2S, v15.2S, #1 + shl v3.2S, v10.2S, #1 + ldr x14, [tmpa+16] + mul v12.2S, v10.2S, v31.2S + umull v25.2D, v1.2S, v7.2S + ldr x2, [tmpa+24] + umlal v6.2D, v18.2S, v28.2S + umaddl x27, w30, w0, x10 + umaddl x16, w24, w0, x20 + shl v13.2S, v14.2S, #1 + umaddl x5, w23, w26, x5 + mul v2.2S, v22.2S, v31.2S + umull v21.2D, v1.2S, v13.2S + umaddl x23, w24, w8, x29 + umlal v11.2D, v18.2S, v19.2S + mov x10, #0x07fffffe07fffffe + sub x10, x10, #2 + umaddl x26, w24, w21, x5 + mul v29.2S, v14.2S, v31.2S + umlal v25.2D, v19.2S, v26.2S + add x7, x1, x6, lsr #26 + mul v20.2S, v4.2S, v31.2S + and x6, x6, #0x3ffffff + shl v8.2S, v18.2S, #1 + shl v4.2S, v4.2S, #1 + umlal v11.2D, v29.2S, v14.2S + bfi x6, x7, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE aa + umlal v25.2D, v0.2S, v3.2S + umaddl x0, w24, w4, x19 + umlal v25.2D, v15.2S, v13.2S + str x6, [tmpa+32] + umlal v21.2D, v18.2S, v4.2S + umaddl x8, w24, w11, x3 + umlal v21.2D, v0.2S, v17.2S + ldr x30, [tmpa+32] + mul v14.2S, v5.2S, v31.2S + add x2, x2, x10 + shl v5.2S, v28.2S, #1 + shl v27.2S, v4.2S, #1 + umlal v6.2D, v0.2S, v0.2S + umaddl x11, w24, w9, x15 + umlal v6.2D, v12.2S, v3.2S + add x4, x30, x10 + umlal v11.2D, v14.2S, v5.2S + add x3, x22, x10 + umlal v11.2D, v2.2S, v17.2S + add x6, x0, x11, lsr #26 + umlal v11.2D, v12.2S, v27.2S + add x14, x14, x10 + umlal v6.2D, v14.2S, v27.2S + add x8, x8, x6, lsr #25 + umlal v6.2D, v2.2S, v13.2S + movk x10, #0xffb4 + umlal v25.2D, v16.2S, v4.2S + add x29, x16, x8, lsr #26 + umull v27.2D, v1.2S, v3.2S + and x11, x11, #0x3ffffff + umlal v9.2D, v18.2S, v3.2S + add x19, x13, x10 + umlal v9.2D, v0.2S, v13.2S + and x5, x8, 
#0x3ffffff + umlal v9.2D, v28.2S, v4.2S + bfi x11, x6, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE bb + umlal v9.2D, v16.2S, v16.2S + umaddl x30, w24, w28, x27 + umlal v9.2D, v14.2S, v7.2S + sub x13, x19, x11 + umull v10.2D, v1.2S, v18.2S + add x7, x23, x29, lsr #25 + umlal v21.2D, v28.2S, v15.2S + lsr x16, x13, #32 // ubignum_of_wreglist 1 + ubignum_of_wreglist 0 // INTERMEDIATE e + umlal v21.2D, v2.2S, v22.2S + add x0, x26, x7, lsr #26 + usra v25.2D, v9.2D, #26 + and x20, x7, #0x3ffffff + umull v22.2D, v1.2S, v1.2S + add x8, x25, x0, lsr #25 + umull v7.2D, v1.2S, v28.2S + and x1, x29, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bbalt + bic v18.16B, v25.16B, v23.16B + and x19, x8, #0x3ffffff + and v16.16B, v9.16B, v30.16B + and x7, x12, #0x3ffffff + usra v22.2D, v18.2D, #25 + add x10, x30, x8, lsr #26 + umlal v7.2D, v19.2S, v24.2S + bfi x5, x29, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE bb + and v9.16B, v25.16B, v23.16B + add x27, x7, x10, lsr #25 + usra v22.2D, v18.2D, #24 + mov x21, #60833 + lsl x21, x21, #1 + add x15, x17, x27, lsr #26 + shl v25.2S, v3.2S, #1 + umlal v7.2D, v14.2S, v17.2S + and x29, x27, #0x3ffffff + usra v22.2D, v18.2D, #21 + bfi x29, x15, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE bb // ***SOURCE*** ubignum_of_xreglist 9 // INTERMEDIATE bbalt + umlal v10.2D, v14.2S, v24.2S + and x17, x6, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bbalt + umlal v10.2D, v2.2S, v28.2S + sub x6, x3, x5 + umlal v10.2D, v12.2S, v17.2S + umaddl x25, w16, w21, x17 + umlal v10.2D, v29.2S, v4.2S + mov w12, w5 // ubignum_of_xreglist 2 // INTERMEDIATE bbalt + umlal v22.2D, v20.2S, v4.2S + lsr x26, x6, #32 // ubignum_of_wreglist 3 + ubignum_of_wreglist 2 // INTERMEDIATE e + umlal v22.2D, v14.2S, v8.2S + and x24, x0, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bbalt + umlal v22.2D, v2.2S, v24.2S + stp x11, x5, [tmpb+0] + umlal v22.2D, v12.2S, v5.2S + bfi x20, x0, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE bb + umlal v22.2D, v29.2S, v17.2S + umaddl x12, w6, w21, x12 + umull v18.2D, v1.2S, v4.2S + bfi x19, x10, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE bb + umlal v7.2D, v2.2S, v4.2S + sub x7, x14, x20 + umlal v27.2D, v19.2S, v13.2S + mov w8, w20 // ubignum_of_xreglist 4 // INTERMEDIATE bbalt + usra v10.2D, v22.2D, #26 + lsr x14, x7, #32 // ubignum_of_wreglist 5 + ubignum_of_wreglist 4 // INTERMEDIATE e + umlal v18.2D, v19.2S, v17.2S + and x28, x10, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bbalt + umlal v7.2D, v12.2S, v13.2S + sub x5, x2, x19 + usra v11.2D, v10.2D, #25 + mov w2, w19 // ubignum_of_xreglist 6 // INTERMEDIATE bbalt + umlal v27.2D, v0.2S, v4.2S + umlal v21.2D, v14.2S, v25.2S + sub x23, x4, x29 + usra v7.2D, v11.2D, #26 + mov w0, w29 // ubignum_of_xreglist 8 // INTERMEDIATE bbalt + umlal v18.2D, v0.2S, v28.2S + lsr x22, x23, #32 // ubignum_of_wreglist 9 + ubignum_of_wreglist 8 // INTERMEDIATE e + umlal v27.2D, v15.2S, v17.2S + str x29, [tmpb+32] + usra v6.2D, v7.2D, #25 + mov w17, w11 // ubignum_of_xreglist 0 // INTERMEDIATE bbalt + and v0.16B, v22.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 + umaddl x27, w26, w21, x1 + umlal v18.2D, v14.2S, v13.2S + umaddl x30, w23, w21, x0 + umlal v18.2D, v2.2S, v3.2S + lsr x10, x5, #32 // ubignum_of_wreglist 7 + ubignum_of_wreglist 6 // INTERMEDIATE e + and v4.16B, v6.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 + and v1.16B, v10.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 
// INTERMEDIATE H|L = x5|t3 + umaddl x4, w14, w21, x24 + ldr x0, [tmpa+0] + mov v0.s[1], w0 + lsr x0, x0, #32 + mov v1.s[1], w0 + umaddl x9, w7, w21, x8 + usra v18.2D, v6.2D, #26 + umaddl x24, w10, w21, x28 + and v3.16B, v7.16B, v23.16B // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 + umaddl x8, w22, w21, x15 + umlal v27.2D, v14.2S, v26.2S + umaddl x15, w13, w21, x17 + usra v21.2D, v18.2D, #25 + stp x20, x19, [tmpb+16] + and v2.16B, v11.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 + lsr x29, x8, #25 + ldr x3, [tmpb+0] + mov v10.s[1], w3 + lsr x3, x3, #32 + mov v11.s[1], w3 + add x17, x15, x29 + usra v27.2D, v21.2D, #26 + add x28, x17, x29, lsl #1 + and v6.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 + and x20, x8, #0x1ffffff + and v5.16B, v18.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 + add x17, x28, x29, lsl #4 + and v7.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 + ldr x3, [tmpb+8] + mov v22.s[1], w3 + lsr x3, x3, #32 + mov v23.s[1], w3 + add x29, x25, x17, lsr #26 + ldr x15, [pointx+0] + mov v10.s[0], w15 + lsr x15, x15, #32 + mov v11.s[0], w15 + and x11, x17, #0x3ffffff // ubignum_of_xreglist 0 // INTERMEDIATE bce + usra v16.2D, v27.2D, #25 + add x8, x12, x29, lsr #25 + ldr x3, [tmpb+16] + mov v14.s[1], w3 + lsr x3, x3, #32 + mov v15.s[1], w3 + and x12, x29, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bce + ldr x15, [pointx+8] + mov v22.s[0], w15 + lsr x15, x15, #32 + mov v23.s[0], w15 + add x28, x27, x8, lsr #26 + and v8.16B, v16.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + umull x1, w12, w10 + ldr x3, [tmpb+24] + mov v17.s[1], w3 + lsr x3, x3, #32 + mov v18.s[1], w3 + add x25, x9, x28, lsr #25 + ldr x15, [pointx+16] + mov v14.s[0], w15 + lsr x15, x15, #32 + mov v15.s[0], w15 + umaddl x19, w5, w21, x2 + usra v9.2D, v16.2D, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + add x2, x4, x25, lsr #26 + ldr x3, [tmpb+32] + mov v24.s[1], w3 + lsr x3, x3, #32 + mov v25.s[1], w3 + umull x3, w12, w23 + ldr x15, [pointx+24] + mov v17.s[0], w15 + lsr x15, x15, #32 + mov v18.s[0], w15 + add x29, x19, x2, lsr #25 + umull v26.2D, v0.2S, v23.2S + and x21, x28, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bce + ldr x0, [tmpa+8] + mov v2.s[1], w0 + lsr x0, x0, #32 + mov v3.s[1], w0 + umaddl x27, w21, w5, x3 + ldr x15, [pointx+32] + mov v24.s[0], w15 + lsr x15, x15, #32 + mov v25.s[0], w15 + add x17, x24, x29, lsr #26 + umull v29.2D, v1.2S, v18.2S + and x15, x8, #0x3ffffff // ubignum_of_xreglist 2 // INTERMEDIATE bce + umull v20.2D, v0.2S, v15.2S + add x19, x30, x17, lsr #25 + and x3, x17, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bce + mul v12.2S, v25.2S, v31.2S + ldr x0, [tmpa+16] + mov v4.s[1], w0 + lsr x0, x0, #32 + mov v5.s[1], w0 + add x4, x20, x19, lsr #26 // ubignum_of_xreglist 9 // INTERMEDIATE bce + umlal v26.2D, v2.2S, v11.2S + add w28, w3, w3, lsl #1; + umlal v20.2D, v2.2S, v23.2S + add w28, w28, w3, lsl #4 + umull x8, w12, w5 + ldr x0, [tmpa+24] + mov v6.s[1], w0 + lsr x0, x0, #32 + mov v7.s[1], w0 + and x30, x25, #0x3ffffff // ubignum_of_xreglist 4 // INTERMEDIATE bce + mul v16.2S, v18.2S, v31.2S + add w17, w4, w4, lsl #1; + umull v21.2D, v1.2S, v15.2S + add w17, w17, w4, lsl #4 + umaddl x25, w21, w7, x8 + umlal v20.2D, v4.2S, v11.2S + add w8, w21, w21, lsl 
#1; + ldr x0, [tmpa+32] + add w8, w8, w21, lsl #4 + mov v8.s[1], w0 + lsr x0, x0, #32 + mov v9.s[1], w0 + and x2, x2, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bce + umlal v29.2D, v3.2S, v15.2S + umaddl x24, w2, w6, x25 + umull v13.2D, v0.2S, v25.2S + umaddl x25, w2, w7, x27 + umaddl x0, w3, w6, x25 + mul v19.2S, v15.2S, v31.2S + umull v27.2D, v0.2S, v18.2S + umaddl x20, w3, w13, x24 + umlal v20.2D, v6.2S, v12.2S + umaddl x24, w21, w14, x1 + umlal v13.2D, v2.2S, v18.2S + umaddl x9, w4, w13, x0 + umull v25.2D, v0.2S, v11.2S + umaddl x20, w17, w23, x20 + umlal v27.2D, v2.2S, v15.2S + umaddl x0, w2, w26, x24 + umull v28.2D, v1.2S, v11.2S + umull x24, w17, w5 + umlal v29.2D, v5.2S, v23.2S + umaddl x9, w11, w22, x9 + umlal v13.2D, v4.2S, v15.2S + umaddl x27, w3, w16, x0 + umlal v27.2D, v4.2S, v23.2S + umull x0, w17, w14 + umlal v27.2D, v6.2S, v11.2S + umull x4, w12, w14 + umlal v27.2D, v8.2S, v12.2S + umaddl x25, w11, w10, x20 + umlal v27.2D, v1.2S, v17.2S + umaddl x0, w28, w10, x0 + umlal v13.2D, v6.2S, v23.2S + umull x3, w17, w6 + umlal v13.2D, v8.2S, v11.2S + umaddl x1, w21, w26, x4 + umlal v20.2D, v8.2S, v16.2S + umaddl x4, w2, w13, x24 + umlal v28.2D, v3.2S, v12.2S + umaddl x20, w28, w7, x3 + umlal v29.2D, v7.2S, v11.2S + and x3, x19, #0x3ffffff // ubignum_of_xreglist 9 // INTERMEDIATE bce + umlal v29.2D, v9.2S, v12.2S + umaddl x19, w17, w22, x27 + add w27, w2, w2, lsl #1; + mul v18.2S, v24.2S, v31.2S + add w27, w27, w2, lsl #4 + umlal v21.2D, v3.2S, v23.2S + umull x24, w17, w7 + umlal v13.2D, v1.2S, v24.2S + add x19, x19, x19 + shl v29.2D, v29.2D, #1 + umaddl x1, w2, w16, x1 + umull v15.2D, v1.2S, v23.2S + umaddl x0, w27, w22, x0 + umlal v29.2D, v0.2S, v24.2S + umaddl x2, w28, w5, x24 + mul v24.2S, v23.2S, v31.2S + umaddl x4, w28, w23, x4 + umlal v21.2D, v5.2S, v11.2S + umaddl x24, w27, w5, x20 + umlal v20.2D, v1.2S, v14.2S + umaddl x20, w11, w23, x19 + umlal v26.2D, v4.2S, v12.2S + umaddl x19, w27, w23, x2 + umlal v26.2D, v6.2S, v16.2S + umaddl x2, w21, w6, x4 + umlal v29.2D, v2.2S, v17.2S + umaddl x24, w8, w23, x24 + umlal v15.2D, v3.2S, v11.2S + umaddl x0, w21, w16, x0 + umaddl x4, w21, w13, x19 + mul v23.2S, v11.2S, v31.2S + umlal v20.2D, v3.2S, v22.2S + umaddl x2, w12, w7, x2 + umlal v20.2D, v5.2S, v10.2S + umaddl x19, w12, w26, x0 + umlal v29.2D, v4.2S, v14.2S + umaddl x0, w12, w13, x24 + umlal v26.2D, v8.2S, v19.2S + umaddl x20, w15, w5, x20 + umlal v26.2D, v1.2S, v22.2S + umaddl x21, w15, w10, x9 + umlal v26.2D, v3.2S, v10.2S + and x9, x29, #0x3ffffff // ubignum_of_xreglist 6 // INTERMEDIATE bce + umlal v29.2D, v6.2S, v22.2S + umaddl x20, w30, w7, x20 + umaddl x1, w28, w22, x1 + add x24, x19, x19 + umull v11.2D, v1.2S, v12.2S + add w19, w3, w3, lsl #1; + umlal v26.2D, v5.2S, v18.2S + add w19, w19, w3, lsl #4 + umaddl x20, w9, w6, x20 + umlal v29.2D, v8.2S, v10.2S + add w29, w9, w9, lsl #1; + umlal v13.2D, v3.2S, v17.2S + add w29, w29, w9, lsl #4 + umaddl x2, w19, w10, x2 + umlal v11.2D, v3.2S, v16.2S + umaddl x21, w30, w14, x21 + umlal v11.2D, v5.2S, v19.2S + umaddl x20, w3, w13, x20 + umlal v11.2D, v7.2S, v24.2S + umaddl x2, w29, w22, x2 + umlal v11.2D, v9.2S, v23.2S + umaddl x21, w9, w26, x21 + ushr v23.2D, v30.2D, #1 + umaddl x1, w17, w10, x1 + umlal v13.2D, v5.2S, v14.2S + umaddl x24, w19, w5, x24 + umlal v27.2D, v3.2S, v14.2S + umaddl x21, w3, w16, x21 + shl v11.2D, v11.2D, #1 + add w3, w30, w30, lsl #1; + umlal v28.2D, v5.2S, v16.2S + add w3, w3, w30, lsl #4 + umaddl x24, w29, w23, x24 + umlal v28.2D, v7.2S, v19.2S + add x1, x1, x1 + umlal v28.2D, v9.2S, v24.2S + 
umaddl x1, w11, w5, x1 + umlal v15.2D, v5.2S, v12.2S + umaddl x24, w30, w13, x24 + umlal v15.2D, v7.2S, v16.2S + umaddl x25, w15, w14, x25 + umlal v15.2D, v9.2S, v19.2S + umaddl x1, w15, w7, x1 + shl v28.2D, v28.2D, #1 + umaddl x24, w15, w6, x24 + umlal v21.2D, v7.2S, v12.2S + umaddl x2, w30, w16, x2 + umlal v21.2D, v9.2S, v16.2S + umaddl x25, w30, w26, x25 + shl v15.2D, v15.2D, #1 + umaddl x30, w30, w6, x1 + umlal v28.2D, v0.2S, v22.2S + umaddl x1, w15, w26, x2 + umlal v28.2D, v2.2S, v10.2S + umaddl x2, w9, w16, x25 + shl v21.2D, v21.2D, #1 + umaddl x24, w11, w7, x24 + umlal v15.2D, v0.2S, v14.2S + umaddl x1, w11, w14, x1 + umlal v21.2D, v0.2S, v17.2S + umaddl x25, w9, w13, x30 + umlal v28.2D, v4.2S, v18.2S + umaddl x0, w19, w26, x0 + umlal v25.2D, v2.2S, v12.2S + add x1, x1, x24, lsr #26 + umlal v25.2D, v4.2S, v16.2S + umaddl x30, w19, w22, x2 + umlal v21.2D, v2.2S, v14.2S + umaddl x4, w12, w6, x4 + mul v14.2S, v14.2S, v31.2S + umaddl x25, w19, w23, x25 + and x2, x1, #0x1ffffff + mul v16.2S, v17.2S, v31.2S + umlal v25.2D, v6.2S, v19.2S + umaddl x9, w19, w14, x4 + umlal v13.2D, v7.2S, v22.2S + add x25, x25, x1, lsr #25 + umlal v21.2D, v4.2S, v22.2S + umaddl x0, w29, w14, x0 + umlal v26.2D, v7.2S, v16.2S + add x30, x30, x25, lsr #26 + umlal v26.2D, v9.2S, v14.2S + add w1, w15, w15, lsl #1; + umlal v28.2D, v6.2S, v16.2S + add w1, w1, w15, lsl #4 + add x4, x20, x30, lsr #25 + umlal v28.2D, v8.2S, v14.2S + and x25, x25, #0x3ffffff + umlal v15.2D, v2.2S, v22.2S + add x21, x21, x4, lsr #26 + umlal v11.2D, v0.2S, v10.2S + bfi x25, x30, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE z4 + umlal v11.2D, v2.2S, v18.2S + bic x30, x21, #0x3ffffff + usra v26.2D, v28.2D, #26 + lsr x20, x30, #26 + umlal v15.2D, v4.2S, v10.2S + add x20, x20, x30, lsr #25 + umlal v15.2D, v6.2S, v18.2S + umaddl x9, w29, w10, x9 + umlal v15.2D, v8.2S, v16.2S + add x30, x20, x30, lsr #22 + umlal v27.2D, v5.2S, v22.2S + umull x20, w17, w26 + umlal v20.2D, v7.2S, v18.2S + umaddl x30, w17, w16, x30 + umlal v20.2D, v9.2S, v16.2S + umaddl x17, w3, w10, x0 + usra v15.2D, v26.2D, #25 + umaddl x0, w28, w14, x20 + umlal v27.2D, v7.2S, v10.2S + umaddl x20, w28, w26, x30 + umlal v27.2D, v9.2S, v18.2S + add w28, w12, w12, lsl #1; + usra v20.2D, v15.2D, #26 + add w28, w28, w12, lsl #4 + umaddl x30, w27, w10, x0 + and v17.16B, v15.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5 + umaddl x27, w27, w14, x20 + umaddl x0, w8, w10, x27 + mul v12.2S, v22.2S, v31.2S + and v15.16B, v20.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5 + umaddl x14, w3, w22, x9 + umlal v21.2D, v6.2S, v10.2S + umaddl x27, w8, w22, x30 + trn1 v15.4S, v17.4S, v15.4S // FINAL z3 + umaddl x10, w28, w22, x0 + umlal v11.2D, v4.2S, v16.2S + umaddl x30, w15, w16, x14 + and v26.16B, v26.16B, v23.16B + umaddl x28, w12, w16, x27 + umlal v21.2D, v8.2S, v18.2S + add x10, x10, x10 + umlal v25.2D, v8.2S, v24.2S + umaddl x20, w19, w6, x10 + umlal v25.2D, v1.2S, v10.2S + add x28, x28, x28 + umlal v25.2D, v3.2S, v18.2S + umaddl x28, w19, w7, x28 + usra v21.2D, v20.2D, #25 + umaddl x0, w29, w7, x20 + umlal v11.2D, v6.2S, v14.2S + umaddl x10, w11, w26, x30 + umlal v13.2D, v9.2S, v10.2S + umaddl x19, w29, w5, x28 + usra v27.2D, v21.2D, #26 + umaddl x0, w3, w5, x0 + umlal v25.2D, v5.2S, v16.2S + umaddl x20, w1, w22, x17 + and v20.16B, v28.16B, v30.16B + umaddl x29, w3, w23, x19 + usra v29.2D, v27.2D, #25 + umaddl x3, w1, w23, x0 + and v27.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + 
ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5 + umlal v11.2D, v8.2S, v12.2S + umaddl x12, w15, w13, x29 + usra v13.2D, v29.2D, #26 + umaddl x7, w11, w13, x3 + trn1 v6.4S, v6.4S, v7.4S + umaddl x17, w11, w16, x20 + umlal v25.2D, v7.2S, v14.2S + and x23, x4, #0x3ffffff + bic v19.16B, v13.16B, v23.16B + umaddl x19, w11, w6, x12 + and v28.16B, v13.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = x4|z5 + add x3, x17, x7, lsr #26 + usra v11.2D, v19.2D, #25 + trn1 v2.4S, v2.4S, v3.4S + add x17, x19, x3, lsr #25 + and v13.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5 + and x5, x7, #0x3ffffff + usra v11.2D, v19.2D, #24 + add x7, x10, x17, lsr #26 + trn1 v0.4S, v0.4S, v1.4S + and x19, x24, #0x3ffffff + and v21.16B, v29.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5 + add x29, x19, x7, lsr #25 + usra v11.2D, v19.2D, #21 + bfi x5, x3, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE z4 + trn1 v17.4S, v13.4S, v27.4S // FINAL z3 + add x19, x2, x29, lsr #26 + trn1 v19.4S, v21.4S, v28.4S // FINAL z3 + and x3, x29, #0x3ffffff + mov v16.d[0], v6.d[1] // FINAL x3 + mov v6.d[0], v17.d[1] // FINAL x2 + trn1 v8.4S, v8.4S, v9.4S + bfi x3, x19, #32, #26 // ubignum_of_preglist 2 // INTERMEDIATE z4 + and v21.16B, v11.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5 + bfi x23, x21, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE z4 + mov v18.d[0], v8.d[1] // FINAL x3 + mov v8.d[0], v19.d[1] // FINAL x2 + umlal v25.2D, v9.2S, v12.2S + mov v9.d[0], x23 // FINAL z2 + mov v7.d[0], x25 // FINAL z2 + ldr d29, [mask1] + mov v12.d[0], v2.d[1] // FINAL x3 + trn1 v4.4S, v4.4S, v5.4S + and x17, x17, #0x3ffffff + usra v25.2D, v11.2D, #26 + mov v10.d[0], v0.d[1] // FINAL x3 + mov v14.d[0], v4.d[1] // FINAL x3 + mov v4.d[0], v15.d[1] // FINAL x2 + usra v20.2D, v25.2D, #25 + and v27.16B, v25.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 + bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 + mov v5.d[0], x3 // depth 86 + mov v1.d[0], x5 // FINAL z2 + usra v26.2D, v20.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 + and v28.16B, v20.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 + trn1 v11.4S, v21.4S, v27.4S // FINAL z3 + trn1 v13.4S, v28.4S, v26.4S // FINAL z3 + mov v0.d[0], v11.d[1] // FINAL x2 + mov v3.d[0], x17 // FINAL z2 + mov v2.d[0], v13.d[1] // FINAL x2 + ldr d28, [mask2] + + ldr x0, [i] + subs x0, x0, #1 + str x0, [i] bcs curve25519_x25519_byte_scalarloop -// Multiplex directly into (xn,zn) then do three pure doubling steps; -// this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. 
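In this hunk the saturated-macro tail quoted above (the multiplex into (xn,zn) and the three doubling steps) is removed, and the added code further down instead repacks the NEON ladder state directly: the ten 25/26-bit limbs of X2, and then of Z2, are recombined into the four 64-bit words xn and zn. A minimal C sketch of that recombination is given below; it assumes the usual radix-2^25.5 limb layout (even limbs nominally 26 bits at weights 2^0, 2^51, ..., odd limbs nominally 25 bits at weights 2^26, 2^77, ...; as the comments note, Z2's top limb may exceed this) and a compiler with unsigned __int128. The function name and array layout are illustrative only, not s2n-bignum API.

    #include <stdint.h>

    /* Sketch: recombine ten unsaturated limbs l[0..9] into four 64-bit
       words w[0..3], mirroring the shift pattern of the assembly below
       (pair at radix 2^51, then lsl #51, lsr #13/lsl #38, lsr #26/lsl #25,
       lsr #39/lsl #12 with an adds/adcs/adc carry chain). Assumes the
       represented value fits in 256 bits, as the comments below state. */
    static void repack_to_4x64(const uint64_t l[10], uint64_t w[4])
    {
        /* Pair adjacent limbs; pair i carries weight 2^(51*i). */
        uint64_t p0 = l[0] + (l[1] << 26);
        uint64_t p1 = l[2] + (l[3] << 26);
        uint64_t p2 = l[4] + (l[5] << 26);
        uint64_t p3 = l[6] + (l[7] << 26);
        uint64_t p4 = l[8] + (l[9] << 26);

        /* 64-bit additions with explicit carry propagation. */
        unsigned __int128 t = (unsigned __int128)p0 + (uint64_t)(p1 << 51);
        w[0] = (uint64_t)t;
        t = (unsigned __int128)(p1 >> 13) + (uint64_t)(p2 << 38) + (uint64_t)(t >> 64);
        w[1] = (uint64_t)t;
        t = (unsigned __int128)(p2 >> 26) + (uint64_t)(p3 << 25) + (uint64_t)(t >> 64);
        w[2] = (uint64_t)t;
        w[3] = (p3 >> 39) + (uint64_t)(p4 << 12) + (uint64_t)(t >> 64);
    }

For Z2 the added code performs one extra adjustment before this recombination: bit 25 of the top limb is cleared and, if it was set, 19 is added at the bottom, using the identity 2^255 mod (2^255-19) = 19 so that the repacked value stays within 256 bits.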
- - cmp swap, xzr - mux_4(xn,xm,xn) - mux_4(zn,zm,zn) - - sub_twice4(d,xn,zn) - add_twice4(s,xn,zn) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - - sub_twice4(d,xn,zn) - add_twice4(s,xn,zn) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - - sub_twice4(d,xn,zn) - add_twice4(s,xn,zn) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_p25519(zn,p,e) - -// The projective result of the scalar multiplication is now (xn,zn). -// Prepare to call the modular inverse function to get xm = 1/zn - - add x0, xm +// Repack X2 into the saturated representation as 256-bit value xn. +// This does not fully normalize mod 2^255-19 but stays within 256 bits. + + mov w0, v0.s[0] + mov w1, v0.s[1] + mov w2, v2.s[0] + mov w3, v2.s[1] + mov w4, v4.s[0] + mov w5, v4.s[1] + mov w6, v6.s[0] + mov w7, v6.s[1] + mov w8, v8.s[0] + mov w9, v8.s[1] + + add x0, x0, x1, lsl #26 + add x1, x2, x3, lsl #26 + add x2, x4, x5, lsl #26 + add x3, x6, x7, lsl #26 + add x4, x8, x9, lsl #26 + + adds x0, x0, x1, lsl #51 + lsr x6, x1, #13 + lsl x7, x2, #38 + adcs x1, x6, x7 + lsr x8, x2, #26 + lsl x9, x3, #25 + adcs x2, x8, x9 + lsr x10, x3, #39 + lsl x11, x4, #12 + adc x3, x10, x11 + stp x0, x1, [xn] + stp x2, x3, [xn+16] + +// Repack Z2 into the saturated representation as 256-bit value zn. +// This does not fully normalize mod 2^255-19. However since Z2, +// unlike X2, was not repacked (within the last multiplication) in +// right-to-left order, its top digit can be any 26-bit value, on +// the face of it. To make sure we don't overflow 256 bits here +// we remove b = 25th bit of the 9th digit (now scaled by 2^230 +// giving bit 25 a final weighting of 2^255) and add 19 * b to +// to the bottom of the sum here to compensate mod 2^255-19. + + mov w0, v1.s[0] + mov w1, v1.s[1] + mov w2, v3.s[0] + mov w3, v3.s[1] + mov w4, v5.s[0] + mov w5, v5.s[1] + mov w6, v7.s[0] + mov w7, v7.s[1] + mov w8, v9.s[0] + mov w9, v9.s[1] + + mov w10, #19 + add x0, x0, x1, lsl #26 + tst x9, #0x2000000 + add x1, x2, x3, lsl #26 + csel x10, x10, xzr, ne + add x2, x4, x5, lsl #26 + and x9, x9, #0x1FFFFFF + add x3, x6, x7, lsl #26 + add x0, x0, x10 + add x4, x8, x9, lsl #26 + + adds x0, x0, x1, lsl #51 + lsr x6, x1, #13 + lsl x7, x2, #38 + adcs x1, x6, x7 + lsr x8, x2, #26 + lsl x9, x3, #25 + adcs x2, x8, x9 + lsr x10, x3, #39 + lsl x11, x4, #12 + adc x3, x10, x11 + stp x0, x1, [zn] + stp x2, x3, [zn+16] + +// Because the lowest bit (indeed, the three lowest bits) of the scalar +// were forced to zero, we know that the projective result of the scalar +// multiplication was in (X2,Z2) and is now (xn,zn) in saturated form. +// Prepare to call the modular inverse function to get zn' = 1/zn. + + add x0, zn add x1, zn // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -978,7 +1466,7 @@ curve25519_x25519_byte_scalarloop: // and reclaiming room on the stack. For more details and explanations see // "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 128 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn, and zn. 
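Schematically, the remaining work from here to the end of the routine is small; the sketch below is illustrative C with placeholder helper names standing in for the inlined routines mentioned in the comments (the inverse immediately below, the bignum_mul_p25519-style multiplication after it) and for the closing bytewise store. It is not a real s2n-bignum interface.

    /* Finalization on the 4x64-bit values xn, zn (sketch only):              */
    inverse_mod_p25519(zn, zn);   /* zn <- 1/zn mod 2^255-19 (inlined below)  */
    mul_mod_p25519(zn, xn, zn);   /* zn <- xn * zn, fully reduced             */
    copy_le_bytes(res, zn, 32);   /* write the 32 result bytes to *res        */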
mov x20, x0 mov x10, #0xffffffffffffffed @@ -2009,102 +2497,278 @@ curve25519_x25519_byte_invmidloop: stp x0, x1, [x4] stp x2, x5, [x4, #16] -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - ldp x0, x1, [zn] - ldp x2, x3, [zn+16] - orr x0, x0, x1 - orr x2, x2, x3 - orr x4, x0, x2 - cmp x4, xzr - ldp x0, x1, [xn] - csel x0, x0, xzr, ne - csel x1, x1, xzr, ne - ldp x2, x3, [xn+16] - stp x0, x1, [xn] - csel x2, x2, xzr, ne - csel x3, x3, xzr, ne - stp x2, x3, [xn+16] - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. The multiplication below is just an inlined +// version of bignum_mul_p25519 except for the detailed +// addressing of inputs and outputs + + ldp x3, x4, [xn] + ldp x5, x6, [zn] + umull x7, w3, w5 + lsr x0, x3, #32 + umull x15, w0, w5 + lsr x16, x5, #32 + umull x8, w16, w0 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x16, cc + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, cc + cinv x16, x16, cc + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #0x1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [xn+16] + ldp x5, x6, [zn+16] + umull x11, w3, w5 + lsr x0, x3, #32 + umull x15, w0, w5 + lsr x16, x5, #32 + umull x12, w16, w0 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x16, cc + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, cc + cinv x16, x16, cc + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + adc x14, x14, xzr + cmn x16, #0x1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [xn+16] + ldp x15, x16, [xn] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, cc + ldp x15, x0, [zn] + subs x5, x15, x5 + sbcs x6, x0, x6 + csetm x0, cc + eor x3, x3, x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x0 + subs x5, x5, x0 + eor x6, x6, x0 + sbc x6, x6, x0 + eor x16, x0, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x0, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x9, cc + adds x15, x15, x0 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, cc + cinv x9, x9, cc + mul x5, x4, x6 + umulh x6, x4, x6 + adds x0, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #0x1 + eor x5, x5, x9 + adcs x0, x5, x0 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #0x1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x0, x0, x16 + adcs x10, x0, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, 
x16 + adc x14, x14, x16 + mov x3, #0x26 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov x10, x4 + lsr x0, x14, #31 + mov x5, #0x13 + umaddl x5, w5, w0, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x0, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #0x13 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [zn] + stp x9, x10, [zn+16] + +// Now copy bytewise to the output - mul_p25519(zn,xn,xm) + ldr x17, [res] ldp x10, x11, [zn] - strb w10, [resx] + strb w10, [x17] lsr x10, x10, #8 - strb w10, [resx+1] + strb w10, [x17, #1] lsr x10, x10, #8 - strb w10, [resx+2] + strb w10, [x17, #2] lsr x10, x10, #8 - strb w10, [resx+3] + strb w10, [x17, #3] lsr x10, x10, #8 - strb w10, [resx+4] + strb w10, [x17, #4] lsr x10, x10, #8 - strb w10, [resx+5] + strb w10, [x17, #5] lsr x10, x10, #8 - strb w10, [resx+6] + strb w10, [x17, #6] lsr x10, x10, #8 - strb w10, [resx+7] + strb w10, [x17, #7] - strb w11, [resx+8] + strb w11, [x17, #8] lsr x11, x11, #8 - strb w11, [resx+9] + strb w11, [x17, #9] lsr x11, x11, #8 - strb w11, [resx+10] + strb w11, [x17, #10] lsr x11, x11, #8 - strb w11, [resx+11] + strb w11, [x17, #11] lsr x11, x11, #8 - strb w11, [resx+12] + strb w11, [x17, #12] lsr x11, x11, #8 - strb w11, [resx+13] + strb w11, [x17, #13] lsr x11, x11, #8 - strb w11, [resx+14] + strb w11, [x17, #14] lsr x11, x11, #8 - strb w11, [resx+15] + strb w11, [x17, #15] ldp x12, x13, [zn+16] - strb w12, [resx+16] + strb w12, [x17, #16] lsr x12, x12, #8 - strb w12, [resx+17] + strb w12, [x17, #17] lsr x12, x12, #8 - strb w12, [resx+18] + strb w12, [x17, #18] lsr x12, x12, #8 - strb w12, [resx+19] + strb w12, [x17, #19] lsr x12, x12, #8 - strb w12, [resx+20] + strb w12, [x17, #20] lsr x12, x12, #8 - strb w12, [resx+21] + strb w12, [x17, #21] lsr x12, x12, #8 - strb w12, [resx+22] + strb w12, [x17, #22] lsr x12, x12, #8 - strb w12, [resx+23] + strb w12, [x17, #23] - strb w13, [resx+24] + strb w13, [x17, #24] lsr x13, x13, #8 - strb w13, [resx+25] + strb w13, [x17, #25] lsr x13, x13, #8 - strb w13, [resx+26] + strb w13, [x17, #26] lsr x13, x13, #8 - strb w13, [resx+27] + strb w13, [x17, #27] lsr x13, x13, #8 - strb w13, [resx+28] + strb w13, [x17, #28] lsr x13, x13, #8 - strb w13, [resx+29] + strb w13, [x17, #29] lsr x13, x13, #8 - strb w13, [resx+30] + strb w13, [x17, #30] lsr x13, x13, #8 - strb w13, [resx+31] - -// Restore stack and registers - - add sp, sp, #NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - + strb w13, [x17, #31] + +// Restore stack and registers (this will zero the tops of Q8...Q15). 
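The parenthetical refers to an AArch64 calling-convention detail: only the low 64 bits of v8-v15 (the d8-d15 views saved at entry) are callee-saved, and a load that writes a D register zero-extends into the full vector register. Reloading d8..d15 below therefore restores the caller's values and, as a side effect, clears the upper halves of q8..q15 that the NEON code used.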
+ + ldp d8, d9, [regsave+0] + ldp d10, d11, [regsave+16] + ldp d12, d13, [regsave+32] + ldp d14, d15, [regsave+48] + ldp x19, x20, [regsave+64] + ldp x21, x22, [regsave+80] + ldp x23, x24, [regsave+96] + ldp x25, x26, [regsave+112] + ldp x27, x28, [regsave+128] + ldp x29, x30, [regsave+144] + add sp, sp, #NSPACE+160 ret #if defined(__linux__) && defined(__ELF__) From 81a5d6047936a4c87fed3318e271d6e811d0f2e5 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 14 Feb 2024 08:41:59 -0800 Subject: [PATCH 12/24] Tweak attribution of SLOTHY work s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/f82da8fd8015d2a0b590360edb9afb3c842cfea6 --- arm/curve25519/curve25519_x25519.S | 8 ++++---- arm/curve25519/curve25519_x25519_byte.S | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S index 28dd2f696a..eeefa69b0c 100644 --- a/arm/curve25519/curve25519_x25519.S +++ b/arm/curve25519/curve25519_x25519.S @@ -7,7 +7,8 @@ // https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf // https://github.com/Emill/X25519-AArch64 // -// and the SLOTHY-based re-engineering of that code by Hanno Becker: +// and the SLOTHY-based re-engineering of that code by Abdulrahman, Becker, +// Kannwischer and Klein: // // https://eprint.iacr.org/2022/1303.pdf // https://github.com/slothy-optimizer/slothy/tree/main/paper @@ -202,9 +203,8 @@ curve25519_x25519_scalarloop: // In particular, the basic dataflow and the organization between integer // and SIMD units is identical, with only a few minor changes to some // individual instructions (for miscellaneous reasons). The scheduling -// was redone from scratch by SLOTHY starting from Hanno Becker's -// un-interleaved form and using the same scripts as in Becker et al's -// paper. +// was redone from scratch by SLOTHY starting from the un-interleaved +// form in the SLOTHY work cited above, and using the same scripts. // // The intermediate value annotations were added to provide data that // is used in the formal proof, indicating which lines assign specific diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index e6c891284d..89f2f44f4e 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -7,7 +7,8 @@ // https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf // https://github.com/Emill/X25519-AArch64 // -// and the SLOTHY-based re-engineering of that code by Hanno Becker: +// and the SLOTHY-based re-engineering of that code by Abdulrahman, Becker, +// Kannwischer and Klein: // // https://eprint.iacr.org/2022/1303.pdf // https://github.com/slothy-optimizer/slothy/tree/main/paper @@ -320,9 +321,8 @@ curve25519_x25519_byte_scalarloop: // In particular, the basic dataflow and the organization between integer // and SIMD units is identical, with only a few minor changes to some // individual instructions (for miscellaneous reasons). The scheduling -// was redone from scratch by SLOTHY starting from Hanno Becker's -// un-interleaved form and using the same scripts as in Becker et al's -// paper. +// was redone from scratch by SLOTHY starting from the un-interleaved +// form in the SLOTHY work cited above, and using the same scripts. 
// // The intermediate value annotations were added to provide data that // is used in the formal proof, indicating which lines assign specific From bf905783c336889958f4a20cdf71c942d5ee242f Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 21 Feb 2024 17:35:19 -0800 Subject: [PATCH 13/24] Tidy up Montgomery ladders This simplifies the remaining Montgomery ladder implementations by avoiding the special code for zero handling, since it is not actually necessary given the behavior of the modular inverse in this case. In addition, the proofs have been tidied up a bit, factoring out the basic mathematics so that the loop invariant becomes simpler. The update also fixes a README typo pointed out by Dan Bernstein and removes a couple of stray comments arising from SLOTHY output in the Lenngren-derived X25519 code. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/e14394d73ed171c13da61f948c6b5f4b91806263 --- arm/curve25519/curve25519_x25519.S | 2 +- arm/curve25519/curve25519_x25519_alt.S | 36 ++++++------------- arm/curve25519/curve25519_x25519_byte.S | 2 +- arm/curve25519/curve25519_x25519_byte_alt.S | 36 ++++++------------- x86_att/curve25519/curve25519_x25519.S | 38 ++++++--------------- x86_att/curve25519/curve25519_x25519_alt.S | 38 ++++++--------------- 6 files changed, 42 insertions(+), 110 deletions(-) diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S index eeefa69b0c..c742ad9d38 100644 --- a/arm/curve25519/curve25519_x25519.S +++ b/arm/curve25519/curve25519_x25519.S @@ -1242,7 +1242,7 @@ curve25519_x25519_scalarloop: usra v20.2D, v25.2D, #25 and v27.16B, v25.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 - mov v5.d[0], x3 // depth 86 + mov v5.d[0], x3 mov v1.d[0], x5 // FINAL z2 usra v26.2D, v20.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 and v28.16B, v20.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 diff --git a/arm/curve25519/curve25519_x25519_alt.S b/arm/curve25519/curve25519_x25519_alt.S index 82de375b14..518cb89555 100644 --- a/arm/curve25519/curve25519_x25519_alt.S +++ b/arm/curve25519/curve25519_x25519_alt.S @@ -593,8 +593,7 @@ curve25519_x25519_alt_scalarloop: // Multiplex directly into (xn,zn) then do three pure doubling steps; // this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. +// of the scalar. cmp swap, xzr mux_4(xn,xm,xn) @@ -631,12 +630,12 @@ curve25519_x25519_alt_scalarloop: orr x1, x1, 0x10000 cmadd_4(e,p,d) mul_4(xn,s,d) - mul_p25519(zn,p,e) + mul_4(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// Prepare to call the modular inverse function to get xm = 1/zn +// Prepare to call the modular inverse function to get zn' = 1/zn - add x0, xm + add x0, zn add x1, zn // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -644,7 +643,7 @@ curve25519_x25519_alt_scalarloop: // and reclaiming room on the stack. For more details and explanations see // "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 128 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn and zn. 
mov x20, x0 mov x10, #0xffffffffffffffed @@ -1675,28 +1674,13 @@ curve25519_x25519_alt_invmidloop: stp x0, x1, [x4] stp x2, x5, [x4, #16] -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - ldp x0, x1, [zn] - ldp x2, x3, [zn+16] - orr x0, x0, x1 - orr x2, x2, x3 - orr x4, x0, x2 - cmp x4, xzr - ldp x0, x1, [xn] - csel x0, x0, xzr, ne - csel x1, x1, xzr, ne - ldp x2, x3, [xn+16] - stp x0, x1, [xn] - csel x2, x2, xzr, ne - csel x3, x3, xzr, ne - stp x2, x3, [xn+16] - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. - mul_p25519(resx,xn,xm) + mul_p25519(resx,xn,zn) // Restore stack and registers diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index 89f2f44f4e..c2e89a26c0 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -1360,7 +1360,7 @@ curve25519_x25519_byte_scalarloop: usra v20.2D, v25.2D, #25 and v27.16B, v25.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 - mov v5.d[0], x3 // depth 86 + mov v5.d[0], x3 mov v1.d[0], x5 // FINAL z2 usra v26.2D, v20.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 and v28.16B, v20.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 diff --git a/arm/curve25519/curve25519_x25519_byte_alt.S b/arm/curve25519/curve25519_x25519_byte_alt.S index 790cb2b030..511e2960bd 100644 --- a/arm/curve25519/curve25519_x25519_byte_alt.S +++ b/arm/curve25519/curve25519_x25519_byte_alt.S @@ -711,8 +711,7 @@ curve25519_x25519_byte_alt_scalarloop: // Multiplex directly into (xn,zn) then do three pure doubling steps; // this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. +// of the scalar. cmp swap, xzr mux_4(xn,xm,xn) @@ -749,12 +748,12 @@ curve25519_x25519_byte_alt_scalarloop: orr x1, x1, 0x10000 cmadd_4(e,p,d) mul_4(xn,s,d) - mul_p25519(zn,p,e) + mul_4(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// Prepare to call the modular inverse function to get xm = 1/zn +// Prepare to call the modular inverse function to get zn' = 1/zn - add x0, xm + add x0, zn add x1, zn // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -762,7 +761,7 @@ curve25519_x25519_byte_alt_scalarloop: // and reclaiming room on the stack. For more details and explanations see // "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 128 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn and zn. mov x20, x0 mov x10, #0xffffffffffffffed @@ -1793,28 +1792,13 @@ curve25519_x25519_byte_alt_invmidloop: stp x0, x1, [x4] stp x2, x5, [x4, #16] -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. 
This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - ldp x0, x1, [zn] - ldp x2, x3, [zn+16] - orr x0, x0, x1 - orr x2, x2, x3 - orr x4, x0, x2 - cmp x4, xzr - ldp x0, x1, [xn] - csel x0, x0, xzr, ne - csel x1, x1, xzr, ne - ldp x2, x3, [xn+16] - stp x0, x1, [xn] - csel x2, x2, xzr, ne - csel x3, x3, xzr, ne - stp x2, x3, [xn+16] - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. - mul_p25519(zn,xn,xm) + mul_p25519(zn,xn,zn) ldp x10, x11, [zn] strb w10, [resx] diff --git a/x86_att/curve25519/curve25519_x25519.S b/x86_att/curve25519/curve25519_x25519.S index 87e5e9cf62..b9f7cdaa16 100644 --- a/x86_att/curve25519/curve25519_x25519.S +++ b/x86_att/curve25519/curve25519_x25519.S @@ -755,8 +755,7 @@ curve25519_x25519_scalarloop: // Multiplex directly into (xn,zn) then do three pure doubling steps; // this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. +// of the scalar. movq swap, %rdx testq %rdx, %rdx @@ -788,12 +787,12 @@ curve25519_x25519_scalarloop: sub_twice4(p,s,d) cmadd_4(e,0x1db42,p,d) mul_4(xn,s,d) - mul_p25519(zn,p,e) + mul_4(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// Prepare to call the modular inverse function to get xm = 1/zn +// Prepare to call the modular inverse function to get zn' = 1/zn - leaq 256(%rsp), %rdi + leaq 224(%rsp), %rdi leaq 224(%rsp), %rsi // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -801,7 +800,7 @@ curve25519_x25519_scalarloop: // and reclaiming room on the stack. For more details and explanations see // "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 208 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn and zn. movq %rdi, 0xc0(%rsp) xorl %eax, %eax @@ -2149,31 +2148,14 @@ curve25519_x25519_midloop: movq %r14, 0x10(%rdi) movq %r15, 0x18(%rdi) -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - movq 224(%rsp), %rax - orq 232(%rsp), %rax - orq 240(%rsp), %rax - orq 248(%rsp), %rax - movq 320(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 320(%rsp) - movq 328(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 328(%rsp) - movq 336(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 336(%rsp) - movq 344(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 344(%rsp) - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. 
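Concretely, the multiplication just below computes resx = xn * (1/zn) mod p_25519, so in the degenerate case the result is xn * 0 = 0 whatever xn holds; this is why the explicit zeroing of xn deleted above is no longer needed.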
movq res, %rbp - mul_p25519(resx,xn,xm) + mul_p25519(resx,xn,zn) // Restore stack and registers diff --git a/x86_att/curve25519/curve25519_x25519_alt.S b/x86_att/curve25519/curve25519_x25519_alt.S index 4a63a55f11..f7c6c3d7b0 100644 --- a/x86_att/curve25519/curve25519_x25519_alt.S +++ b/x86_att/curve25519/curve25519_x25519_alt.S @@ -916,8 +916,7 @@ curve25519_x25519_alt_scalarloop: // Multiplex directly into (xn,zn) then do three pure doubling steps; // this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. +// of the scalar. movq swap, %rdx testq %rdx, %rdx @@ -949,12 +948,12 @@ curve25519_x25519_alt_scalarloop: sub_twice4(p,s,d) cmadd_4(e,0x1db42,p,d) mul_4(xn,s,d) - mul_p25519(zn,p,e) + mul_4(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// Prepare to call the modular inverse function to get xm = 1/zn +// Prepare to call the modular inverse function to get zn' = 1/zn - leaq 256(%rsp), %rdi + leaq 224(%rsp), %rdi leaq 224(%rsp), %rsi // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -962,7 +961,7 @@ curve25519_x25519_alt_scalarloop: // and reclaiming room on the stack. For more details and explanations see // "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 208 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn and zn. movq %rdi, 0xc0(%rsp) xorl %eax, %eax @@ -2310,31 +2309,14 @@ curve25519_x25519_alt_midloop: movq %r14, 0x10(%rdi) movq %r15, 0x18(%rdi) -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - movq 224(%rsp), %rax - orq 232(%rsp), %rax - orq 240(%rsp), %rax - orq 248(%rsp), %rax - movq 320(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 320(%rsp) - movq 328(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 328(%rsp) - movq 336(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 336(%rsp) - movq 344(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 344(%rsp) - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. movq res, %rbp - mul_p25519(resx,xn,xm) + mul_p25519(resx,xn,zn) // Restore stack and registers From 1f1c0ffc5aee07f0d5a5f1dc66473e78e389f8a8 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Tue, 5 Mar 2024 19:48:41 +0000 Subject: [PATCH 14/24] Update curve25519_x25519{_byte} to make AWS-LC's delocator work This patch performs a few syntactic updates to make AWS-LC's delocator work. 
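As the diff below shows, the updates are purely textual: immediate expressions such as #(1<<26)-1 and #0xfe-0xb4 become plain literals (#67108863 and #74) with the original expression kept as a trailing comment, and NEON arrangement suffixes are lower-cased (for example v22.2S becomes v22.2s), leaving the assembled instructions unchanged.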
s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/3b4f73ceed74790d7f005257a24b8f64a85e22bb --- arm/curve25519/curve25519_x25519.S | 928 ++++++++++++------------ arm/curve25519/curve25519_x25519_byte.S | 928 ++++++++++++------------ 2 files changed, 928 insertions(+), 928 deletions(-) diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S index eeefa69b0c..2a26dafc8f 100644 --- a/arm/curve25519/curve25519_x25519.S +++ b/arm/curve25519/curve25519_x25519.S @@ -166,12 +166,12 @@ S2N_BN_SYMBOL(curve25519_x25519): mov v31.d[0], x0 mov v31.d[1], xzr - mov x0, #(1<<26)-1 + mov x0, #67108863 // #(1<<26)-1 mov v30.d[0], x0 mov v30.d[1], x0 mov x0, #0x07fffffe07fffffe - sub x1, x0, #0xfe-0xb4 + sub x1, x0, #74 // #0xfe-0xb4 sub x0, x0, #2 stp x0, x1, [mask1] @@ -241,35 +241,35 @@ curve25519_x25519_scalarloop: // (x2',z2') = (x4,z4) // (x3',z3') = (x5,z5) - add v22.2S, v2.2S, v3.2S // ubignum_of_qreglist 1 // INTERMEDIATE a - sub v21.2S, v28.2S, v1.2S - add v25.2S, v0.2S, v1.2S // ubignum_of_qreglist 0 // INTERMEDIATE a - sub v24.2S, v29.2S, v3.2S - add v3.2S, v18.2S, v19.2S // ubignum_of_qreglist 4 // INTERMEDIATE c - add v0.2S, v0.2S, v21.2S // ubignum_of_qreglist 0 // INTERMEDIATE b - sub v20.2S, v29.2S, v15.2S - sub v1.2S, v29.2S, v5.2S - sub v26.2S, v28.2S, v11.2S - sub v21.2S, v29.2S, v19.2S - add v19.2S, v10.2S, v11.2S // ubignum_of_qreglist 0 // INTERMEDIATE c - add v11.2S, v14.2S, v20.2S // ubignum_of_qreglist 2 // INTERMEDIATE d - add v21.2S, v18.2S, v21.2S // ubignum_of_qreglist 4 // INTERMEDIATE d - sub v20.2S, v29.2S, v17.2S - add v18.2S, v2.2S, v24.2S // ubignum_of_qreglist 1 // INTERMEDIATE b - add v14.2S, v14.2S, v15.2S // ubignum_of_qreglist 2 // INTERMEDIATE c - add v15.2S, v16.2S, v17.2S // ubignum_of_qreglist 3 // INTERMEDIATE c - add v2.2S, v16.2S, v20.2S // ubignum_of_qreglist 3 // INTERMEDIATE d - add v24.2S, v12.2S, v13.2S // ubignum_of_qreglist 1 // INTERMEDIATE c - add v26.2S, v10.2S, v26.2S // ubignum_of_qreglist 0 // INTERMEDIATE d - sub v10.2S, v29.2S, v13.2S - sub v13.2S, v29.2S, v7.2S - add v23.2S, v6.2S, v7.2S // ubignum_of_qreglist 3 // INTERMEDIATE a - sub v7.2S, v29.2S, v9.2S - add v27.2S, v12.2S, v10.2S // ubignum_of_qreglist 1 // INTERMEDIATE d + add v22.2s, v2.2s, v3.2s // ubignum_of_qreglist 1 // INTERMEDIATE a + sub v21.2s, v28.2s, v1.2s + add v25.2s, v0.2s, v1.2s // ubignum_of_qreglist 0 // INTERMEDIATE a + sub v24.2s, v29.2s, v3.2s + add v3.2s, v18.2s, v19.2s // ubignum_of_qreglist 4 // INTERMEDIATE c + add v0.2s, v0.2s, v21.2s // ubignum_of_qreglist 0 // INTERMEDIATE b + sub v20.2s, v29.2s, v15.2s + sub v1.2s, v29.2s, v5.2s + sub v26.2s, v28.2s, v11.2s + sub v21.2s, v29.2s, v19.2s + add v19.2s, v10.2s, v11.2s // ubignum_of_qreglist 0 // INTERMEDIATE c + add v11.2s, v14.2s, v20.2s // ubignum_of_qreglist 2 // INTERMEDIATE d + add v21.2s, v18.2s, v21.2s // ubignum_of_qreglist 4 // INTERMEDIATE d + sub v20.2s, v29.2s, v17.2s + add v18.2s, v2.2s, v24.2s // ubignum_of_qreglist 1 // INTERMEDIATE b + add v14.2s, v14.2s, v15.2s // ubignum_of_qreglist 2 // INTERMEDIATE c + add v15.2s, v16.2s, v17.2s // ubignum_of_qreglist 3 // INTERMEDIATE c + add v2.2s, v16.2s, v20.2s // ubignum_of_qreglist 3 // INTERMEDIATE d + add v24.2s, v12.2s, v13.2s // ubignum_of_qreglist 1 // INTERMEDIATE c + add v26.2s, v10.2s, v26.2s // ubignum_of_qreglist 0 // INTERMEDIATE d + sub v10.2s, v29.2s, v13.2s + sub v13.2s, v29.2s, v7.2s + add v23.2s, v6.2s, v7.2s // ubignum_of_qreglist 3 // INTERMEDIATE a + sub v7.2s, v29.2s, v9.2s + add v27.2s, 
v12.2s, v10.2s // ubignum_of_qreglist 1 // INTERMEDIATE d fcsel d20, d22, d24, eq // ubignum_of_qreglist 1 // INTERMEDIATE f - add v28.2S, v4.2S, v5.2S // ubignum_of_qreglist 2 // INTERMEDIATE a + add v28.2s, v4.2s, v5.2s // ubignum_of_qreglist 2 // INTERMEDIATE a fcsel d12, d23, d15, eq // ubignum_of_qreglist 3 // INTERMEDIATE f - add v7.2S, v8.2S, v7.2S // ubignum_of_qreglist 4 // INTERMEDIATE b + add v7.2s, v8.2s, v7.2s // ubignum_of_qreglist 4 // INTERMEDIATE b fcsel d16, d25, d19, eq // ubignum_of_qreglist 0 // INTERMEDIATE f mov x0, v20.d[0] fcsel d5, d28, d14, eq // ubignum_of_qreglist 2 // INTERMEDIATE f @@ -279,589 +279,589 @@ curve25519_x25519_scalarloop: lsr x26, x0, #32 add x29, x21, x21 umull x15, w5, w29 - add v13.2S, v6.2S, v13.2S // ubignum_of_qreglist 3 // INTERMEDIATE b + add v13.2s, v6.2s, v13.2s // ubignum_of_qreglist 3 // INTERMEDIATE b add x12, x26, x26 mov x30, v5.d[0] fcsel d10, d18, d27, eq // ubignum_of_qreglist 1 // INTERMEDIATE g lsr x11, x5, #32 lsr x10, x30, #32 - trn2 v20.2S, v21.2S, v3.2S - add v9.2S, v8.2S, v9.2S // ubignum_of_qreglist 4 // INTERMEDIATE a + trn2 v20.2s, v21.2s, v3.2s + add v9.2s, v8.2s, v9.2s // ubignum_of_qreglist 4 // INTERMEDIATE a add x14, x11, x11 - trn2 v6.2S, v2.2S, v15.2S - trn1 v12.2S, v25.2S, v0.2S - add v1.2S, v4.2S, v1.2S // ubignum_of_qreglist 2 // INTERMEDIATE b - trn1 v16.2S, v23.2S, v13.2S + trn2 v6.2s, v2.2s, v15.2s + trn1 v12.2s, v25.2s, v0.2s + add v1.2s, v4.2s, v1.2s // ubignum_of_qreglist 2 // INTERMEDIATE b + trn1 v16.2s, v23.2s, v13.2s fcsel d8, d13, d2, eq // ubignum_of_qreglist 3 // INTERMEDIATE g - trn2 v17.2S, v27.2S, v24.2S + trn2 v17.2s, v27.2s, v24.2s str d29, [tmpb+32] add x17, x10, x10 - trn2 v4.2S, v28.2S, v1.2S - trn1 v5.2S, v28.2S, v1.2S - trn1 v28.2S, v2.2S, v15.2S - trn1 v2.2S, v22.2S, v18.2S + trn2 v4.2s, v28.2s, v1.2s + trn1 v5.2s, v28.2s, v1.2s + trn1 v28.2s, v2.2s, v15.2s + trn1 v2.2s, v22.2s, v18.2s fcsel d29, d0, d26, eq // ubignum_of_qreglist 0 // INTERMEDIATE g - trn2 v15.2S, v22.2S, v18.2S - umull v22.2D, v12.2S, v20.2S + trn2 v15.2s, v22.2s, v18.2s + umull v22.2d, v12.2s, v20.2s umull x22, w30, w17 stp d29, d10, [tmpb+0] - trn2 v10.2S, v23.2S, v13.2S - trn2 v23.2S, v11.2S, v14.2S - trn1 v13.2S, v27.2S, v24.2S + trn2 v10.2s, v23.2s, v13.2s + trn2 v23.2s, v11.2s, v14.2s + trn1 v13.2s, v27.2s, v24.2s fcsel d27, d1, d11, eq // ubignum_of_qreglist 2 // INTERMEDIATE g - trn1 v14.2S, v11.2S, v14.2S - umlal v22.2D, v2.2S, v6.2S + trn1 v14.2s, v11.2s, v14.2s + umlal v22.2d, v2.2s, v6.2s umull x25, w30, w30 - umlal v22.2D, v5.2S, v23.2S + umlal v22.2d, v5.2s, v23.2s add x3, x30, x30 - umlal v22.2D, v16.2S, v17.2S + umlal v22.2d, v16.2s, v17.2s add w30, w21, w21, lsl #1; stp d27, d8, [tmpb+16] add w30, w30, w21, lsl #4 - trn1 v11.2S, v26.2S, v19.2S - trn2 v8.2S, v26.2S, v19.2S - trn2 v19.2S, v25.2S, v0.2S - mul v29.2S, v20.2S, v31.2S + trn1 v11.2s, v26.2s, v19.2s + trn2 v8.2s, v26.2s, v19.2s + trn2 v19.2s, v25.2s, v0.2s + mul v29.2s, v20.2s, v31.2s ldr x20, [tmpb+24] - umull v25.2D, v19.2S, v6.2S + umull v25.2d, v19.2s, v6.2s add x1, x0, x0 - umull v27.2D, v19.2S, v23.2S + umull v27.2d, v19.2s, v23.2s umull x9, w5, w1 - umull v0.2D, v12.2S, v23.2S + umull v0.2d, v12.2s, v23.2s lsr x24, x20, #32 - mul v20.2S, v23.2S, v31.2S + mul v20.2s, v23.2s, v31.2s lsr x16, x21, #32 - umlal v25.2D, v15.2S, v23.2S + umlal v25.2d, v15.2s, v23.2s umaddl x13, w11, w14, x9 - umlal v25.2D, v4.2S, v17.2S + umlal v25.2d, v4.2s, v17.2s umaddl x9, w14, w17, x15 - umull v24.2D, v12.2S, v6.2S + umull v24.2d, v12.2s, v6.2s add w2, 
w16, w16, lsl #1; fcsel d26, d9, d3, eq // ubignum_of_qreglist 4 // INTERMEDIATE f add w2, w2, w16, lsl #4 - trn1 v18.2S, v21.2S, v3.2S - umull v3.2D, v19.2S, v29.2S + trn1 v18.2s, v21.2s, v3.2s + umull v3.2d, v19.2s, v29.2s umull x28, w5, w3 - mul v1.2S, v6.2S, v31.2S + mul v1.2s, v6.2s, v31.2s umull x8, w5, w5 - umlal v24.2D, v2.2S, v23.2S + umlal v24.2d, v2.2s, v23.2s umaddl x13, w21, w30, x13 - mul v23.2S, v17.2S, v31.2S + mul v23.2s, v17.2s, v31.2s umaddl x27, w14, w12, x28 - trn2 v6.2S, v9.2S, v7.2S + trn2 v6.2s, v9.2s, v7.2s mov x6, v26.d[0] - umlal v3.2D, v15.2S, v1.2S + umlal v3.2d, v15.2s, v1.2s add x16, x16, x16 - umlal v3.2D, v4.2S, v20.2S + umlal v3.2d, v4.2s, v20.2s lsr x4, x6, #32 - umlal v3.2D, v10.2S, v23.2S + umlal v3.2d, v10.2s, v23.2s add x7, x6, x6 - umull v26.2D, v19.2S, v8.2S + umull v26.2d, v19.2s, v8.2s add x23, x4, x4 umaddl x28, w5, w23, x22 - trn1 v7.2S, v9.2S, v7.2S - umlal v27.2D, v15.2S, v17.2S + trn1 v7.2s, v9.2s, v7.2s + umlal v27.2d, v15.2s, v17.2s add w15, w4, w4, lsl #1; - umlal v27.2D, v4.2S, v8.2S + umlal v27.2d, v4.2s, v8.2s add w15, w15, w4, lsl #4 add w22, w10, w10, lsl #1; - umlal v24.2D, v5.2S, v17.2S + umlal v24.2d, v5.2s, v17.2s add w22, w22, w10, lsl #4 umaddl x10, w11, w7, x28 - umlal v25.2D, v10.2S, v8.2S + umlal v25.2d, v10.2s, v8.2s umull x21, w5, w16 - umlal v25.2D, v6.2S, v29.2S + umlal v25.2d, v6.2s, v29.2s umaddl x23, w15, w23, x25 - umlal v27.2D, v10.2S, v29.2S + umlal v27.2d, v10.2s, v29.2s umull x19, w5, w12 - umlal v27.2D, v6.2S, v1.2S + umlal v27.2d, v6.2s, v1.2s umaddl x25, w11, w29, x21 - umlal v0.2D, v2.2S, v17.2S + umlal v0.2d, v2.2s, v17.2s umaddl x28, w0, w3, x9 - shl v21.2D, v25.2D, #1 + shl v21.2d, v25.2d, #1 umaddl x4, w11, w1, x19 umaddl x21, w2, w29, x4 - mul v25.2S, v8.2S, v31.2S - umlal v24.2D, v16.2S, v8.2S + mul v25.2s, v8.2s, v31.2s + umlal v24.2d, v16.2s, v8.2s umaddl x19, w0, w17, x25 - umlal v24.2D, v7.2S, v29.2S + umlal v24.2d, v7.2s, v29.2s umull x25, w5, w17 - umlal v24.2D, v19.2S, v28.2S + umlal v24.2d, v19.2s, v28.2s umaddl x4, w0, w16, x10 - umull v9.2D, v12.2S, v8.2S + umull v9.2d, v12.2s, v8.2s umaddl x23, w5, w7, x23 - umlal v21.2D, v12.2S, v18.2S + umlal v21.2d, v12.2s, v18.2s add w10, w6, w6, lsl #1; - shl v27.2D, v27.2D, #1 + shl v27.2d, v27.2d, #1 add w10, w10, w6, lsl #4 umaddl x28, w26, w12, x28 - umlal v26.2D, v15.2S, v29.2S + umlal v26.2d, v15.2s, v29.2s umaddl x9, w14, w16, x23 - umlal v9.2D, v2.2S, v29.2S + umlal v9.2d, v2.2s, v29.2s umaddl x22, w22, w17, x8 - umlal v21.2D, v2.2S, v28.2S + umlal v21.2d, v2.2s, v28.2s umaddl x28, w6, w10, x28 umaddl x27, w0, w0, x27 add x8, x14, x14 - umlal v0.2D, v5.2S, v8.2S + umlal v0.2d, v5.2s, v8.2s umull x5, w5, w14 - umlal v9.2D, v5.2S, v1.2S + umlal v9.2d, v5.2s, v1.2s umaddl x14, w0, w29, x9 - umlal v26.2D, v4.2S, v1.2S + umlal v26.2d, v4.2s, v1.2s umaddl x6, w2, w16, x27 - umlal v22.2D, v7.2S, v8.2S + umlal v22.2d, v7.2s, v8.2s umaddl x5, w30, w17, x5 umaddl x5, w2, w3, x5 add x23, x17, x17 - umlal v27.2D, v12.2S, v28.2S + umlal v27.2d, v12.2s, v28.2s umaddl x13, w2, w23, x13 - umlal v26.2D, v10.2S, v20.2S + umlal v26.2d, v10.2s, v20.2s add x9, x12, x12 - umlal v9.2D, v16.2S, v20.2S + umlal v9.2d, v16.2s, v20.2s umaddl x27, w10, w29, x6 - umlal v0.2D, v16.2S, v29.2S + umlal v0.2d, v16.2s, v29.2s umaddl x6, w11, w3, x25 - umlal v22.2D, v19.2S, v18.2S + umlal v22.2d, v19.2s, v18.2s umaddl x19, w26, w3, x19 - mul v18.2S, v18.2S, v31.2S + mul v18.2s, v18.2s, v31.2s umaddl x23, w15, w23, x27 - umlal v3.2D, v6.2S, v25.2S + umlal v3.2d, v6.2s, v25.2s umaddl 
x0, w0, w12, x6 - umlal v0.2D, v7.2S, v1.2S + umlal v0.2d, v7.2s, v1.2s add x11, x16, x16 - umlal v9.2D, v7.2S, v23.2S + umlal v9.2d, v7.2s, v23.2s umaddl x6, w12, w17, x14 - umlal v9.2D, v19.2S, v11.2S + umlal v9.2d, v19.2s, v11.2s umaddl x25, w26, w29, x4 - umlal v9.2D, v15.2S, v18.2S + umlal v9.2d, v15.2s, v18.2s umaddl x14, w10, w3, x13 - umull v25.2D, v12.2S, v17.2S + umull v25.2d, v12.2s, v17.2s umaddl x27, w10, w16, x0 - umlal v26.2D, v6.2S, v23.2S + umlal v26.2d, v6.2s, v23.2s add x0, x25, x6, lsr #26 - mul v23.2S, v28.2S, v31.2S + mul v23.2s, v28.2s, v31.2s umaddl x12, w10, w12, x5 - shl v3.2D, v3.2D, #1 + shl v3.2d, v3.2d, #1 add x16, x22, x0, lsr #25 - umlal v21.2D, v5.2S, v14.2S + umlal v21.2d, v5.2s, v14.2s bic x22, x0, #0x1ffffff - umlal v3.2D, v12.2S, v11.2S + umlal v3.2d, v12.2s, v11.2s add x26, x16, x22, lsr #24 - umlal v3.2D, v2.2S, v18.2S + umlal v3.2d, v2.2s, v18.2s umaddl x16, w10, w17, x21 - umlal v3.2D, v5.2S, v23.2S + umlal v3.2d, v5.2s, v23.2s add x22, x26, x22, lsr #21 - umlal v9.2D, v4.2S, v23.2S + umlal v9.2d, v4.2s, v23.2s umaddl x5, w15, w29, x27 - umull v17.2D, v19.2S, v17.2S + umull v17.2d, v19.2s, v17.2s umaddl x17, w30, w3, x22 - umlal v25.2D, v2.2S, v8.2S + umlal v25.2d, v2.2s, v8.2s umaddl x25, w15, w3, x16 - umlal v25.2D, v5.2S, v29.2S + umlal v25.2d, v5.2s, v29.2s umaddl x26, w15, w7, x19 - umlal v0.2D, v19.2S, v14.2S + umlal v0.2d, v19.2s, v14.2s umaddl x17, w2, w9, x17 - umlal v17.2D, v15.2S, v8.2S + umlal v17.2d, v15.2s, v8.2s ldr x19, [tmpb+0] - umlal v17.2D, v4.2S, v29.2S + umlal v17.2d, v4.2s, v29.2s ldr x7, [tmpb+8] - shl v29.2D, v26.2D, #1 + shl v29.2d, v26.2d, #1 umaddl x13, w10, w1, x17 - umlal v0.2D, v15.2S, v13.2S + umlal v0.2d, v15.2s, v13.2s lsr x2, x19, #32 - umlal v29.2D, v12.2S, v13.2S + umlal v29.2d, v12.2s, v13.2s umaddl x27, w15, w1, x12 - umlal v29.2D, v2.2S, v11.2S + umlal v29.2d, v2.2s, v11.2s umaddl x30, w15, w8, x13 - umlal v29.2D, v5.2S, v18.2S + umlal v29.2d, v5.2s, v18.2s add x4, x7, x7 - umlal v29.2D, v16.2S, v23.2S + umlal v29.2d, v16.2s, v23.2s umaddl x29, w15, w9, x14 - umlal v0.2D, v4.2S, v11.2S + umlal v0.2d, v4.2s, v11.2s add x17, x27, x30, lsr #26 - umlal v0.2D, v10.2S, v18.2S + umlal v0.2d, v10.2s, v18.2s umaddl x16, w15, w11, x28 - umlal v0.2D, v6.2S, v23.2S + umlal v0.2d, v6.2s, v23.2s add x1, x29, x17, lsr #25 - umlal v25.2D, v16.2S, v1.2S + umlal v25.2d, v16.2s, v1.2s umull x11, w19, w4 ldr x8, [tmpb+32] - mul v26.2S, v14.2S, v31.2S - umlal v17.2D, v10.2S, v1.2S + mul v26.2s, v14.2s, v31.2s + umlal v17.2d, v10.2s, v1.2s ldr x15, [tmpb+16] - umlal v17.2D, v6.2S, v20.2S + umlal v17.2d, v6.2s, v20.2s and x9, x30, #0x3ffffff bfi x9, x17, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE aa add x17, x2, x2 lsr x10, x15, #32 add x27, x25, x1, lsr #26 - umlal v25.2D, v7.2S, v20.2S + umlal v25.2d, v7.2s, v20.2s add x13, x10, x10 - umlal v25.2D, v19.2S, v13.2S + umlal v25.2d, v19.2s, v13.2s add x29, x23, x27, lsr #25 - umlal v25.2D, v15.2S, v11.2S + umlal v25.2d, v15.2s, v11.2s lsr x30, x8, #32 - umlal v25.2D, v4.2S, v18.2S + umlal v25.2d, v4.2s, v18.2s add x23, x5, x29, lsr #26 - umlal v25.2D, v10.2S, v23.2S + umlal v25.2d, v10.2s, v23.2s and x14, x29, #0x3ffffff - umlal v25.2D, v6.2S, v26.2S + umlal v25.2d, v6.2s, v26.2s add x5, x16, x23, lsr #25 - shl v8.2D, v17.2D, #1 + shl v8.2d, v17.2d, #1 umaddl x12, w2, w17, x11 and x29, x5, #0x3ffffff umull x21, w19, w19 - umlal v29.2D, v7.2S, v26.2S + umlal v29.2d, v7.2s, v26.2s add w16, w10, w10, lsl #1; - umlal v3.2D, v16.2S, v26.2S + umlal v3.2d, v16.2s, v26.2s add w16, w16, 
w10, lsl #4 bfi x14, x23, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE aa add w10, w24, w24, lsl #1; add x22, x26, x5, lsr #26 add w10, w10, w24, lsl #4 - umlal v8.2D, v12.2S, v14.2S + umlal v8.2d, v12.2s, v14.2s umaddl x25, w16, w13, x21 - umlal v8.2D, v2.2S, v13.2S + umlal v8.2d, v2.2s, v13.2s bfi x29, x22, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE aa - umlal v8.2D, v5.2S, v11.2S + umlal v8.2d, v5.2s, v11.2s add x26, x24, x24 - umlal v8.2D, v16.2S, v18.2S + umlal v8.2d, v16.2s, v18.2s stp x14, x29, [tmpa+16] - umlal v8.2D, v7.2S, v23.2S + umlal v8.2d, v7.2s, v23.2s add w24, w30, w30, lsl #1; - usra v25.2D, v29.2D, #26 + usra v25.2d, v29.2d, #26 add w24, w24, w30, lsl #4 umull x29, w15, w15 - umlal v27.2D, v2.2S, v14.2S + umlal v27.2d, v2.2s, v14.2s umull x3, w15, w13 - umlal v27.2D, v5.2S, v13.2S + umlal v27.2d, v5.2s, v13.2s add x21, x20, x20 - umlal v24.2D, v15.2S, v14.2S + umlal v24.2d, v15.2s, v14.2s umull x5, w19, w21 - umlal v24.2D, v4.2S, v13.2S + umlal v24.2d, v4.2s, v13.2s and x11, x1, #0x3ffffff - usra v8.2D, v25.2D, #25 + usra v8.2d, v25.2d, #25 and x1, x0, #0x1ffffff - umlal v27.2D, v16.2S, v11.2S + umlal v27.2d, v16.2s, v11.2s umaddl x23, w17, w13, x5 - umlal v27.2D, v7.2S, v18.2S + umlal v27.2d, v7.2s, v18.2s add x5, x30, x30 - usra v0.2D, v8.2D, #26 + usra v0.2d, v8.2d, #26 add x0, x15, x15 - umlal v24.2D, v10.2S, v11.2S + umlal v24.2d, v10.2s, v11.2s umaddl x23, w7, w0, x23 - umlal v24.2D, v6.2S, v18.2S + umlal v24.2d, v6.2s, v18.2s lsr x30, x7, #32 - usra v27.2D, v0.2D, #25 + usra v27.2d, v0.2d, #25 add x16, x30, x30 - and v20.16B, v8.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad + and v20.16b, v8.16b, v30.16b // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad umaddl x15, w30, w16, x23 - ushr v23.2D, v30.2D, #1 + ushr v23.2d, v30.2d, #1 add w23, w8, w8, lsl #1; - usra v24.2D, v27.2D, #26 + usra v24.2d, v27.2d, #26 add w23, w23, w8, lsl #4 umaddl x14, w19, w5, x3 - and v8.16B, v27.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = bc|ad + and v8.16b, v27.16b, v30.16b // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = bc|ad add x28, x8, x8 - and v27.16B, v0.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad + and v27.16b, v0.16b, v23.16b // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad umaddl x8, w8, w23, x15 - and v5.16B, v24.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad + and v5.16b, v24.16b, v23.16b // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad umaddl x3, w2, w28, x14 - umlal v22.2D, v15.2S, v28.2S + umlal v22.2d, v15.2s, v28.2s bfi x11, x27, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE aa - uzp1 v5.4S, v8.4S, v5.4S + uzp1 v5.4s, v8.4s, v5.4s umaddl x14, w24, w5, x29 umaddl x5, w19, w28, x14 ldr d18, [mask1] mov v18.d[1], v18.d[0] umaddl x15, w7, w26, x3 - mul v12.2S, v13.2S, v31.2S - umlal v21.2D, v16.2S, v13.2S + mul v12.2s, v13.2s, v31.2s + umlal v21.2d, v16.2s, v13.2s stp x9, x11, [tmpa+0] - umlal v21.2D, v7.2S, v11.2S + umlal v21.2d, v7.2s, v11.2s umaddl x29, w17, w26, x5 - umlal v22.2D, v4.2S, v14.2S + umlal v22.2d, v4.2s, v14.2s add w14, w20, w20, lsl #1; - umlal v22.2D, v10.2S, v13.2S + umlal v22.2d, v10.2s, v13.2s add w14, w14, w20, lsl #4 umull x3, w19, w0 - umlal v22.2D, v6.2S, v11.2S + umlal v22.2d, v6.2s, v11.2s umaddl x29, w7, w21, x29 - usra v21.2D, v24.2D, #25 + usra v21.2d, 
v24.2d, #25 umaddl x11, w20, w14, x12 - and v0.16B, v25.16B, v23.16B + and v0.16b, v25.16b, v23.16b umaddl x5, w30, w21, x15 - and v14.16B, v29.16B, v30.16B + and v14.16b, v29.16b, v30.16b umaddl x12, w16, w13, x29 - usra v22.2D, v21.2D, #26 + usra v22.2d, v21.2d, #26 umaddl x29, w17, w16, x3 - umlal v3.2D, v7.2S, v12.2S + umlal v3.2d, v7.2s, v12.2s add x9, x26, x26 - and v1.16B, v21.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = bc|ad + and v1.16b, v21.16b, v30.16b // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = bc|ad add x27, x5, x12, lsr #26 - bic v8.16B, v22.16B, v23.16B + bic v8.16b, v22.16b, v23.16b umaddl x29, w7, w7, x29 - and v17.16B, v22.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad + and v17.16b, v22.16b, v23.16b // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad add x5, x25, x27, lsr #25 - usra v3.2D, v8.2D, #25 + usra v3.2d, v8.2d, #25 umaddl x25, w24, w9, x8 - umlal v9.2D, v10.2S, v26.2S + umlal v9.2d, v10.2s, v26.2s add x8, x13, x13 - trn1 v22.4S, v1.4S, v17.4S + trn1 v22.4s, v1.4s, v17.4s umaddl x11, w10, w8, x11 - usra v3.2D, v8.2D, #24 + usra v3.2d, v8.2d, #24 umull x20, w19, w16 - add v26.2S, v22.2S, v18.2S + add v26.2s, v22.2s, v18.2s ldr d28, [mask2] - umlal v9.2D, v6.2S, v12.2S + umlal v9.2d, v6.2s, v12.2s umaddl x3, w23, w0, x11 - usra v3.2D, v8.2D, #21 + usra v3.2d, v8.2d, #21 umaddl x29, w10, w26, x29 - uzp1 v11.4S, v20.4S, v27.4S + uzp1 v11.4s, v20.4s, v27.4s umaddl x20, w2, w4, x20 umaddl x9, w10, w21, x20 mov v17.d[0], v22.d[1] - usra v9.2D, v3.2D, #26 + usra v9.2d, v3.2d, #26 umull x15, w19, w13 - and v7.16B, v3.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad + and v7.16b, v3.16b, v30.16b // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad add x11, x16, x16 - uzp2 v1.4S, v11.4S, v5.4S + uzp2 v1.4s, v11.4s, v5.4s umaddl x20, w23, w13, x9 - and v8.16B, v9.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad + and v8.16b, v9.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad umaddl x9, w2, w0, x15 - usra v14.2D, v9.2D, #25 + usra v14.2d, v9.2d, #25 and x6, x6, #0x3ffffff - uzp1 v7.4S, v7.4S, v8.4S + uzp1 v7.4s, v7.4s, v8.4s umaddl x29, w23, w21, x29 - uzp1 v27.4S, v11.4S, v5.4S + uzp1 v27.4s, v11.4s, v5.4s umull x15, w19, w26 - usra v0.2D, v14.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad + usra v0.2d, v14.2d, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad add x6, x6, x22, lsr #25 - and v3.16B, v14.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad + and v3.16b, v14.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad bic x22, x27, #0x1ffffff - sub v2.2S, v26.2S, v17.2S - add v9.2S, v22.2S, v17.2S - uzp1 v14.4S, v3.4S, v0.4S + sub v2.2s, v26.2s, v17.2s + add v9.2s, v22.2s, v17.2s + uzp1 v14.4s, v3.4s, v0.4s umaddl x2, w2, w21, x15 - add v5.4S, v27.4S, v18.4S + add v5.4s, v27.4s, v18.4s add x5, x5, x22, lsr #24 - zip1 v22.2S, v2.2S, v9.2S // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 + zip1 v22.2s, v2.2s, v9.2s // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 mov v18.b[0], v28.b[0] - uzp1 v8.4S, v7.4S, v14.4S + uzp1 v8.4s, v7.4s, v14.4s add x22, x5, x22, lsr #21 - uzp2 v3.4S, v7.4S, 
v14.4S + uzp2 v3.4s, v7.4s, v14.4s umaddl x5, w7, w16, x9 - add v25.4S, v8.4S, v18.4S + add v25.4s, v8.4s, v18.4s umaddl x15, w14, w0, x22 - add v12.4S, v27.4S, v1.4S + add v12.4s, v27.4s, v1.4s add x9, x17, x17 - sub v14.4S, v5.4S, v1.4S + sub v14.4s, v5.4s, v1.4s umull x19, w19, w17 - sub v18.4S, v25.4S, v3.4S + sub v18.4s, v25.4s, v3.4s ldr x22, [tmpa+8] - add v20.4S, v8.4S, v3.4S + add v20.4s, v8.4s, v3.4s umaddl x15, w10, w11, x15 - zip1 v16.4S, v14.4S, v12.4S // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 + zip1 v16.4s, v14.4s, v12.4s // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 umaddl x14, w14, w13, x19 - zip2 v14.4S, v14.4S, v12.4S // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 + zip2 v14.4s, v14.4s, v12.4s // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 and x17, x27, #0x1ffffff - zip2 v0.4S, v18.4S, v20.4S // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 + zip2 v0.4s, v18.4s, v20.4s // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 umaddl x15, w23, w4, x15 - zip1 v1.4S, v18.4S, v20.4S // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 + zip1 v1.4s, v18.4s, v20.4s // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 umaddl x10, w10, w0, x14 - zip2 v5.2S, v2.2S, v9.2S // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 - shl v24.2S, v0.2S, #1 + zip2 v5.2s, v2.2s, v9.2s // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 + shl v24.2s, v0.2s, #1 mov v19.d[0], v1.d[1] // ubignum_of_h32reglist 1 + ubignum_of_l32reglist 1 // INTERMEDIATE H|L = t1|t2 - shl v26.2S, v22.2S, #1 - shl v17.2S, v16.2S, #1 + shl v26.2s, v22.2s, #1 + shl v17.2s, v16.2s, #1 mov v15.d[0], v0.d[1] // ubignum_of_h32reglist 3 + ubignum_of_l32reglist 3 // INTERMEDIATE H|L = t1|t2 - shl v7.2S, v5.2S, #1 - shl v18.2S, v19.2S, #1 - umull v11.2D, v1.2S, v24.2S + shl v7.2s, v5.2s, #1 + shl v18.2s, v19.2s, #1 + umull v11.2d, v1.2s, v24.2s umaddl x19, w23, w16, x10 - umull v6.2D, v1.2S, v17.2S + umull v6.2d, v1.2s, v17.2s umaddl x10, w7, w13, x2 mov v4.d[0], v16.d[1] // ubignum_of_h32reglist 5 + ubignum_of_l32reglist 5 // INTERMEDIATE H|L = t1|t2 mov v10.d[0], v14.d[1] // ubignum_of_h32reglist 7 + ubignum_of_l32reglist 7 // INTERMEDIATE H|L = t1|t2 - umull v9.2D, v1.2S, v26.2S + umull v9.2d, v1.2s, v26.2s ldr x13, [tmpa+0] - shl v28.2S, v15.2S, #1 - shl v3.2S, v10.2S, #1 + shl v28.2s, v15.2s, #1 + shl v3.2s, v10.2s, #1 ldr x14, [tmpa+16] - mul v12.2S, v10.2S, v31.2S - umull v25.2D, v1.2S, v7.2S + mul v12.2s, v10.2s, v31.2s + umull v25.2d, v1.2s, v7.2s ldr x2, [tmpa+24] - umlal v6.2D, v18.2S, v28.2S + umlal v6.2d, v18.2s, v28.2s umaddl x27, w30, w0, x10 umaddl x16, w24, w0, x20 - shl v13.2S, v14.2S, #1 + shl v13.2s, v14.2s, #1 umaddl x5, w23, w26, x5 - mul v2.2S, v22.2S, v31.2S - umull v21.2D, v1.2S, v13.2S + mul v2.2s, v22.2s, v31.2s + umull v21.2d, v1.2s, v13.2s umaddl x23, w24, w8, x29 - umlal v11.2D, v18.2S, v19.2S + umlal v11.2d, v18.2s, v19.2s mov x10, #0x07fffffe07fffffe sub x10, x10, #2 umaddl x26, w24, w21, x5 - mul v29.2S, v14.2S, v31.2S - umlal v25.2D, v19.2S, v26.2S + mul v29.2s, v14.2s, v31.2s + umlal v25.2d, v19.2s, v26.2s add x7, x1, x6, lsr #26 - mul v20.2S, v4.2S, v31.2S + mul v20.2s, v4.2s, v31.2s and x6, x6, #0x3ffffff - shl v8.2S, v18.2S, #1 - shl v4.2S, v4.2S, #1 - umlal v11.2D, v29.2S, v14.2S + 
shl v8.2s, v18.2s, #1 + shl v4.2s, v4.2s, #1 + umlal v11.2d, v29.2s, v14.2s bfi x6, x7, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE aa - umlal v25.2D, v0.2S, v3.2S + umlal v25.2d, v0.2s, v3.2s umaddl x0, w24, w4, x19 - umlal v25.2D, v15.2S, v13.2S + umlal v25.2d, v15.2s, v13.2s str x6, [tmpa+32] - umlal v21.2D, v18.2S, v4.2S + umlal v21.2d, v18.2s, v4.2s umaddl x8, w24, w11, x3 - umlal v21.2D, v0.2S, v17.2S + umlal v21.2d, v0.2s, v17.2s ldr x30, [tmpa+32] - mul v14.2S, v5.2S, v31.2S + mul v14.2s, v5.2s, v31.2s add x2, x2, x10 - shl v5.2S, v28.2S, #1 - shl v27.2S, v4.2S, #1 - umlal v6.2D, v0.2S, v0.2S + shl v5.2s, v28.2s, #1 + shl v27.2s, v4.2s, #1 + umlal v6.2d, v0.2s, v0.2s umaddl x11, w24, w9, x15 - umlal v6.2D, v12.2S, v3.2S + umlal v6.2d, v12.2s, v3.2s add x4, x30, x10 - umlal v11.2D, v14.2S, v5.2S + umlal v11.2d, v14.2s, v5.2s add x3, x22, x10 - umlal v11.2D, v2.2S, v17.2S + umlal v11.2d, v2.2s, v17.2s add x6, x0, x11, lsr #26 - umlal v11.2D, v12.2S, v27.2S + umlal v11.2d, v12.2s, v27.2s add x14, x14, x10 - umlal v6.2D, v14.2S, v27.2S + umlal v6.2d, v14.2s, v27.2s add x8, x8, x6, lsr #25 - umlal v6.2D, v2.2S, v13.2S + umlal v6.2d, v2.2s, v13.2s movk x10, #0xffb4 - umlal v25.2D, v16.2S, v4.2S + umlal v25.2d, v16.2s, v4.2s add x29, x16, x8, lsr #26 - umull v27.2D, v1.2S, v3.2S + umull v27.2d, v1.2s, v3.2s and x11, x11, #0x3ffffff - umlal v9.2D, v18.2S, v3.2S + umlal v9.2d, v18.2s, v3.2s add x19, x13, x10 - umlal v9.2D, v0.2S, v13.2S + umlal v9.2d, v0.2s, v13.2s and x5, x8, #0x3ffffff - umlal v9.2D, v28.2S, v4.2S + umlal v9.2d, v28.2s, v4.2s bfi x11, x6, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE bb - umlal v9.2D, v16.2S, v16.2S + umlal v9.2d, v16.2s, v16.2s umaddl x30, w24, w28, x27 - umlal v9.2D, v14.2S, v7.2S + umlal v9.2d, v14.2s, v7.2s sub x13, x19, x11 - umull v10.2D, v1.2S, v18.2S + umull v10.2d, v1.2s, v18.2s add x7, x23, x29, lsr #25 - umlal v21.2D, v28.2S, v15.2S + umlal v21.2d, v28.2s, v15.2s lsr x16, x13, #32 // ubignum_of_wreglist 1 + ubignum_of_wreglist 0 // INTERMEDIATE e - umlal v21.2D, v2.2S, v22.2S + umlal v21.2d, v2.2s, v22.2s add x0, x26, x7, lsr #26 - usra v25.2D, v9.2D, #26 + usra v25.2d, v9.2d, #26 and x20, x7, #0x3ffffff - umull v22.2D, v1.2S, v1.2S + umull v22.2d, v1.2s, v1.2s add x8, x25, x0, lsr #25 - umull v7.2D, v1.2S, v28.2S + umull v7.2d, v1.2s, v28.2s and x1, x29, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bbalt - bic v18.16B, v25.16B, v23.16B + bic v18.16b, v25.16b, v23.16b and x19, x8, #0x3ffffff - and v16.16B, v9.16B, v30.16B + and v16.16b, v9.16b, v30.16b and x7, x12, #0x3ffffff - usra v22.2D, v18.2D, #25 + usra v22.2d, v18.2d, #25 add x10, x30, x8, lsr #26 - umlal v7.2D, v19.2S, v24.2S + umlal v7.2d, v19.2s, v24.2s bfi x5, x29, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE bb - and v9.16B, v25.16B, v23.16B + and v9.16b, v25.16b, v23.16b add x27, x7, x10, lsr #25 - usra v22.2D, v18.2D, #24 + usra v22.2d, v18.2d, #24 mov x21, #60833 lsl x21, x21, #1 add x15, x17, x27, lsr #26 - shl v25.2S, v3.2S, #1 - umlal v7.2D, v14.2S, v17.2S + shl v25.2s, v3.2s, #1 + umlal v7.2d, v14.2s, v17.2s and x29, x27, #0x3ffffff - usra v22.2D, v18.2D, #21 + usra v22.2d, v18.2d, #21 bfi x29, x15, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE bb // ***SOURCE*** ubignum_of_xreglist 9 // INTERMEDIATE bbalt - umlal v10.2D, v14.2S, v24.2S + umlal v10.2d, v14.2s, v24.2s and x17, x6, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bbalt - umlal v10.2D, v2.2S, v28.2S + umlal v10.2d, v2.2s, v28.2s sub x6, x3, x5 - umlal v10.2D, v12.2S, v17.2S + umlal 
v10.2d, v12.2s, v17.2s umaddl x25, w16, w21, x17 - umlal v10.2D, v29.2S, v4.2S + umlal v10.2d, v29.2s, v4.2s mov w12, w5 // ubignum_of_xreglist 2 // INTERMEDIATE bbalt - umlal v22.2D, v20.2S, v4.2S + umlal v22.2d, v20.2s, v4.2s lsr x26, x6, #32 // ubignum_of_wreglist 3 + ubignum_of_wreglist 2 // INTERMEDIATE e - umlal v22.2D, v14.2S, v8.2S + umlal v22.2d, v14.2s, v8.2s and x24, x0, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bbalt - umlal v22.2D, v2.2S, v24.2S + umlal v22.2d, v2.2s, v24.2s stp x11, x5, [tmpb+0] - umlal v22.2D, v12.2S, v5.2S + umlal v22.2d, v12.2s, v5.2s bfi x20, x0, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE bb - umlal v22.2D, v29.2S, v17.2S + umlal v22.2d, v29.2s, v17.2s umaddl x12, w6, w21, x12 - umull v18.2D, v1.2S, v4.2S + umull v18.2d, v1.2s, v4.2s bfi x19, x10, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE bb - umlal v7.2D, v2.2S, v4.2S + umlal v7.2d, v2.2s, v4.2s sub x7, x14, x20 - umlal v27.2D, v19.2S, v13.2S + umlal v27.2d, v19.2s, v13.2s mov w8, w20 // ubignum_of_xreglist 4 // INTERMEDIATE bbalt - usra v10.2D, v22.2D, #26 + usra v10.2d, v22.2d, #26 lsr x14, x7, #32 // ubignum_of_wreglist 5 + ubignum_of_wreglist 4 // INTERMEDIATE e - umlal v18.2D, v19.2S, v17.2S + umlal v18.2d, v19.2s, v17.2s and x28, x10, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bbalt - umlal v7.2D, v12.2S, v13.2S + umlal v7.2d, v12.2s, v13.2s sub x5, x2, x19 - usra v11.2D, v10.2D, #25 + usra v11.2d, v10.2d, #25 mov w2, w19 // ubignum_of_xreglist 6 // INTERMEDIATE bbalt - umlal v27.2D, v0.2S, v4.2S - umlal v21.2D, v14.2S, v25.2S + umlal v27.2d, v0.2s, v4.2s + umlal v21.2d, v14.2s, v25.2s sub x23, x4, x29 - usra v7.2D, v11.2D, #26 + usra v7.2d, v11.2d, #26 mov w0, w29 // ubignum_of_xreglist 8 // INTERMEDIATE bbalt - umlal v18.2D, v0.2S, v28.2S + umlal v18.2d, v0.2s, v28.2s lsr x22, x23, #32 // ubignum_of_wreglist 9 + ubignum_of_wreglist 8 // INTERMEDIATE e - umlal v27.2D, v15.2S, v17.2S + umlal v27.2d, v15.2s, v17.2s str x29, [tmpb+32] - usra v6.2D, v7.2D, #25 + usra v6.2d, v7.2d, #25 mov w17, w11 // ubignum_of_xreglist 0 // INTERMEDIATE bbalt - and v0.16B, v22.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 + and v0.16b, v22.16b, v30.16b // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 umaddl x27, w26, w21, x1 - umlal v18.2D, v14.2S, v13.2S + umlal v18.2d, v14.2s, v13.2s umaddl x30, w23, w21, x0 - umlal v18.2D, v2.2S, v3.2S + umlal v18.2d, v2.2s, v3.2s lsr x10, x5, #32 // ubignum_of_wreglist 7 + ubignum_of_wreglist 6 // INTERMEDIATE e - and v4.16B, v6.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 - and v1.16B, v10.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x5|t3 + and v4.16b, v6.16b, v30.16b // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 + and v1.16b, v10.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x5|t3 umaddl x4, w14, w21, x24 ldr x0, [tmpa+0] mov v0.s[1], w0 lsr x0, x0, #32 mov v1.s[1], w0 umaddl x9, w7, w21, x8 - usra v18.2D, v6.2D, #26 + usra v18.2d, v6.2d, #26 umaddl x24, w10, w21, x28 - and v3.16B, v7.16B, v23.16B // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 + and v3.16b, v7.16b, v23.16b // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 umaddl x8, w22, w21, x15 - umlal v27.2D, v14.2S, v26.2S + umlal v27.2d, v14.2s, v26.2s umaddl x15, w13, w21, x17 - usra v21.2D, v18.2D, #25 + usra 
v21.2d, v18.2d, #25 stp x20, x19, [tmpb+16] - and v2.16B, v11.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 + and v2.16b, v11.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 lsr x29, x8, #25 ldr x3, [tmpb+0] mov v10.s[1], w3 lsr x3, x3, #32 mov v11.s[1], w3 add x17, x15, x29 - usra v27.2D, v21.2D, #26 + usra v27.2d, v21.2d, #26 add x28, x17, x29, lsl #1 - and v6.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 + and v6.16b, v21.16b, v30.16b // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 and x20, x8, #0x1ffffff - and v5.16B, v18.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 + and v5.16b, v18.16b, v23.16b // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 add x17, x28, x29, lsl #4 - and v7.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 + and v7.16b, v27.16b, v23.16b // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 ldr x3, [tmpb+8] mov v22.s[1], w3 lsr x3, x3, #32 @@ -872,7 +872,7 @@ curve25519_x25519_scalarloop: lsr x15, x15, #32 mov v11.s[0], w15 and x11, x17, #0x3ffffff // ubignum_of_xreglist 0 // INTERMEDIATE bce - usra v16.2D, v27.2D, #25 + usra v16.2d, v27.2d, #25 add x8, x12, x29, lsr #25 ldr x3, [tmpb+16] mov v14.s[1], w3 @@ -884,7 +884,7 @@ curve25519_x25519_scalarloop: lsr x15, x15, #32 mov v23.s[0], w15 add x28, x27, x8, lsr #26 - and v8.16B, v16.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + and v8.16b, v16.16b, v30.16b // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 umull x1, w12, w10 ldr x3, [tmpb+24] mov v17.s[1], w3 @@ -896,7 +896,7 @@ curve25519_x25519_scalarloop: lsr x15, x15, #32 mov v15.s[0], w15 umaddl x19, w5, w21, x2 - usra v9.2D, v16.2D, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + usra v9.2d, v16.2d, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 add x2, x4, x25, lsr #26 ldr x3, [tmpb+32] mov v24.s[1], w3 @@ -908,7 +908,7 @@ curve25519_x25519_scalarloop: lsr x15, x15, #32 mov v18.s[0], w15 add x29, x19, x2, lsr #25 - umull v26.2D, v0.2S, v23.2S + umull v26.2d, v0.2s, v23.2s and x21, x28, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bce ldr x0, [tmpa+8] mov v2.s[1], w0 @@ -920,20 +920,20 @@ curve25519_x25519_scalarloop: lsr x15, x15, #32 mov v25.s[0], w15 add x17, x24, x29, lsr #26 - umull v29.2D, v1.2S, v18.2S + umull v29.2d, v1.2s, v18.2s and x15, x8, #0x3ffffff // ubignum_of_xreglist 2 // INTERMEDIATE bce - umull v20.2D, v0.2S, v15.2S + umull v20.2d, v0.2s, v15.2s add x19, x30, x17, lsr #25 and x3, x17, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bce - mul v12.2S, v25.2S, v31.2S + mul v12.2s, v25.2s, v31.2s ldr x0, [tmpa+16] mov v4.s[1], w0 lsr x0, x0, #32 mov v5.s[1], w0 add x4, x20, x19, lsr #26 // ubignum_of_xreglist 9 // INTERMEDIATE bce - umlal v26.2D, v2.2S, v11.2S + umlal v26.2d, v2.2s, v11.2s add w28, w3, w3, lsl #1; - umlal v20.2D, v2.2S, v23.2S + umlal v20.2d, v2.2s, v23.2s add w28, w28, w3, lsl #4 umull x8, w12, w5 ldr x0, [tmpa+24] @@ -941,12 +941,12 @@ curve25519_x25519_scalarloop: lsr x0, x0, #32 mov v7.s[1], w0 and x30, x25, #0x3ffffff // ubignum_of_xreglist 4 // INTERMEDIATE bce - mul v16.2S, v18.2S, v31.2S + mul v16.2s, v18.2s, v31.2s add w17, w4, w4, lsl #1; - umull v21.2D, v1.2S, 
v15.2S + umull v21.2d, v1.2s, v15.2s add w17, w17, w4, lsl #4 umaddl x25, w21, w7, x8 - umlal v20.2D, v4.2S, v11.2S + umlal v20.2d, v4.2s, v11.2s add w8, w21, w21, lsl #1; ldr x0, [tmpa+32] add w8, w8, w21, lsl #4 @@ -954,300 +954,300 @@ curve25519_x25519_scalarloop: lsr x0, x0, #32 mov v9.s[1], w0 and x2, x2, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bce - umlal v29.2D, v3.2S, v15.2S + umlal v29.2d, v3.2s, v15.2s umaddl x24, w2, w6, x25 - umull v13.2D, v0.2S, v25.2S + umull v13.2d, v0.2s, v25.2s umaddl x25, w2, w7, x27 umaddl x0, w3, w6, x25 - mul v19.2S, v15.2S, v31.2S - umull v27.2D, v0.2S, v18.2S + mul v19.2s, v15.2s, v31.2s + umull v27.2d, v0.2s, v18.2s umaddl x20, w3, w13, x24 - umlal v20.2D, v6.2S, v12.2S + umlal v20.2d, v6.2s, v12.2s umaddl x24, w21, w14, x1 - umlal v13.2D, v2.2S, v18.2S + umlal v13.2d, v2.2s, v18.2s umaddl x9, w4, w13, x0 - umull v25.2D, v0.2S, v11.2S + umull v25.2d, v0.2s, v11.2s umaddl x20, w17, w23, x20 - umlal v27.2D, v2.2S, v15.2S + umlal v27.2d, v2.2s, v15.2s umaddl x0, w2, w26, x24 - umull v28.2D, v1.2S, v11.2S + umull v28.2d, v1.2s, v11.2s umull x24, w17, w5 - umlal v29.2D, v5.2S, v23.2S + umlal v29.2d, v5.2s, v23.2s umaddl x9, w11, w22, x9 - umlal v13.2D, v4.2S, v15.2S + umlal v13.2d, v4.2s, v15.2s umaddl x27, w3, w16, x0 - umlal v27.2D, v4.2S, v23.2S + umlal v27.2d, v4.2s, v23.2s umull x0, w17, w14 - umlal v27.2D, v6.2S, v11.2S + umlal v27.2d, v6.2s, v11.2s umull x4, w12, w14 - umlal v27.2D, v8.2S, v12.2S + umlal v27.2d, v8.2s, v12.2s umaddl x25, w11, w10, x20 - umlal v27.2D, v1.2S, v17.2S + umlal v27.2d, v1.2s, v17.2s umaddl x0, w28, w10, x0 - umlal v13.2D, v6.2S, v23.2S + umlal v13.2d, v6.2s, v23.2s umull x3, w17, w6 - umlal v13.2D, v8.2S, v11.2S + umlal v13.2d, v8.2s, v11.2s umaddl x1, w21, w26, x4 - umlal v20.2D, v8.2S, v16.2S + umlal v20.2d, v8.2s, v16.2s umaddl x4, w2, w13, x24 - umlal v28.2D, v3.2S, v12.2S + umlal v28.2d, v3.2s, v12.2s umaddl x20, w28, w7, x3 - umlal v29.2D, v7.2S, v11.2S + umlal v29.2d, v7.2s, v11.2s and x3, x19, #0x3ffffff // ubignum_of_xreglist 9 // INTERMEDIATE bce - umlal v29.2D, v9.2S, v12.2S + umlal v29.2d, v9.2s, v12.2s umaddl x19, w17, w22, x27 add w27, w2, w2, lsl #1; - mul v18.2S, v24.2S, v31.2S + mul v18.2s, v24.2s, v31.2s add w27, w27, w2, lsl #4 - umlal v21.2D, v3.2S, v23.2S + umlal v21.2d, v3.2s, v23.2s umull x24, w17, w7 - umlal v13.2D, v1.2S, v24.2S + umlal v13.2d, v1.2s, v24.2s add x19, x19, x19 - shl v29.2D, v29.2D, #1 + shl v29.2d, v29.2d, #1 umaddl x1, w2, w16, x1 - umull v15.2D, v1.2S, v23.2S + umull v15.2d, v1.2s, v23.2s umaddl x0, w27, w22, x0 - umlal v29.2D, v0.2S, v24.2S + umlal v29.2d, v0.2s, v24.2s umaddl x2, w28, w5, x24 - mul v24.2S, v23.2S, v31.2S + mul v24.2s, v23.2s, v31.2s umaddl x4, w28, w23, x4 - umlal v21.2D, v5.2S, v11.2S + umlal v21.2d, v5.2s, v11.2s umaddl x24, w27, w5, x20 - umlal v20.2D, v1.2S, v14.2S + umlal v20.2d, v1.2s, v14.2s umaddl x20, w11, w23, x19 - umlal v26.2D, v4.2S, v12.2S + umlal v26.2d, v4.2s, v12.2s umaddl x19, w27, w23, x2 - umlal v26.2D, v6.2S, v16.2S + umlal v26.2d, v6.2s, v16.2s umaddl x2, w21, w6, x4 - umlal v29.2D, v2.2S, v17.2S + umlal v29.2d, v2.2s, v17.2s umaddl x24, w8, w23, x24 - umlal v15.2D, v3.2S, v11.2S + umlal v15.2d, v3.2s, v11.2s umaddl x0, w21, w16, x0 umaddl x4, w21, w13, x19 - mul v23.2S, v11.2S, v31.2S - umlal v20.2D, v3.2S, v22.2S + mul v23.2s, v11.2s, v31.2s + umlal v20.2d, v3.2s, v22.2s umaddl x2, w12, w7, x2 - umlal v20.2D, v5.2S, v10.2S + umlal v20.2d, v5.2s, v10.2s umaddl x19, w12, w26, x0 - umlal v29.2D, v4.2S, v14.2S + umlal v29.2d, 
v4.2s, v14.2s umaddl x0, w12, w13, x24 - umlal v26.2D, v8.2S, v19.2S + umlal v26.2d, v8.2s, v19.2s umaddl x20, w15, w5, x20 - umlal v26.2D, v1.2S, v22.2S + umlal v26.2d, v1.2s, v22.2s umaddl x21, w15, w10, x9 - umlal v26.2D, v3.2S, v10.2S + umlal v26.2d, v3.2s, v10.2s and x9, x29, #0x3ffffff // ubignum_of_xreglist 6 // INTERMEDIATE bce - umlal v29.2D, v6.2S, v22.2S + umlal v29.2d, v6.2s, v22.2s umaddl x20, w30, w7, x20 umaddl x1, w28, w22, x1 add x24, x19, x19 - umull v11.2D, v1.2S, v12.2S + umull v11.2d, v1.2s, v12.2s add w19, w3, w3, lsl #1; - umlal v26.2D, v5.2S, v18.2S + umlal v26.2d, v5.2s, v18.2s add w19, w19, w3, lsl #4 umaddl x20, w9, w6, x20 - umlal v29.2D, v8.2S, v10.2S + umlal v29.2d, v8.2s, v10.2s add w29, w9, w9, lsl #1; - umlal v13.2D, v3.2S, v17.2S + umlal v13.2d, v3.2s, v17.2s add w29, w29, w9, lsl #4 umaddl x2, w19, w10, x2 - umlal v11.2D, v3.2S, v16.2S + umlal v11.2d, v3.2s, v16.2s umaddl x21, w30, w14, x21 - umlal v11.2D, v5.2S, v19.2S + umlal v11.2d, v5.2s, v19.2s umaddl x20, w3, w13, x20 - umlal v11.2D, v7.2S, v24.2S + umlal v11.2d, v7.2s, v24.2s umaddl x2, w29, w22, x2 - umlal v11.2D, v9.2S, v23.2S + umlal v11.2d, v9.2s, v23.2s umaddl x21, w9, w26, x21 - ushr v23.2D, v30.2D, #1 + ushr v23.2d, v30.2d, #1 umaddl x1, w17, w10, x1 - umlal v13.2D, v5.2S, v14.2S + umlal v13.2d, v5.2s, v14.2s umaddl x24, w19, w5, x24 - umlal v27.2D, v3.2S, v14.2S + umlal v27.2d, v3.2s, v14.2s umaddl x21, w3, w16, x21 - shl v11.2D, v11.2D, #1 + shl v11.2d, v11.2d, #1 add w3, w30, w30, lsl #1; - umlal v28.2D, v5.2S, v16.2S + umlal v28.2d, v5.2s, v16.2s add w3, w3, w30, lsl #4 umaddl x24, w29, w23, x24 - umlal v28.2D, v7.2S, v19.2S + umlal v28.2d, v7.2s, v19.2s add x1, x1, x1 - umlal v28.2D, v9.2S, v24.2S + umlal v28.2d, v9.2s, v24.2s umaddl x1, w11, w5, x1 - umlal v15.2D, v5.2S, v12.2S + umlal v15.2d, v5.2s, v12.2s umaddl x24, w30, w13, x24 - umlal v15.2D, v7.2S, v16.2S + umlal v15.2d, v7.2s, v16.2s umaddl x25, w15, w14, x25 - umlal v15.2D, v9.2S, v19.2S + umlal v15.2d, v9.2s, v19.2s umaddl x1, w15, w7, x1 - shl v28.2D, v28.2D, #1 + shl v28.2d, v28.2d, #1 umaddl x24, w15, w6, x24 - umlal v21.2D, v7.2S, v12.2S + umlal v21.2d, v7.2s, v12.2s umaddl x2, w30, w16, x2 - umlal v21.2D, v9.2S, v16.2S + umlal v21.2d, v9.2s, v16.2s umaddl x25, w30, w26, x25 - shl v15.2D, v15.2D, #1 + shl v15.2d, v15.2d, #1 umaddl x30, w30, w6, x1 - umlal v28.2D, v0.2S, v22.2S + umlal v28.2d, v0.2s, v22.2s umaddl x1, w15, w26, x2 - umlal v28.2D, v2.2S, v10.2S + umlal v28.2d, v2.2s, v10.2s umaddl x2, w9, w16, x25 - shl v21.2D, v21.2D, #1 + shl v21.2d, v21.2d, #1 umaddl x24, w11, w7, x24 - umlal v15.2D, v0.2S, v14.2S + umlal v15.2d, v0.2s, v14.2s umaddl x1, w11, w14, x1 - umlal v21.2D, v0.2S, v17.2S + umlal v21.2d, v0.2s, v17.2s umaddl x25, w9, w13, x30 - umlal v28.2D, v4.2S, v18.2S + umlal v28.2d, v4.2s, v18.2s umaddl x0, w19, w26, x0 - umlal v25.2D, v2.2S, v12.2S + umlal v25.2d, v2.2s, v12.2s add x1, x1, x24, lsr #26 - umlal v25.2D, v4.2S, v16.2S + umlal v25.2d, v4.2s, v16.2s umaddl x30, w19, w22, x2 - umlal v21.2D, v2.2S, v14.2S + umlal v21.2d, v2.2s, v14.2s umaddl x4, w12, w6, x4 - mul v14.2S, v14.2S, v31.2S + mul v14.2s, v14.2s, v31.2s umaddl x25, w19, w23, x25 and x2, x1, #0x1ffffff - mul v16.2S, v17.2S, v31.2S - umlal v25.2D, v6.2S, v19.2S + mul v16.2s, v17.2s, v31.2s + umlal v25.2d, v6.2s, v19.2s umaddl x9, w19, w14, x4 - umlal v13.2D, v7.2S, v22.2S + umlal v13.2d, v7.2s, v22.2s add x25, x25, x1, lsr #25 - umlal v21.2D, v4.2S, v22.2S + umlal v21.2d, v4.2s, v22.2s umaddl x0, w29, w14, x0 - umlal v26.2D, v7.2S, v16.2S 
+ umlal v26.2d, v7.2s, v16.2s add x30, x30, x25, lsr #26 - umlal v26.2D, v9.2S, v14.2S + umlal v26.2d, v9.2s, v14.2s add w1, w15, w15, lsl #1; - umlal v28.2D, v6.2S, v16.2S + umlal v28.2d, v6.2s, v16.2s add w1, w1, w15, lsl #4 add x4, x20, x30, lsr #25 - umlal v28.2D, v8.2S, v14.2S + umlal v28.2d, v8.2s, v14.2s and x25, x25, #0x3ffffff - umlal v15.2D, v2.2S, v22.2S + umlal v15.2d, v2.2s, v22.2s add x21, x21, x4, lsr #26 - umlal v11.2D, v0.2S, v10.2S + umlal v11.2d, v0.2s, v10.2s bfi x25, x30, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE z4 - umlal v11.2D, v2.2S, v18.2S + umlal v11.2d, v2.2s, v18.2s bic x30, x21, #0x3ffffff - usra v26.2D, v28.2D, #26 + usra v26.2d, v28.2d, #26 lsr x20, x30, #26 - umlal v15.2D, v4.2S, v10.2S + umlal v15.2d, v4.2s, v10.2s add x20, x20, x30, lsr #25 - umlal v15.2D, v6.2S, v18.2S + umlal v15.2d, v6.2s, v18.2s umaddl x9, w29, w10, x9 - umlal v15.2D, v8.2S, v16.2S + umlal v15.2d, v8.2s, v16.2s add x30, x20, x30, lsr #22 - umlal v27.2D, v5.2S, v22.2S + umlal v27.2d, v5.2s, v22.2s umull x20, w17, w26 - umlal v20.2D, v7.2S, v18.2S + umlal v20.2d, v7.2s, v18.2s umaddl x30, w17, w16, x30 - umlal v20.2D, v9.2S, v16.2S + umlal v20.2d, v9.2s, v16.2s umaddl x17, w3, w10, x0 - usra v15.2D, v26.2D, #25 + usra v15.2d, v26.2d, #25 umaddl x0, w28, w14, x20 - umlal v27.2D, v7.2S, v10.2S + umlal v27.2d, v7.2s, v10.2s umaddl x20, w28, w26, x30 - umlal v27.2D, v9.2S, v18.2S + umlal v27.2d, v9.2s, v18.2s add w28, w12, w12, lsl #1; - usra v20.2D, v15.2D, #26 + usra v20.2d, v15.2d, #26 add w28, w28, w12, lsl #4 umaddl x30, w27, w10, x0 - and v17.16B, v15.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5 + and v17.16b, v15.16b, v30.16b // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5 umaddl x27, w27, w14, x20 umaddl x0, w8, w10, x27 - mul v12.2S, v22.2S, v31.2S - and v15.16B, v20.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5 + mul v12.2s, v22.2s, v31.2s + and v15.16b, v20.16b, v23.16b // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5 umaddl x14, w3, w22, x9 - umlal v21.2D, v6.2S, v10.2S + umlal v21.2d, v6.2s, v10.2s umaddl x27, w8, w22, x30 - trn1 v15.4S, v17.4S, v15.4S // FINAL z3 + trn1 v15.4s, v17.4s, v15.4s // FINAL z3 umaddl x10, w28, w22, x0 - umlal v11.2D, v4.2S, v16.2S + umlal v11.2d, v4.2s, v16.2s umaddl x30, w15, w16, x14 - and v26.16B, v26.16B, v23.16B + and v26.16b, v26.16b, v23.16b umaddl x28, w12, w16, x27 - umlal v21.2D, v8.2S, v18.2S + umlal v21.2d, v8.2s, v18.2s add x10, x10, x10 - umlal v25.2D, v8.2S, v24.2S + umlal v25.2d, v8.2s, v24.2s umaddl x20, w19, w6, x10 - umlal v25.2D, v1.2S, v10.2S + umlal v25.2d, v1.2s, v10.2s add x28, x28, x28 - umlal v25.2D, v3.2S, v18.2S + umlal v25.2d, v3.2s, v18.2s umaddl x28, w19, w7, x28 - usra v21.2D, v20.2D, #25 + usra v21.2d, v20.2d, #25 umaddl x0, w29, w7, x20 - umlal v11.2D, v6.2S, v14.2S + umlal v11.2d, v6.2s, v14.2s umaddl x10, w11, w26, x30 - umlal v13.2D, v9.2S, v10.2S + umlal v13.2d, v9.2s, v10.2s umaddl x19, w29, w5, x28 - usra v27.2D, v21.2D, #26 + usra v27.2d, v21.2d, #26 umaddl x0, w3, w5, x0 - umlal v25.2D, v5.2S, v16.2S + umlal v25.2d, v5.2s, v16.2s umaddl x20, w1, w22, x17 - and v20.16B, v28.16B, v30.16B + and v20.16b, v28.16b, v30.16b umaddl x29, w3, w23, x19 - usra v29.2D, v27.2D, #25 + usra v29.2d, v27.2d, #25 umaddl x3, w1, w23, x0 - and v27.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5 - umlal v11.2D, v8.2S, v12.2S + 
and v27.16b, v27.16b, v23.16b // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5 + umlal v11.2d, v8.2s, v12.2s umaddl x12, w15, w13, x29 - usra v13.2D, v29.2D, #26 + usra v13.2d, v29.2d, #26 umaddl x7, w11, w13, x3 - trn1 v6.4S, v6.4S, v7.4S + trn1 v6.4s, v6.4s, v7.4s umaddl x17, w11, w16, x20 - umlal v25.2D, v7.2S, v14.2S + umlal v25.2d, v7.2s, v14.2s and x23, x4, #0x3ffffff - bic v19.16B, v13.16B, v23.16B + bic v19.16b, v13.16b, v23.16b umaddl x19, w11, w6, x12 - and v28.16B, v13.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = x4|z5 + and v28.16b, v13.16b, v23.16b // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = x4|z5 add x3, x17, x7, lsr #26 - usra v11.2D, v19.2D, #25 - trn1 v2.4S, v2.4S, v3.4S + usra v11.2d, v19.2d, #25 + trn1 v2.4s, v2.4s, v3.4s add x17, x19, x3, lsr #25 - and v13.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5 + and v13.16b, v21.16b, v30.16b // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5 and x5, x7, #0x3ffffff - usra v11.2D, v19.2D, #24 + usra v11.2d, v19.2d, #24 add x7, x10, x17, lsr #26 - trn1 v0.4S, v0.4S, v1.4S + trn1 v0.4s, v0.4s, v1.4s and x19, x24, #0x3ffffff - and v21.16B, v29.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5 + and v21.16b, v29.16b, v30.16b // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5 add x29, x19, x7, lsr #25 - usra v11.2D, v19.2D, #21 + usra v11.2d, v19.2d, #21 bfi x5, x3, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE z4 - trn1 v17.4S, v13.4S, v27.4S // FINAL z3 + trn1 v17.4s, v13.4s, v27.4s // FINAL z3 add x19, x2, x29, lsr #26 - trn1 v19.4S, v21.4S, v28.4S // FINAL z3 + trn1 v19.4s, v21.4s, v28.4s // FINAL z3 and x3, x29, #0x3ffffff mov v16.d[0], v6.d[1] // FINAL x3 mov v6.d[0], v17.d[1] // FINAL x2 - trn1 v8.4S, v8.4S, v9.4S + trn1 v8.4s, v8.4s, v9.4s bfi x3, x19, #32, #26 // ubignum_of_preglist 2 // INTERMEDIATE z4 - and v21.16B, v11.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5 + and v21.16b, v11.16b, v30.16b // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5 bfi x23, x21, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE z4 mov v18.d[0], v8.d[1] // FINAL x3 mov v8.d[0], v19.d[1] // FINAL x2 - umlal v25.2D, v9.2S, v12.2S + umlal v25.2d, v9.2s, v12.2s mov v9.d[0], x23 // FINAL z2 mov v7.d[0], x25 // FINAL z2 ldr d29, [mask1] mov v12.d[0], v2.d[1] // FINAL x3 - trn1 v4.4S, v4.4S, v5.4S + trn1 v4.4s, v4.4s, v5.4s and x17, x17, #0x3ffffff - usra v25.2D, v11.2D, #26 + usra v25.2d, v11.2d, #26 mov v10.d[0], v0.d[1] // FINAL x3 mov v14.d[0], v4.d[1] // FINAL x3 mov v4.d[0], v15.d[1] // FINAL x2 - usra v20.2D, v25.2D, #25 - and v27.16B, v25.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 + usra v20.2d, v25.2d, #25 + and v27.16b, v25.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 mov v5.d[0], x3 // depth 86 mov v1.d[0], x5 // FINAL z2 - usra v26.2D, v20.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 - and v28.16B, v20.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 - trn1 v11.4S, v21.4S, v27.4S // FINAL z3 - trn1 v13.4S, v28.4S, v26.4S // FINAL z3 + usra v26.2d, v20.2d, #26 // ubignum_of_hreglist 3 + 
ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 + and v28.16b, v20.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 + trn1 v11.4s, v21.4s, v27.4s // FINAL z3 + trn1 v13.4s, v28.4s, v26.4s // FINAL z3 mov v0.d[0], v11.d[1] // FINAL x2 mov v3.d[0], x17 // FINAL z2 mov v2.d[0], v13.d[1] // FINAL x2 diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index 89f2f44f4e..73c27db9f8 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -284,12 +284,12 @@ S2N_BN_SYMBOL(curve25519_x25519_byte): mov v31.d[0], x0 mov v31.d[1], xzr - mov x0, #(1<<26)-1 + mov x0, #67108863 // #(1<<26)-1 mov v30.d[0], x0 mov v30.d[1], x0 mov x0, #0x07fffffe07fffffe - sub x1, x0, #0xfe-0xb4 + sub x1, x0, #74 // #0xfe-0xb4 sub x0, x0, #2 stp x0, x1, [mask1] @@ -359,35 +359,35 @@ curve25519_x25519_byte_scalarloop: // (x2',z2') = (x4,z4) // (x3',z3') = (x5,z5) - add v22.2S, v2.2S, v3.2S // ubignum_of_qreglist 1 // INTERMEDIATE a - sub v21.2S, v28.2S, v1.2S - add v25.2S, v0.2S, v1.2S // ubignum_of_qreglist 0 // INTERMEDIATE a - sub v24.2S, v29.2S, v3.2S - add v3.2S, v18.2S, v19.2S // ubignum_of_qreglist 4 // INTERMEDIATE c - add v0.2S, v0.2S, v21.2S // ubignum_of_qreglist 0 // INTERMEDIATE b - sub v20.2S, v29.2S, v15.2S - sub v1.2S, v29.2S, v5.2S - sub v26.2S, v28.2S, v11.2S - sub v21.2S, v29.2S, v19.2S - add v19.2S, v10.2S, v11.2S // ubignum_of_qreglist 0 // INTERMEDIATE c - add v11.2S, v14.2S, v20.2S // ubignum_of_qreglist 2 // INTERMEDIATE d - add v21.2S, v18.2S, v21.2S // ubignum_of_qreglist 4 // INTERMEDIATE d - sub v20.2S, v29.2S, v17.2S - add v18.2S, v2.2S, v24.2S // ubignum_of_qreglist 1 // INTERMEDIATE b - add v14.2S, v14.2S, v15.2S // ubignum_of_qreglist 2 // INTERMEDIATE c - add v15.2S, v16.2S, v17.2S // ubignum_of_qreglist 3 // INTERMEDIATE c - add v2.2S, v16.2S, v20.2S // ubignum_of_qreglist 3 // INTERMEDIATE d - add v24.2S, v12.2S, v13.2S // ubignum_of_qreglist 1 // INTERMEDIATE c - add v26.2S, v10.2S, v26.2S // ubignum_of_qreglist 0 // INTERMEDIATE d - sub v10.2S, v29.2S, v13.2S - sub v13.2S, v29.2S, v7.2S - add v23.2S, v6.2S, v7.2S // ubignum_of_qreglist 3 // INTERMEDIATE a - sub v7.2S, v29.2S, v9.2S - add v27.2S, v12.2S, v10.2S // ubignum_of_qreglist 1 // INTERMEDIATE d + add v22.2s, v2.2s, v3.2s // ubignum_of_qreglist 1 // INTERMEDIATE a + sub v21.2s, v28.2s, v1.2s + add v25.2s, v0.2s, v1.2s // ubignum_of_qreglist 0 // INTERMEDIATE a + sub v24.2s, v29.2s, v3.2s + add v3.2s, v18.2s, v19.2s // ubignum_of_qreglist 4 // INTERMEDIATE c + add v0.2s, v0.2s, v21.2s // ubignum_of_qreglist 0 // INTERMEDIATE b + sub v20.2s, v29.2s, v15.2s + sub v1.2s, v29.2s, v5.2s + sub v26.2s, v28.2s, v11.2s + sub v21.2s, v29.2s, v19.2s + add v19.2s, v10.2s, v11.2s // ubignum_of_qreglist 0 // INTERMEDIATE c + add v11.2s, v14.2s, v20.2s // ubignum_of_qreglist 2 // INTERMEDIATE d + add v21.2s, v18.2s, v21.2s // ubignum_of_qreglist 4 // INTERMEDIATE d + sub v20.2s, v29.2s, v17.2s + add v18.2s, v2.2s, v24.2s // ubignum_of_qreglist 1 // INTERMEDIATE b + add v14.2s, v14.2s, v15.2s // ubignum_of_qreglist 2 // INTERMEDIATE c + add v15.2s, v16.2s, v17.2s // ubignum_of_qreglist 3 // INTERMEDIATE c + add v2.2s, v16.2s, v20.2s // ubignum_of_qreglist 3 // INTERMEDIATE d + add v24.2s, v12.2s, v13.2s // ubignum_of_qreglist 1 // INTERMEDIATE c + add v26.2s, v10.2s, v26.2s // ubignum_of_qreglist 0 // INTERMEDIATE d + sub v10.2s, v29.2s, v13.2s + sub v13.2s, v29.2s, v7.2s + add v23.2s, v6.2s, v7.2s // 
ubignum_of_qreglist 3 // INTERMEDIATE a + sub v7.2s, v29.2s, v9.2s + add v27.2s, v12.2s, v10.2s // ubignum_of_qreglist 1 // INTERMEDIATE d fcsel d20, d22, d24, eq // ubignum_of_qreglist 1 // INTERMEDIATE f - add v28.2S, v4.2S, v5.2S // ubignum_of_qreglist 2 // INTERMEDIATE a + add v28.2s, v4.2s, v5.2s // ubignum_of_qreglist 2 // INTERMEDIATE a fcsel d12, d23, d15, eq // ubignum_of_qreglist 3 // INTERMEDIATE f - add v7.2S, v8.2S, v7.2S // ubignum_of_qreglist 4 // INTERMEDIATE b + add v7.2s, v8.2s, v7.2s // ubignum_of_qreglist 4 // INTERMEDIATE b fcsel d16, d25, d19, eq // ubignum_of_qreglist 0 // INTERMEDIATE f mov x0, v20.d[0] fcsel d5, d28, d14, eq // ubignum_of_qreglist 2 // INTERMEDIATE f @@ -397,589 +397,589 @@ curve25519_x25519_byte_scalarloop: lsr x26, x0, #32 add x29, x21, x21 umull x15, w5, w29 - add v13.2S, v6.2S, v13.2S // ubignum_of_qreglist 3 // INTERMEDIATE b + add v13.2s, v6.2s, v13.2s // ubignum_of_qreglist 3 // INTERMEDIATE b add x12, x26, x26 mov x30, v5.d[0] fcsel d10, d18, d27, eq // ubignum_of_qreglist 1 // INTERMEDIATE g lsr x11, x5, #32 lsr x10, x30, #32 - trn2 v20.2S, v21.2S, v3.2S - add v9.2S, v8.2S, v9.2S // ubignum_of_qreglist 4 // INTERMEDIATE a + trn2 v20.2s, v21.2s, v3.2s + add v9.2s, v8.2s, v9.2s // ubignum_of_qreglist 4 // INTERMEDIATE a add x14, x11, x11 - trn2 v6.2S, v2.2S, v15.2S - trn1 v12.2S, v25.2S, v0.2S - add v1.2S, v4.2S, v1.2S // ubignum_of_qreglist 2 // INTERMEDIATE b - trn1 v16.2S, v23.2S, v13.2S + trn2 v6.2s, v2.2s, v15.2s + trn1 v12.2s, v25.2s, v0.2s + add v1.2s, v4.2s, v1.2s // ubignum_of_qreglist 2 // INTERMEDIATE b + trn1 v16.2s, v23.2s, v13.2s fcsel d8, d13, d2, eq // ubignum_of_qreglist 3 // INTERMEDIATE g - trn2 v17.2S, v27.2S, v24.2S + trn2 v17.2s, v27.2s, v24.2s str d29, [tmpb+32] add x17, x10, x10 - trn2 v4.2S, v28.2S, v1.2S - trn1 v5.2S, v28.2S, v1.2S - trn1 v28.2S, v2.2S, v15.2S - trn1 v2.2S, v22.2S, v18.2S + trn2 v4.2s, v28.2s, v1.2s + trn1 v5.2s, v28.2s, v1.2s + trn1 v28.2s, v2.2s, v15.2s + trn1 v2.2s, v22.2s, v18.2s fcsel d29, d0, d26, eq // ubignum_of_qreglist 0 // INTERMEDIATE g - trn2 v15.2S, v22.2S, v18.2S - umull v22.2D, v12.2S, v20.2S + trn2 v15.2s, v22.2s, v18.2s + umull v22.2d, v12.2s, v20.2s umull x22, w30, w17 stp d29, d10, [tmpb+0] - trn2 v10.2S, v23.2S, v13.2S - trn2 v23.2S, v11.2S, v14.2S - trn1 v13.2S, v27.2S, v24.2S + trn2 v10.2s, v23.2s, v13.2s + trn2 v23.2s, v11.2s, v14.2s + trn1 v13.2s, v27.2s, v24.2s fcsel d27, d1, d11, eq // ubignum_of_qreglist 2 // INTERMEDIATE g - trn1 v14.2S, v11.2S, v14.2S - umlal v22.2D, v2.2S, v6.2S + trn1 v14.2s, v11.2s, v14.2s + umlal v22.2d, v2.2s, v6.2s umull x25, w30, w30 - umlal v22.2D, v5.2S, v23.2S + umlal v22.2d, v5.2s, v23.2s add x3, x30, x30 - umlal v22.2D, v16.2S, v17.2S + umlal v22.2d, v16.2s, v17.2s add w30, w21, w21, lsl #1; stp d27, d8, [tmpb+16] add w30, w30, w21, lsl #4 - trn1 v11.2S, v26.2S, v19.2S - trn2 v8.2S, v26.2S, v19.2S - trn2 v19.2S, v25.2S, v0.2S - mul v29.2S, v20.2S, v31.2S + trn1 v11.2s, v26.2s, v19.2s + trn2 v8.2s, v26.2s, v19.2s + trn2 v19.2s, v25.2s, v0.2s + mul v29.2s, v20.2s, v31.2s ldr x20, [tmpb+24] - umull v25.2D, v19.2S, v6.2S + umull v25.2d, v19.2s, v6.2s add x1, x0, x0 - umull v27.2D, v19.2S, v23.2S + umull v27.2d, v19.2s, v23.2s umull x9, w5, w1 - umull v0.2D, v12.2S, v23.2S + umull v0.2d, v12.2s, v23.2s lsr x24, x20, #32 - mul v20.2S, v23.2S, v31.2S + mul v20.2s, v23.2s, v31.2s lsr x16, x21, #32 - umlal v25.2D, v15.2S, v23.2S + umlal v25.2d, v15.2s, v23.2s umaddl x13, w11, w14, x9 - umlal v25.2D, v4.2S, v17.2S + umlal v25.2d, v4.2s, v17.2s umaddl 
x9, w14, w17, x15 - umull v24.2D, v12.2S, v6.2S + umull v24.2d, v12.2s, v6.2s add w2, w16, w16, lsl #1; fcsel d26, d9, d3, eq // ubignum_of_qreglist 4 // INTERMEDIATE f add w2, w2, w16, lsl #4 - trn1 v18.2S, v21.2S, v3.2S - umull v3.2D, v19.2S, v29.2S + trn1 v18.2s, v21.2s, v3.2s + umull v3.2d, v19.2s, v29.2s umull x28, w5, w3 - mul v1.2S, v6.2S, v31.2S + mul v1.2s, v6.2s, v31.2s umull x8, w5, w5 - umlal v24.2D, v2.2S, v23.2S + umlal v24.2d, v2.2s, v23.2s umaddl x13, w21, w30, x13 - mul v23.2S, v17.2S, v31.2S + mul v23.2s, v17.2s, v31.2s umaddl x27, w14, w12, x28 - trn2 v6.2S, v9.2S, v7.2S + trn2 v6.2s, v9.2s, v7.2s mov x6, v26.d[0] - umlal v3.2D, v15.2S, v1.2S + umlal v3.2d, v15.2s, v1.2s add x16, x16, x16 - umlal v3.2D, v4.2S, v20.2S + umlal v3.2d, v4.2s, v20.2s lsr x4, x6, #32 - umlal v3.2D, v10.2S, v23.2S + umlal v3.2d, v10.2s, v23.2s add x7, x6, x6 - umull v26.2D, v19.2S, v8.2S + umull v26.2d, v19.2s, v8.2s add x23, x4, x4 umaddl x28, w5, w23, x22 - trn1 v7.2S, v9.2S, v7.2S - umlal v27.2D, v15.2S, v17.2S + trn1 v7.2s, v9.2s, v7.2s + umlal v27.2d, v15.2s, v17.2s add w15, w4, w4, lsl #1; - umlal v27.2D, v4.2S, v8.2S + umlal v27.2d, v4.2s, v8.2s add w15, w15, w4, lsl #4 add w22, w10, w10, lsl #1; - umlal v24.2D, v5.2S, v17.2S + umlal v24.2d, v5.2s, v17.2s add w22, w22, w10, lsl #4 umaddl x10, w11, w7, x28 - umlal v25.2D, v10.2S, v8.2S + umlal v25.2d, v10.2s, v8.2s umull x21, w5, w16 - umlal v25.2D, v6.2S, v29.2S + umlal v25.2d, v6.2s, v29.2s umaddl x23, w15, w23, x25 - umlal v27.2D, v10.2S, v29.2S + umlal v27.2d, v10.2s, v29.2s umull x19, w5, w12 - umlal v27.2D, v6.2S, v1.2S + umlal v27.2d, v6.2s, v1.2s umaddl x25, w11, w29, x21 - umlal v0.2D, v2.2S, v17.2S + umlal v0.2d, v2.2s, v17.2s umaddl x28, w0, w3, x9 - shl v21.2D, v25.2D, #1 + shl v21.2d, v25.2d, #1 umaddl x4, w11, w1, x19 umaddl x21, w2, w29, x4 - mul v25.2S, v8.2S, v31.2S - umlal v24.2D, v16.2S, v8.2S + mul v25.2s, v8.2s, v31.2s + umlal v24.2d, v16.2s, v8.2s umaddl x19, w0, w17, x25 - umlal v24.2D, v7.2S, v29.2S + umlal v24.2d, v7.2s, v29.2s umull x25, w5, w17 - umlal v24.2D, v19.2S, v28.2S + umlal v24.2d, v19.2s, v28.2s umaddl x4, w0, w16, x10 - umull v9.2D, v12.2S, v8.2S + umull v9.2d, v12.2s, v8.2s umaddl x23, w5, w7, x23 - umlal v21.2D, v12.2S, v18.2S + umlal v21.2d, v12.2s, v18.2s add w10, w6, w6, lsl #1; - shl v27.2D, v27.2D, #1 + shl v27.2d, v27.2d, #1 add w10, w10, w6, lsl #4 umaddl x28, w26, w12, x28 - umlal v26.2D, v15.2S, v29.2S + umlal v26.2d, v15.2s, v29.2s umaddl x9, w14, w16, x23 - umlal v9.2D, v2.2S, v29.2S + umlal v9.2d, v2.2s, v29.2s umaddl x22, w22, w17, x8 - umlal v21.2D, v2.2S, v28.2S + umlal v21.2d, v2.2s, v28.2s umaddl x28, w6, w10, x28 umaddl x27, w0, w0, x27 add x8, x14, x14 - umlal v0.2D, v5.2S, v8.2S + umlal v0.2d, v5.2s, v8.2s umull x5, w5, w14 - umlal v9.2D, v5.2S, v1.2S + umlal v9.2d, v5.2s, v1.2s umaddl x14, w0, w29, x9 - umlal v26.2D, v4.2S, v1.2S + umlal v26.2d, v4.2s, v1.2s umaddl x6, w2, w16, x27 - umlal v22.2D, v7.2S, v8.2S + umlal v22.2d, v7.2s, v8.2s umaddl x5, w30, w17, x5 umaddl x5, w2, w3, x5 add x23, x17, x17 - umlal v27.2D, v12.2S, v28.2S + umlal v27.2d, v12.2s, v28.2s umaddl x13, w2, w23, x13 - umlal v26.2D, v10.2S, v20.2S + umlal v26.2d, v10.2s, v20.2s add x9, x12, x12 - umlal v9.2D, v16.2S, v20.2S + umlal v9.2d, v16.2s, v20.2s umaddl x27, w10, w29, x6 - umlal v0.2D, v16.2S, v29.2S + umlal v0.2d, v16.2s, v29.2s umaddl x6, w11, w3, x25 - umlal v22.2D, v19.2S, v18.2S + umlal v22.2d, v19.2s, v18.2s umaddl x19, w26, w3, x19 - mul v18.2S, v18.2S, v31.2S + mul v18.2s, v18.2s, v31.2s umaddl 
x23, w15, w23, x27 - umlal v3.2D, v6.2S, v25.2S + umlal v3.2d, v6.2s, v25.2s umaddl x0, w0, w12, x6 - umlal v0.2D, v7.2S, v1.2S + umlal v0.2d, v7.2s, v1.2s add x11, x16, x16 - umlal v9.2D, v7.2S, v23.2S + umlal v9.2d, v7.2s, v23.2s umaddl x6, w12, w17, x14 - umlal v9.2D, v19.2S, v11.2S + umlal v9.2d, v19.2s, v11.2s umaddl x25, w26, w29, x4 - umlal v9.2D, v15.2S, v18.2S + umlal v9.2d, v15.2s, v18.2s umaddl x14, w10, w3, x13 - umull v25.2D, v12.2S, v17.2S + umull v25.2d, v12.2s, v17.2s umaddl x27, w10, w16, x0 - umlal v26.2D, v6.2S, v23.2S + umlal v26.2d, v6.2s, v23.2s add x0, x25, x6, lsr #26 - mul v23.2S, v28.2S, v31.2S + mul v23.2s, v28.2s, v31.2s umaddl x12, w10, w12, x5 - shl v3.2D, v3.2D, #1 + shl v3.2d, v3.2d, #1 add x16, x22, x0, lsr #25 - umlal v21.2D, v5.2S, v14.2S + umlal v21.2d, v5.2s, v14.2s bic x22, x0, #0x1ffffff - umlal v3.2D, v12.2S, v11.2S + umlal v3.2d, v12.2s, v11.2s add x26, x16, x22, lsr #24 - umlal v3.2D, v2.2S, v18.2S + umlal v3.2d, v2.2s, v18.2s umaddl x16, w10, w17, x21 - umlal v3.2D, v5.2S, v23.2S + umlal v3.2d, v5.2s, v23.2s add x22, x26, x22, lsr #21 - umlal v9.2D, v4.2S, v23.2S + umlal v9.2d, v4.2s, v23.2s umaddl x5, w15, w29, x27 - umull v17.2D, v19.2S, v17.2S + umull v17.2d, v19.2s, v17.2s umaddl x17, w30, w3, x22 - umlal v25.2D, v2.2S, v8.2S + umlal v25.2d, v2.2s, v8.2s umaddl x25, w15, w3, x16 - umlal v25.2D, v5.2S, v29.2S + umlal v25.2d, v5.2s, v29.2s umaddl x26, w15, w7, x19 - umlal v0.2D, v19.2S, v14.2S + umlal v0.2d, v19.2s, v14.2s umaddl x17, w2, w9, x17 - umlal v17.2D, v15.2S, v8.2S + umlal v17.2d, v15.2s, v8.2s ldr x19, [tmpb+0] - umlal v17.2D, v4.2S, v29.2S + umlal v17.2d, v4.2s, v29.2s ldr x7, [tmpb+8] - shl v29.2D, v26.2D, #1 + shl v29.2d, v26.2d, #1 umaddl x13, w10, w1, x17 - umlal v0.2D, v15.2S, v13.2S + umlal v0.2d, v15.2s, v13.2s lsr x2, x19, #32 - umlal v29.2D, v12.2S, v13.2S + umlal v29.2d, v12.2s, v13.2s umaddl x27, w15, w1, x12 - umlal v29.2D, v2.2S, v11.2S + umlal v29.2d, v2.2s, v11.2s umaddl x30, w15, w8, x13 - umlal v29.2D, v5.2S, v18.2S + umlal v29.2d, v5.2s, v18.2s add x4, x7, x7 - umlal v29.2D, v16.2S, v23.2S + umlal v29.2d, v16.2s, v23.2s umaddl x29, w15, w9, x14 - umlal v0.2D, v4.2S, v11.2S + umlal v0.2d, v4.2s, v11.2s add x17, x27, x30, lsr #26 - umlal v0.2D, v10.2S, v18.2S + umlal v0.2d, v10.2s, v18.2s umaddl x16, w15, w11, x28 - umlal v0.2D, v6.2S, v23.2S + umlal v0.2d, v6.2s, v23.2s add x1, x29, x17, lsr #25 - umlal v25.2D, v16.2S, v1.2S + umlal v25.2d, v16.2s, v1.2s umull x11, w19, w4 ldr x8, [tmpb+32] - mul v26.2S, v14.2S, v31.2S - umlal v17.2D, v10.2S, v1.2S + mul v26.2s, v14.2s, v31.2s + umlal v17.2d, v10.2s, v1.2s ldr x15, [tmpb+16] - umlal v17.2D, v6.2S, v20.2S + umlal v17.2d, v6.2s, v20.2s and x9, x30, #0x3ffffff bfi x9, x17, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE aa add x17, x2, x2 lsr x10, x15, #32 add x27, x25, x1, lsr #26 - umlal v25.2D, v7.2S, v20.2S + umlal v25.2d, v7.2s, v20.2s add x13, x10, x10 - umlal v25.2D, v19.2S, v13.2S + umlal v25.2d, v19.2s, v13.2s add x29, x23, x27, lsr #25 - umlal v25.2D, v15.2S, v11.2S + umlal v25.2d, v15.2s, v11.2s lsr x30, x8, #32 - umlal v25.2D, v4.2S, v18.2S + umlal v25.2d, v4.2s, v18.2s add x23, x5, x29, lsr #26 - umlal v25.2D, v10.2S, v23.2S + umlal v25.2d, v10.2s, v23.2s and x14, x29, #0x3ffffff - umlal v25.2D, v6.2S, v26.2S + umlal v25.2d, v6.2s, v26.2s add x5, x16, x23, lsr #25 - shl v8.2D, v17.2D, #1 + shl v8.2d, v17.2d, #1 umaddl x12, w2, w17, x11 and x29, x5, #0x3ffffff umull x21, w19, w19 - umlal v29.2D, v7.2S, v26.2S + umlal v29.2d, v7.2s, v26.2s add w16, w10, 
w10, lsl #1; - umlal v3.2D, v16.2S, v26.2S + umlal v3.2d, v16.2s, v26.2s add w16, w16, w10, lsl #4 bfi x14, x23, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE aa add w10, w24, w24, lsl #1; add x22, x26, x5, lsr #26 add w10, w10, w24, lsl #4 - umlal v8.2D, v12.2S, v14.2S + umlal v8.2d, v12.2s, v14.2s umaddl x25, w16, w13, x21 - umlal v8.2D, v2.2S, v13.2S + umlal v8.2d, v2.2s, v13.2s bfi x29, x22, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE aa - umlal v8.2D, v5.2S, v11.2S + umlal v8.2d, v5.2s, v11.2s add x26, x24, x24 - umlal v8.2D, v16.2S, v18.2S + umlal v8.2d, v16.2s, v18.2s stp x14, x29, [tmpa+16] - umlal v8.2D, v7.2S, v23.2S + umlal v8.2d, v7.2s, v23.2s add w24, w30, w30, lsl #1; - usra v25.2D, v29.2D, #26 + usra v25.2d, v29.2d, #26 add w24, w24, w30, lsl #4 umull x29, w15, w15 - umlal v27.2D, v2.2S, v14.2S + umlal v27.2d, v2.2s, v14.2s umull x3, w15, w13 - umlal v27.2D, v5.2S, v13.2S + umlal v27.2d, v5.2s, v13.2s add x21, x20, x20 - umlal v24.2D, v15.2S, v14.2S + umlal v24.2d, v15.2s, v14.2s umull x5, w19, w21 - umlal v24.2D, v4.2S, v13.2S + umlal v24.2d, v4.2s, v13.2s and x11, x1, #0x3ffffff - usra v8.2D, v25.2D, #25 + usra v8.2d, v25.2d, #25 and x1, x0, #0x1ffffff - umlal v27.2D, v16.2S, v11.2S + umlal v27.2d, v16.2s, v11.2s umaddl x23, w17, w13, x5 - umlal v27.2D, v7.2S, v18.2S + umlal v27.2d, v7.2s, v18.2s add x5, x30, x30 - usra v0.2D, v8.2D, #26 + usra v0.2d, v8.2d, #26 add x0, x15, x15 - umlal v24.2D, v10.2S, v11.2S + umlal v24.2d, v10.2s, v11.2s umaddl x23, w7, w0, x23 - umlal v24.2D, v6.2S, v18.2S + umlal v24.2d, v6.2s, v18.2s lsr x30, x7, #32 - usra v27.2D, v0.2D, #25 + usra v27.2d, v0.2d, #25 add x16, x30, x30 - and v20.16B, v8.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad + and v20.16b, v8.16b, v30.16b // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad umaddl x15, w30, w16, x23 - ushr v23.2D, v30.2D, #1 + ushr v23.2d, v30.2d, #1 add w23, w8, w8, lsl #1; - usra v24.2D, v27.2D, #26 + usra v24.2d, v27.2d, #26 add w23, w23, w8, lsl #4 umaddl x14, w19, w5, x3 - and v8.16B, v27.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = bc|ad + and v8.16b, v27.16b, v30.16b // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = bc|ad add x28, x8, x8 - and v27.16B, v0.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad + and v27.16b, v0.16b, v23.16b // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad umaddl x8, w8, w23, x15 - and v5.16B, v24.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad + and v5.16b, v24.16b, v23.16b // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad umaddl x3, w2, w28, x14 - umlal v22.2D, v15.2S, v28.2S + umlal v22.2d, v15.2s, v28.2s bfi x11, x27, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE aa - uzp1 v5.4S, v8.4S, v5.4S + uzp1 v5.4s, v8.4s, v5.4s umaddl x14, w24, w5, x29 umaddl x5, w19, w28, x14 ldr d18, [mask1] mov v18.d[1], v18.d[0] umaddl x15, w7, w26, x3 - mul v12.2S, v13.2S, v31.2S - umlal v21.2D, v16.2S, v13.2S + mul v12.2s, v13.2s, v31.2s + umlal v21.2d, v16.2s, v13.2s stp x9, x11, [tmpa+0] - umlal v21.2D, v7.2S, v11.2S + umlal v21.2d, v7.2s, v11.2s umaddl x29, w17, w26, x5 - umlal v22.2D, v4.2S, v14.2S + umlal v22.2d, v4.2s, v14.2s add w14, w20, w20, lsl #1; - umlal v22.2D, v10.2S, v13.2S + umlal v22.2d, v10.2s, v13.2s add w14, w14, w20, lsl #4 umull x3, w19, w0 - umlal v22.2D, v6.2S, v11.2S + umlal 
v22.2d, v6.2s, v11.2s umaddl x29, w7, w21, x29 - usra v21.2D, v24.2D, #25 + usra v21.2d, v24.2d, #25 umaddl x11, w20, w14, x12 - and v0.16B, v25.16B, v23.16B + and v0.16b, v25.16b, v23.16b umaddl x5, w30, w21, x15 - and v14.16B, v29.16B, v30.16B + and v14.16b, v29.16b, v30.16b umaddl x12, w16, w13, x29 - usra v22.2D, v21.2D, #26 + usra v22.2d, v21.2d, #26 umaddl x29, w17, w16, x3 - umlal v3.2D, v7.2S, v12.2S + umlal v3.2d, v7.2s, v12.2s add x9, x26, x26 - and v1.16B, v21.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = bc|ad + and v1.16b, v21.16b, v30.16b // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = bc|ad add x27, x5, x12, lsr #26 - bic v8.16B, v22.16B, v23.16B + bic v8.16b, v22.16b, v23.16b umaddl x29, w7, w7, x29 - and v17.16B, v22.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad + and v17.16b, v22.16b, v23.16b // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad add x5, x25, x27, lsr #25 - usra v3.2D, v8.2D, #25 + usra v3.2d, v8.2d, #25 umaddl x25, w24, w9, x8 - umlal v9.2D, v10.2S, v26.2S + umlal v9.2d, v10.2s, v26.2s add x8, x13, x13 - trn1 v22.4S, v1.4S, v17.4S + trn1 v22.4s, v1.4s, v17.4s umaddl x11, w10, w8, x11 - usra v3.2D, v8.2D, #24 + usra v3.2d, v8.2d, #24 umull x20, w19, w16 - add v26.2S, v22.2S, v18.2S + add v26.2s, v22.2s, v18.2s ldr d28, [mask2] - umlal v9.2D, v6.2S, v12.2S + umlal v9.2d, v6.2s, v12.2s umaddl x3, w23, w0, x11 - usra v3.2D, v8.2D, #21 + usra v3.2d, v8.2d, #21 umaddl x29, w10, w26, x29 - uzp1 v11.4S, v20.4S, v27.4S + uzp1 v11.4s, v20.4s, v27.4s umaddl x20, w2, w4, x20 umaddl x9, w10, w21, x20 mov v17.d[0], v22.d[1] - usra v9.2D, v3.2D, #26 + usra v9.2d, v3.2d, #26 umull x15, w19, w13 - and v7.16B, v3.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad + and v7.16b, v3.16b, v30.16b // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad add x11, x16, x16 - uzp2 v1.4S, v11.4S, v5.4S + uzp2 v1.4s, v11.4s, v5.4s umaddl x20, w23, w13, x9 - and v8.16B, v9.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad + and v8.16b, v9.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad umaddl x9, w2, w0, x15 - usra v14.2D, v9.2D, #25 + usra v14.2d, v9.2d, #25 and x6, x6, #0x3ffffff - uzp1 v7.4S, v7.4S, v8.4S + uzp1 v7.4s, v7.4s, v8.4s umaddl x29, w23, w21, x29 - uzp1 v27.4S, v11.4S, v5.4S + uzp1 v27.4s, v11.4s, v5.4s umull x15, w19, w26 - usra v0.2D, v14.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad + usra v0.2d, v14.2d, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad add x6, x6, x22, lsr #25 - and v3.16B, v14.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad + and v3.16b, v14.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad bic x22, x27, #0x1ffffff - sub v2.2S, v26.2S, v17.2S - add v9.2S, v22.2S, v17.2S - uzp1 v14.4S, v3.4S, v0.4S + sub v2.2s, v26.2s, v17.2s + add v9.2s, v22.2s, v17.2s + uzp1 v14.4s, v3.4s, v0.4s umaddl x2, w2, w21, x15 - add v5.4S, v27.4S, v18.4S + add v5.4s, v27.4s, v18.4s add x5, x5, x22, lsr #24 - zip1 v22.2S, v2.2S, v9.2S // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 + zip1 v22.2s, v2.2s, v9.2s // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 mov v18.b[0], v28.b[0] - uzp1 v8.4S, 
v7.4S, v14.4S + uzp1 v8.4s, v7.4s, v14.4s add x22, x5, x22, lsr #21 - uzp2 v3.4S, v7.4S, v14.4S + uzp2 v3.4s, v7.4s, v14.4s umaddl x5, w7, w16, x9 - add v25.4S, v8.4S, v18.4S + add v25.4s, v8.4s, v18.4s umaddl x15, w14, w0, x22 - add v12.4S, v27.4S, v1.4S + add v12.4s, v27.4s, v1.4s add x9, x17, x17 - sub v14.4S, v5.4S, v1.4S + sub v14.4s, v5.4s, v1.4s umull x19, w19, w17 - sub v18.4S, v25.4S, v3.4S + sub v18.4s, v25.4s, v3.4s ldr x22, [tmpa+8] - add v20.4S, v8.4S, v3.4S + add v20.4s, v8.4s, v3.4s umaddl x15, w10, w11, x15 - zip1 v16.4S, v14.4S, v12.4S // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 + zip1 v16.4s, v14.4s, v12.4s // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 umaddl x14, w14, w13, x19 - zip2 v14.4S, v14.4S, v12.4S // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 + zip2 v14.4s, v14.4s, v12.4s // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 and x17, x27, #0x1ffffff - zip2 v0.4S, v18.4S, v20.4S // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 + zip2 v0.4s, v18.4s, v20.4s // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 umaddl x15, w23, w4, x15 - zip1 v1.4S, v18.4S, v20.4S // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 + zip1 v1.4s, v18.4s, v20.4s // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 umaddl x10, w10, w0, x14 - zip2 v5.2S, v2.2S, v9.2S // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 - shl v24.2S, v0.2S, #1 + zip2 v5.2s, v2.2s, v9.2s // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 + shl v24.2s, v0.2s, #1 mov v19.d[0], v1.d[1] // ubignum_of_h32reglist 1 + ubignum_of_l32reglist 1 // INTERMEDIATE H|L = t1|t2 - shl v26.2S, v22.2S, #1 - shl v17.2S, v16.2S, #1 + shl v26.2s, v22.2s, #1 + shl v17.2s, v16.2s, #1 mov v15.d[0], v0.d[1] // ubignum_of_h32reglist 3 + ubignum_of_l32reglist 3 // INTERMEDIATE H|L = t1|t2 - shl v7.2S, v5.2S, #1 - shl v18.2S, v19.2S, #1 - umull v11.2D, v1.2S, v24.2S + shl v7.2s, v5.2s, #1 + shl v18.2s, v19.2s, #1 + umull v11.2d, v1.2s, v24.2s umaddl x19, w23, w16, x10 - umull v6.2D, v1.2S, v17.2S + umull v6.2d, v1.2s, v17.2s umaddl x10, w7, w13, x2 mov v4.d[0], v16.d[1] // ubignum_of_h32reglist 5 + ubignum_of_l32reglist 5 // INTERMEDIATE H|L = t1|t2 mov v10.d[0], v14.d[1] // ubignum_of_h32reglist 7 + ubignum_of_l32reglist 7 // INTERMEDIATE H|L = t1|t2 - umull v9.2D, v1.2S, v26.2S + umull v9.2d, v1.2s, v26.2s ldr x13, [tmpa+0] - shl v28.2S, v15.2S, #1 - shl v3.2S, v10.2S, #1 + shl v28.2s, v15.2s, #1 + shl v3.2s, v10.2s, #1 ldr x14, [tmpa+16] - mul v12.2S, v10.2S, v31.2S - umull v25.2D, v1.2S, v7.2S + mul v12.2s, v10.2s, v31.2s + umull v25.2d, v1.2s, v7.2s ldr x2, [tmpa+24] - umlal v6.2D, v18.2S, v28.2S + umlal v6.2d, v18.2s, v28.2s umaddl x27, w30, w0, x10 umaddl x16, w24, w0, x20 - shl v13.2S, v14.2S, #1 + shl v13.2s, v14.2s, #1 umaddl x5, w23, w26, x5 - mul v2.2S, v22.2S, v31.2S - umull v21.2D, v1.2S, v13.2S + mul v2.2s, v22.2s, v31.2s + umull v21.2d, v1.2s, v13.2s umaddl x23, w24, w8, x29 - umlal v11.2D, v18.2S, v19.2S + umlal v11.2d, v18.2s, v19.2s mov x10, #0x07fffffe07fffffe sub x10, x10, #2 umaddl x26, w24, w21, x5 - mul v29.2S, v14.2S, v31.2S - umlal v25.2D, v19.2S, v26.2S + mul v29.2s, v14.2s, v31.2s + umlal v25.2d, v19.2s, v26.2s add x7, x1, x6, lsr #26 - mul v20.2S, v4.2S, v31.2S + mul v20.2s, v4.2s, v31.2s and x6, x6, 
#0x3ffffff - shl v8.2S, v18.2S, #1 - shl v4.2S, v4.2S, #1 - umlal v11.2D, v29.2S, v14.2S + shl v8.2s, v18.2s, #1 + shl v4.2s, v4.2s, #1 + umlal v11.2d, v29.2s, v14.2s bfi x6, x7, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE aa - umlal v25.2D, v0.2S, v3.2S + umlal v25.2d, v0.2s, v3.2s umaddl x0, w24, w4, x19 - umlal v25.2D, v15.2S, v13.2S + umlal v25.2d, v15.2s, v13.2s str x6, [tmpa+32] - umlal v21.2D, v18.2S, v4.2S + umlal v21.2d, v18.2s, v4.2s umaddl x8, w24, w11, x3 - umlal v21.2D, v0.2S, v17.2S + umlal v21.2d, v0.2s, v17.2s ldr x30, [tmpa+32] - mul v14.2S, v5.2S, v31.2S + mul v14.2s, v5.2s, v31.2s add x2, x2, x10 - shl v5.2S, v28.2S, #1 - shl v27.2S, v4.2S, #1 - umlal v6.2D, v0.2S, v0.2S + shl v5.2s, v28.2s, #1 + shl v27.2s, v4.2s, #1 + umlal v6.2d, v0.2s, v0.2s umaddl x11, w24, w9, x15 - umlal v6.2D, v12.2S, v3.2S + umlal v6.2d, v12.2s, v3.2s add x4, x30, x10 - umlal v11.2D, v14.2S, v5.2S + umlal v11.2d, v14.2s, v5.2s add x3, x22, x10 - umlal v11.2D, v2.2S, v17.2S + umlal v11.2d, v2.2s, v17.2s add x6, x0, x11, lsr #26 - umlal v11.2D, v12.2S, v27.2S + umlal v11.2d, v12.2s, v27.2s add x14, x14, x10 - umlal v6.2D, v14.2S, v27.2S + umlal v6.2d, v14.2s, v27.2s add x8, x8, x6, lsr #25 - umlal v6.2D, v2.2S, v13.2S + umlal v6.2d, v2.2s, v13.2s movk x10, #0xffb4 - umlal v25.2D, v16.2S, v4.2S + umlal v25.2d, v16.2s, v4.2s add x29, x16, x8, lsr #26 - umull v27.2D, v1.2S, v3.2S + umull v27.2d, v1.2s, v3.2s and x11, x11, #0x3ffffff - umlal v9.2D, v18.2S, v3.2S + umlal v9.2d, v18.2s, v3.2s add x19, x13, x10 - umlal v9.2D, v0.2S, v13.2S + umlal v9.2d, v0.2s, v13.2s and x5, x8, #0x3ffffff - umlal v9.2D, v28.2S, v4.2S + umlal v9.2d, v28.2s, v4.2s bfi x11, x6, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE bb - umlal v9.2D, v16.2S, v16.2S + umlal v9.2d, v16.2s, v16.2s umaddl x30, w24, w28, x27 - umlal v9.2D, v14.2S, v7.2S + umlal v9.2d, v14.2s, v7.2s sub x13, x19, x11 - umull v10.2D, v1.2S, v18.2S + umull v10.2d, v1.2s, v18.2s add x7, x23, x29, lsr #25 - umlal v21.2D, v28.2S, v15.2S + umlal v21.2d, v28.2s, v15.2s lsr x16, x13, #32 // ubignum_of_wreglist 1 + ubignum_of_wreglist 0 // INTERMEDIATE e - umlal v21.2D, v2.2S, v22.2S + umlal v21.2d, v2.2s, v22.2s add x0, x26, x7, lsr #26 - usra v25.2D, v9.2D, #26 + usra v25.2d, v9.2d, #26 and x20, x7, #0x3ffffff - umull v22.2D, v1.2S, v1.2S + umull v22.2d, v1.2s, v1.2s add x8, x25, x0, lsr #25 - umull v7.2D, v1.2S, v28.2S + umull v7.2d, v1.2s, v28.2s and x1, x29, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bbalt - bic v18.16B, v25.16B, v23.16B + bic v18.16b, v25.16b, v23.16b and x19, x8, #0x3ffffff - and v16.16B, v9.16B, v30.16B + and v16.16b, v9.16b, v30.16b and x7, x12, #0x3ffffff - usra v22.2D, v18.2D, #25 + usra v22.2d, v18.2d, #25 add x10, x30, x8, lsr #26 - umlal v7.2D, v19.2S, v24.2S + umlal v7.2d, v19.2s, v24.2s bfi x5, x29, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE bb - and v9.16B, v25.16B, v23.16B + and v9.16b, v25.16b, v23.16b add x27, x7, x10, lsr #25 - usra v22.2D, v18.2D, #24 + usra v22.2d, v18.2d, #24 mov x21, #60833 lsl x21, x21, #1 add x15, x17, x27, lsr #26 - shl v25.2S, v3.2S, #1 - umlal v7.2D, v14.2S, v17.2S + shl v25.2s, v3.2s, #1 + umlal v7.2d, v14.2s, v17.2s and x29, x27, #0x3ffffff - usra v22.2D, v18.2D, #21 + usra v22.2d, v18.2d, #21 bfi x29, x15, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE bb // ***SOURCE*** ubignum_of_xreglist 9 // INTERMEDIATE bbalt - umlal v10.2D, v14.2S, v24.2S + umlal v10.2d, v14.2s, v24.2s and x17, x6, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bbalt - umlal v10.2D, v2.2S, 
v28.2S + umlal v10.2d, v2.2s, v28.2s sub x6, x3, x5 - umlal v10.2D, v12.2S, v17.2S + umlal v10.2d, v12.2s, v17.2s umaddl x25, w16, w21, x17 - umlal v10.2D, v29.2S, v4.2S + umlal v10.2d, v29.2s, v4.2s mov w12, w5 // ubignum_of_xreglist 2 // INTERMEDIATE bbalt - umlal v22.2D, v20.2S, v4.2S + umlal v22.2d, v20.2s, v4.2s lsr x26, x6, #32 // ubignum_of_wreglist 3 + ubignum_of_wreglist 2 // INTERMEDIATE e - umlal v22.2D, v14.2S, v8.2S + umlal v22.2d, v14.2s, v8.2s and x24, x0, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bbalt - umlal v22.2D, v2.2S, v24.2S + umlal v22.2d, v2.2s, v24.2s stp x11, x5, [tmpb+0] - umlal v22.2D, v12.2S, v5.2S + umlal v22.2d, v12.2s, v5.2s bfi x20, x0, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE bb - umlal v22.2D, v29.2S, v17.2S + umlal v22.2d, v29.2s, v17.2s umaddl x12, w6, w21, x12 - umull v18.2D, v1.2S, v4.2S + umull v18.2d, v1.2s, v4.2s bfi x19, x10, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE bb - umlal v7.2D, v2.2S, v4.2S + umlal v7.2d, v2.2s, v4.2s sub x7, x14, x20 - umlal v27.2D, v19.2S, v13.2S + umlal v27.2d, v19.2s, v13.2s mov w8, w20 // ubignum_of_xreglist 4 // INTERMEDIATE bbalt - usra v10.2D, v22.2D, #26 + usra v10.2d, v22.2d, #26 lsr x14, x7, #32 // ubignum_of_wreglist 5 + ubignum_of_wreglist 4 // INTERMEDIATE e - umlal v18.2D, v19.2S, v17.2S + umlal v18.2d, v19.2s, v17.2s and x28, x10, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bbalt - umlal v7.2D, v12.2S, v13.2S + umlal v7.2d, v12.2s, v13.2s sub x5, x2, x19 - usra v11.2D, v10.2D, #25 + usra v11.2d, v10.2d, #25 mov w2, w19 // ubignum_of_xreglist 6 // INTERMEDIATE bbalt - umlal v27.2D, v0.2S, v4.2S - umlal v21.2D, v14.2S, v25.2S + umlal v27.2d, v0.2s, v4.2s + umlal v21.2d, v14.2s, v25.2s sub x23, x4, x29 - usra v7.2D, v11.2D, #26 + usra v7.2d, v11.2d, #26 mov w0, w29 // ubignum_of_xreglist 8 // INTERMEDIATE bbalt - umlal v18.2D, v0.2S, v28.2S + umlal v18.2d, v0.2s, v28.2s lsr x22, x23, #32 // ubignum_of_wreglist 9 + ubignum_of_wreglist 8 // INTERMEDIATE e - umlal v27.2D, v15.2S, v17.2S + umlal v27.2d, v15.2s, v17.2s str x29, [tmpb+32] - usra v6.2D, v7.2D, #25 + usra v6.2d, v7.2d, #25 mov w17, w11 // ubignum_of_xreglist 0 // INTERMEDIATE bbalt - and v0.16B, v22.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 + and v0.16b, v22.16b, v30.16b // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 umaddl x27, w26, w21, x1 - umlal v18.2D, v14.2S, v13.2S + umlal v18.2d, v14.2s, v13.2s umaddl x30, w23, w21, x0 - umlal v18.2D, v2.2S, v3.2S + umlal v18.2d, v2.2s, v3.2s lsr x10, x5, #32 // ubignum_of_wreglist 7 + ubignum_of_wreglist 6 // INTERMEDIATE e - and v4.16B, v6.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 - and v1.16B, v10.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x5|t3 + and v4.16b, v6.16b, v30.16b // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 + and v1.16b, v10.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x5|t3 umaddl x4, w14, w21, x24 ldr x0, [tmpa+0] mov v0.s[1], w0 lsr x0, x0, #32 mov v1.s[1], w0 umaddl x9, w7, w21, x8 - usra v18.2D, v6.2D, #26 + usra v18.2d, v6.2d, #26 umaddl x24, w10, w21, x28 - and v3.16B, v7.16B, v23.16B // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 + and v3.16b, v7.16b, v23.16b // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 umaddl x8, w22, w21, x15 - umlal v27.2D, v14.2S, v26.2S 
+ umlal v27.2d, v14.2s, v26.2s umaddl x15, w13, w21, x17 - usra v21.2D, v18.2D, #25 + usra v21.2d, v18.2d, #25 stp x20, x19, [tmpb+16] - and v2.16B, v11.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 + and v2.16b, v11.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 lsr x29, x8, #25 ldr x3, [tmpb+0] mov v10.s[1], w3 lsr x3, x3, #32 mov v11.s[1], w3 add x17, x15, x29 - usra v27.2D, v21.2D, #26 + usra v27.2d, v21.2d, #26 add x28, x17, x29, lsl #1 - and v6.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 + and v6.16b, v21.16b, v30.16b // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 and x20, x8, #0x1ffffff - and v5.16B, v18.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 + and v5.16b, v18.16b, v23.16b // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 add x17, x28, x29, lsl #4 - and v7.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 + and v7.16b, v27.16b, v23.16b // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 ldr x3, [tmpb+8] mov v22.s[1], w3 lsr x3, x3, #32 @@ -990,7 +990,7 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v11.s[0], w15 and x11, x17, #0x3ffffff // ubignum_of_xreglist 0 // INTERMEDIATE bce - usra v16.2D, v27.2D, #25 + usra v16.2d, v27.2d, #25 add x8, x12, x29, lsr #25 ldr x3, [tmpb+16] mov v14.s[1], w3 @@ -1002,7 +1002,7 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v23.s[0], w15 add x28, x27, x8, lsr #26 - and v8.16B, v16.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + and v8.16b, v16.16b, v30.16b // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 umull x1, w12, w10 ldr x3, [tmpb+24] mov v17.s[1], w3 @@ -1014,7 +1014,7 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v15.s[0], w15 umaddl x19, w5, w21, x2 - usra v9.2D, v16.2D, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + usra v9.2d, v16.2d, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 add x2, x4, x25, lsr #26 ldr x3, [tmpb+32] mov v24.s[1], w3 @@ -1026,7 +1026,7 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v18.s[0], w15 add x29, x19, x2, lsr #25 - umull v26.2D, v0.2S, v23.2S + umull v26.2d, v0.2s, v23.2s and x21, x28, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bce ldr x0, [tmpa+8] mov v2.s[1], w0 @@ -1038,20 +1038,20 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v25.s[0], w15 add x17, x24, x29, lsr #26 - umull v29.2D, v1.2S, v18.2S + umull v29.2d, v1.2s, v18.2s and x15, x8, #0x3ffffff // ubignum_of_xreglist 2 // INTERMEDIATE bce - umull v20.2D, v0.2S, v15.2S + umull v20.2d, v0.2s, v15.2s add x19, x30, x17, lsr #25 and x3, x17, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bce - mul v12.2S, v25.2S, v31.2S + mul v12.2s, v25.2s, v31.2s ldr x0, [tmpa+16] mov v4.s[1], w0 lsr x0, x0, #32 mov v5.s[1], w0 add x4, x20, x19, lsr #26 // ubignum_of_xreglist 9 // INTERMEDIATE bce - umlal v26.2D, v2.2S, v11.2S + umlal v26.2d, v2.2s, v11.2s add w28, w3, w3, lsl #1; - umlal v20.2D, v2.2S, v23.2S + umlal v20.2d, v2.2s, v23.2s add w28, w28, w3, lsl #4 umull x8, w12, w5 ldr x0, [tmpa+24] @@ -1059,12 +1059,12 @@ curve25519_x25519_byte_scalarloop: lsr x0, x0, #32 mov v7.s[1], w0 and x30, x25, #0x3ffffff // 
ubignum_of_xreglist 4 // INTERMEDIATE bce - mul v16.2S, v18.2S, v31.2S + mul v16.2s, v18.2s, v31.2s add w17, w4, w4, lsl #1; - umull v21.2D, v1.2S, v15.2S + umull v21.2d, v1.2s, v15.2s add w17, w17, w4, lsl #4 umaddl x25, w21, w7, x8 - umlal v20.2D, v4.2S, v11.2S + umlal v20.2d, v4.2s, v11.2s add w8, w21, w21, lsl #1; ldr x0, [tmpa+32] add w8, w8, w21, lsl #4 @@ -1072,300 +1072,300 @@ curve25519_x25519_byte_scalarloop: lsr x0, x0, #32 mov v9.s[1], w0 and x2, x2, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bce - umlal v29.2D, v3.2S, v15.2S + umlal v29.2d, v3.2s, v15.2s umaddl x24, w2, w6, x25 - umull v13.2D, v0.2S, v25.2S + umull v13.2d, v0.2s, v25.2s umaddl x25, w2, w7, x27 umaddl x0, w3, w6, x25 - mul v19.2S, v15.2S, v31.2S - umull v27.2D, v0.2S, v18.2S + mul v19.2s, v15.2s, v31.2s + umull v27.2d, v0.2s, v18.2s umaddl x20, w3, w13, x24 - umlal v20.2D, v6.2S, v12.2S + umlal v20.2d, v6.2s, v12.2s umaddl x24, w21, w14, x1 - umlal v13.2D, v2.2S, v18.2S + umlal v13.2d, v2.2s, v18.2s umaddl x9, w4, w13, x0 - umull v25.2D, v0.2S, v11.2S + umull v25.2d, v0.2s, v11.2s umaddl x20, w17, w23, x20 - umlal v27.2D, v2.2S, v15.2S + umlal v27.2d, v2.2s, v15.2s umaddl x0, w2, w26, x24 - umull v28.2D, v1.2S, v11.2S + umull v28.2d, v1.2s, v11.2s umull x24, w17, w5 - umlal v29.2D, v5.2S, v23.2S + umlal v29.2d, v5.2s, v23.2s umaddl x9, w11, w22, x9 - umlal v13.2D, v4.2S, v15.2S + umlal v13.2d, v4.2s, v15.2s umaddl x27, w3, w16, x0 - umlal v27.2D, v4.2S, v23.2S + umlal v27.2d, v4.2s, v23.2s umull x0, w17, w14 - umlal v27.2D, v6.2S, v11.2S + umlal v27.2d, v6.2s, v11.2s umull x4, w12, w14 - umlal v27.2D, v8.2S, v12.2S + umlal v27.2d, v8.2s, v12.2s umaddl x25, w11, w10, x20 - umlal v27.2D, v1.2S, v17.2S + umlal v27.2d, v1.2s, v17.2s umaddl x0, w28, w10, x0 - umlal v13.2D, v6.2S, v23.2S + umlal v13.2d, v6.2s, v23.2s umull x3, w17, w6 - umlal v13.2D, v8.2S, v11.2S + umlal v13.2d, v8.2s, v11.2s umaddl x1, w21, w26, x4 - umlal v20.2D, v8.2S, v16.2S + umlal v20.2d, v8.2s, v16.2s umaddl x4, w2, w13, x24 - umlal v28.2D, v3.2S, v12.2S + umlal v28.2d, v3.2s, v12.2s umaddl x20, w28, w7, x3 - umlal v29.2D, v7.2S, v11.2S + umlal v29.2d, v7.2s, v11.2s and x3, x19, #0x3ffffff // ubignum_of_xreglist 9 // INTERMEDIATE bce - umlal v29.2D, v9.2S, v12.2S + umlal v29.2d, v9.2s, v12.2s umaddl x19, w17, w22, x27 add w27, w2, w2, lsl #1; - mul v18.2S, v24.2S, v31.2S + mul v18.2s, v24.2s, v31.2s add w27, w27, w2, lsl #4 - umlal v21.2D, v3.2S, v23.2S + umlal v21.2d, v3.2s, v23.2s umull x24, w17, w7 - umlal v13.2D, v1.2S, v24.2S + umlal v13.2d, v1.2s, v24.2s add x19, x19, x19 - shl v29.2D, v29.2D, #1 + shl v29.2d, v29.2d, #1 umaddl x1, w2, w16, x1 - umull v15.2D, v1.2S, v23.2S + umull v15.2d, v1.2s, v23.2s umaddl x0, w27, w22, x0 - umlal v29.2D, v0.2S, v24.2S + umlal v29.2d, v0.2s, v24.2s umaddl x2, w28, w5, x24 - mul v24.2S, v23.2S, v31.2S + mul v24.2s, v23.2s, v31.2s umaddl x4, w28, w23, x4 - umlal v21.2D, v5.2S, v11.2S + umlal v21.2d, v5.2s, v11.2s umaddl x24, w27, w5, x20 - umlal v20.2D, v1.2S, v14.2S + umlal v20.2d, v1.2s, v14.2s umaddl x20, w11, w23, x19 - umlal v26.2D, v4.2S, v12.2S + umlal v26.2d, v4.2s, v12.2s umaddl x19, w27, w23, x2 - umlal v26.2D, v6.2S, v16.2S + umlal v26.2d, v6.2s, v16.2s umaddl x2, w21, w6, x4 - umlal v29.2D, v2.2S, v17.2S + umlal v29.2d, v2.2s, v17.2s umaddl x24, w8, w23, x24 - umlal v15.2D, v3.2S, v11.2S + umlal v15.2d, v3.2s, v11.2s umaddl x0, w21, w16, x0 umaddl x4, w21, w13, x19 - mul v23.2S, v11.2S, v31.2S - umlal v20.2D, v3.2S, v22.2S + mul v23.2s, v11.2s, v31.2s + umlal v20.2d, v3.2s, v22.2s 
umaddl x2, w12, w7, x2 - umlal v20.2D, v5.2S, v10.2S + umlal v20.2d, v5.2s, v10.2s umaddl x19, w12, w26, x0 - umlal v29.2D, v4.2S, v14.2S + umlal v29.2d, v4.2s, v14.2s umaddl x0, w12, w13, x24 - umlal v26.2D, v8.2S, v19.2S + umlal v26.2d, v8.2s, v19.2s umaddl x20, w15, w5, x20 - umlal v26.2D, v1.2S, v22.2S + umlal v26.2d, v1.2s, v22.2s umaddl x21, w15, w10, x9 - umlal v26.2D, v3.2S, v10.2S + umlal v26.2d, v3.2s, v10.2s and x9, x29, #0x3ffffff // ubignum_of_xreglist 6 // INTERMEDIATE bce - umlal v29.2D, v6.2S, v22.2S + umlal v29.2d, v6.2s, v22.2s umaddl x20, w30, w7, x20 umaddl x1, w28, w22, x1 add x24, x19, x19 - umull v11.2D, v1.2S, v12.2S + umull v11.2d, v1.2s, v12.2s add w19, w3, w3, lsl #1; - umlal v26.2D, v5.2S, v18.2S + umlal v26.2d, v5.2s, v18.2s add w19, w19, w3, lsl #4 umaddl x20, w9, w6, x20 - umlal v29.2D, v8.2S, v10.2S + umlal v29.2d, v8.2s, v10.2s add w29, w9, w9, lsl #1; - umlal v13.2D, v3.2S, v17.2S + umlal v13.2d, v3.2s, v17.2s add w29, w29, w9, lsl #4 umaddl x2, w19, w10, x2 - umlal v11.2D, v3.2S, v16.2S + umlal v11.2d, v3.2s, v16.2s umaddl x21, w30, w14, x21 - umlal v11.2D, v5.2S, v19.2S + umlal v11.2d, v5.2s, v19.2s umaddl x20, w3, w13, x20 - umlal v11.2D, v7.2S, v24.2S + umlal v11.2d, v7.2s, v24.2s umaddl x2, w29, w22, x2 - umlal v11.2D, v9.2S, v23.2S + umlal v11.2d, v9.2s, v23.2s umaddl x21, w9, w26, x21 - ushr v23.2D, v30.2D, #1 + ushr v23.2d, v30.2d, #1 umaddl x1, w17, w10, x1 - umlal v13.2D, v5.2S, v14.2S + umlal v13.2d, v5.2s, v14.2s umaddl x24, w19, w5, x24 - umlal v27.2D, v3.2S, v14.2S + umlal v27.2d, v3.2s, v14.2s umaddl x21, w3, w16, x21 - shl v11.2D, v11.2D, #1 + shl v11.2d, v11.2d, #1 add w3, w30, w30, lsl #1; - umlal v28.2D, v5.2S, v16.2S + umlal v28.2d, v5.2s, v16.2s add w3, w3, w30, lsl #4 umaddl x24, w29, w23, x24 - umlal v28.2D, v7.2S, v19.2S + umlal v28.2d, v7.2s, v19.2s add x1, x1, x1 - umlal v28.2D, v9.2S, v24.2S + umlal v28.2d, v9.2s, v24.2s umaddl x1, w11, w5, x1 - umlal v15.2D, v5.2S, v12.2S + umlal v15.2d, v5.2s, v12.2s umaddl x24, w30, w13, x24 - umlal v15.2D, v7.2S, v16.2S + umlal v15.2d, v7.2s, v16.2s umaddl x25, w15, w14, x25 - umlal v15.2D, v9.2S, v19.2S + umlal v15.2d, v9.2s, v19.2s umaddl x1, w15, w7, x1 - shl v28.2D, v28.2D, #1 + shl v28.2d, v28.2d, #1 umaddl x24, w15, w6, x24 - umlal v21.2D, v7.2S, v12.2S + umlal v21.2d, v7.2s, v12.2s umaddl x2, w30, w16, x2 - umlal v21.2D, v9.2S, v16.2S + umlal v21.2d, v9.2s, v16.2s umaddl x25, w30, w26, x25 - shl v15.2D, v15.2D, #1 + shl v15.2d, v15.2d, #1 umaddl x30, w30, w6, x1 - umlal v28.2D, v0.2S, v22.2S + umlal v28.2d, v0.2s, v22.2s umaddl x1, w15, w26, x2 - umlal v28.2D, v2.2S, v10.2S + umlal v28.2d, v2.2s, v10.2s umaddl x2, w9, w16, x25 - shl v21.2D, v21.2D, #1 + shl v21.2d, v21.2d, #1 umaddl x24, w11, w7, x24 - umlal v15.2D, v0.2S, v14.2S + umlal v15.2d, v0.2s, v14.2s umaddl x1, w11, w14, x1 - umlal v21.2D, v0.2S, v17.2S + umlal v21.2d, v0.2s, v17.2s umaddl x25, w9, w13, x30 - umlal v28.2D, v4.2S, v18.2S + umlal v28.2d, v4.2s, v18.2s umaddl x0, w19, w26, x0 - umlal v25.2D, v2.2S, v12.2S + umlal v25.2d, v2.2s, v12.2s add x1, x1, x24, lsr #26 - umlal v25.2D, v4.2S, v16.2S + umlal v25.2d, v4.2s, v16.2s umaddl x30, w19, w22, x2 - umlal v21.2D, v2.2S, v14.2S + umlal v21.2d, v2.2s, v14.2s umaddl x4, w12, w6, x4 - mul v14.2S, v14.2S, v31.2S + mul v14.2s, v14.2s, v31.2s umaddl x25, w19, w23, x25 and x2, x1, #0x1ffffff - mul v16.2S, v17.2S, v31.2S - umlal v25.2D, v6.2S, v19.2S + mul v16.2s, v17.2s, v31.2s + umlal v25.2d, v6.2s, v19.2s umaddl x9, w19, w14, x4 - umlal v13.2D, v7.2S, v22.2S + umlal v13.2d, 
v7.2s, v22.2s add x25, x25, x1, lsr #25 - umlal v21.2D, v4.2S, v22.2S + umlal v21.2d, v4.2s, v22.2s umaddl x0, w29, w14, x0 - umlal v26.2D, v7.2S, v16.2S + umlal v26.2d, v7.2s, v16.2s add x30, x30, x25, lsr #26 - umlal v26.2D, v9.2S, v14.2S + umlal v26.2d, v9.2s, v14.2s add w1, w15, w15, lsl #1; - umlal v28.2D, v6.2S, v16.2S + umlal v28.2d, v6.2s, v16.2s add w1, w1, w15, lsl #4 add x4, x20, x30, lsr #25 - umlal v28.2D, v8.2S, v14.2S + umlal v28.2d, v8.2s, v14.2s and x25, x25, #0x3ffffff - umlal v15.2D, v2.2S, v22.2S + umlal v15.2d, v2.2s, v22.2s add x21, x21, x4, lsr #26 - umlal v11.2D, v0.2S, v10.2S + umlal v11.2d, v0.2s, v10.2s bfi x25, x30, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE z4 - umlal v11.2D, v2.2S, v18.2S + umlal v11.2d, v2.2s, v18.2s bic x30, x21, #0x3ffffff - usra v26.2D, v28.2D, #26 + usra v26.2d, v28.2d, #26 lsr x20, x30, #26 - umlal v15.2D, v4.2S, v10.2S + umlal v15.2d, v4.2s, v10.2s add x20, x20, x30, lsr #25 - umlal v15.2D, v6.2S, v18.2S + umlal v15.2d, v6.2s, v18.2s umaddl x9, w29, w10, x9 - umlal v15.2D, v8.2S, v16.2S + umlal v15.2d, v8.2s, v16.2s add x30, x20, x30, lsr #22 - umlal v27.2D, v5.2S, v22.2S + umlal v27.2d, v5.2s, v22.2s umull x20, w17, w26 - umlal v20.2D, v7.2S, v18.2S + umlal v20.2d, v7.2s, v18.2s umaddl x30, w17, w16, x30 - umlal v20.2D, v9.2S, v16.2S + umlal v20.2d, v9.2s, v16.2s umaddl x17, w3, w10, x0 - usra v15.2D, v26.2D, #25 + usra v15.2d, v26.2d, #25 umaddl x0, w28, w14, x20 - umlal v27.2D, v7.2S, v10.2S + umlal v27.2d, v7.2s, v10.2s umaddl x20, w28, w26, x30 - umlal v27.2D, v9.2S, v18.2S + umlal v27.2d, v9.2s, v18.2s add w28, w12, w12, lsl #1; - usra v20.2D, v15.2D, #26 + usra v20.2d, v15.2d, #26 add w28, w28, w12, lsl #4 umaddl x30, w27, w10, x0 - and v17.16B, v15.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5 + and v17.16b, v15.16b, v30.16b // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5 umaddl x27, w27, w14, x20 umaddl x0, w8, w10, x27 - mul v12.2S, v22.2S, v31.2S - and v15.16B, v20.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5 + mul v12.2s, v22.2s, v31.2s + and v15.16b, v20.16b, v23.16b // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5 umaddl x14, w3, w22, x9 - umlal v21.2D, v6.2S, v10.2S + umlal v21.2d, v6.2s, v10.2s umaddl x27, w8, w22, x30 - trn1 v15.4S, v17.4S, v15.4S // FINAL z3 + trn1 v15.4s, v17.4s, v15.4s // FINAL z3 umaddl x10, w28, w22, x0 - umlal v11.2D, v4.2S, v16.2S + umlal v11.2d, v4.2s, v16.2s umaddl x30, w15, w16, x14 - and v26.16B, v26.16B, v23.16B + and v26.16b, v26.16b, v23.16b umaddl x28, w12, w16, x27 - umlal v21.2D, v8.2S, v18.2S + umlal v21.2d, v8.2s, v18.2s add x10, x10, x10 - umlal v25.2D, v8.2S, v24.2S + umlal v25.2d, v8.2s, v24.2s umaddl x20, w19, w6, x10 - umlal v25.2D, v1.2S, v10.2S + umlal v25.2d, v1.2s, v10.2s add x28, x28, x28 - umlal v25.2D, v3.2S, v18.2S + umlal v25.2d, v3.2s, v18.2s umaddl x28, w19, w7, x28 - usra v21.2D, v20.2D, #25 + usra v21.2d, v20.2d, #25 umaddl x0, w29, w7, x20 - umlal v11.2D, v6.2S, v14.2S + umlal v11.2d, v6.2s, v14.2s umaddl x10, w11, w26, x30 - umlal v13.2D, v9.2S, v10.2S + umlal v13.2d, v9.2s, v10.2s umaddl x19, w29, w5, x28 - usra v27.2D, v21.2D, #26 + usra v27.2d, v21.2d, #26 umaddl x0, w3, w5, x0 - umlal v25.2D, v5.2S, v16.2S + umlal v25.2d, v5.2s, v16.2s umaddl x20, w1, w22, x17 - and v20.16B, v28.16B, v30.16B + and v20.16b, v28.16b, v30.16b umaddl x29, w3, w23, x19 - usra v29.2D, v27.2D, #25 + usra v29.2d, v27.2d, #25 umaddl x3, 
w1, w23, x0 - and v27.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5 - umlal v11.2D, v8.2S, v12.2S + and v27.16b, v27.16b, v23.16b // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5 + umlal v11.2d, v8.2s, v12.2s umaddl x12, w15, w13, x29 - usra v13.2D, v29.2D, #26 + usra v13.2d, v29.2d, #26 umaddl x7, w11, w13, x3 - trn1 v6.4S, v6.4S, v7.4S + trn1 v6.4s, v6.4s, v7.4s umaddl x17, w11, w16, x20 - umlal v25.2D, v7.2S, v14.2S + umlal v25.2d, v7.2s, v14.2s and x23, x4, #0x3ffffff - bic v19.16B, v13.16B, v23.16B + bic v19.16b, v13.16b, v23.16b umaddl x19, w11, w6, x12 - and v28.16B, v13.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = x4|z5 + and v28.16b, v13.16b, v23.16b // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = x4|z5 add x3, x17, x7, lsr #26 - usra v11.2D, v19.2D, #25 - trn1 v2.4S, v2.4S, v3.4S + usra v11.2d, v19.2d, #25 + trn1 v2.4s, v2.4s, v3.4s add x17, x19, x3, lsr #25 - and v13.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5 + and v13.16b, v21.16b, v30.16b // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5 and x5, x7, #0x3ffffff - usra v11.2D, v19.2D, #24 + usra v11.2d, v19.2d, #24 add x7, x10, x17, lsr #26 - trn1 v0.4S, v0.4S, v1.4S + trn1 v0.4s, v0.4s, v1.4s and x19, x24, #0x3ffffff - and v21.16B, v29.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5 + and v21.16b, v29.16b, v30.16b // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5 add x29, x19, x7, lsr #25 - usra v11.2D, v19.2D, #21 + usra v11.2d, v19.2d, #21 bfi x5, x3, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE z4 - trn1 v17.4S, v13.4S, v27.4S // FINAL z3 + trn1 v17.4s, v13.4s, v27.4s // FINAL z3 add x19, x2, x29, lsr #26 - trn1 v19.4S, v21.4S, v28.4S // FINAL z3 + trn1 v19.4s, v21.4s, v28.4s // FINAL z3 and x3, x29, #0x3ffffff mov v16.d[0], v6.d[1] // FINAL x3 mov v6.d[0], v17.d[1] // FINAL x2 - trn1 v8.4S, v8.4S, v9.4S + trn1 v8.4s, v8.4s, v9.4s bfi x3, x19, #32, #26 // ubignum_of_preglist 2 // INTERMEDIATE z4 - and v21.16B, v11.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5 + and v21.16b, v11.16b, v30.16b // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5 bfi x23, x21, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE z4 mov v18.d[0], v8.d[1] // FINAL x3 mov v8.d[0], v19.d[1] // FINAL x2 - umlal v25.2D, v9.2S, v12.2S + umlal v25.2d, v9.2s, v12.2s mov v9.d[0], x23 // FINAL z2 mov v7.d[0], x25 // FINAL z2 ldr d29, [mask1] mov v12.d[0], v2.d[1] // FINAL x3 - trn1 v4.4S, v4.4S, v5.4S + trn1 v4.4s, v4.4s, v5.4s and x17, x17, #0x3ffffff - usra v25.2D, v11.2D, #26 + usra v25.2d, v11.2d, #26 mov v10.d[0], v0.d[1] // FINAL x3 mov v14.d[0], v4.d[1] // FINAL x3 mov v4.d[0], v15.d[1] // FINAL x2 - usra v20.2D, v25.2D, #25 - and v27.16B, v25.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 + usra v20.2d, v25.2d, #25 + and v27.16b, v25.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 mov v5.d[0], x3 // depth 86 mov v1.d[0], x5 // FINAL z2 - usra v26.2D, v20.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 - and v28.16B, v20.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = 
x4|z5 - trn1 v11.4S, v21.4S, v27.4S // FINAL z3 - trn1 v13.4S, v28.4S, v26.4S // FINAL z3 + usra v26.2d, v20.2d, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 + and v28.16b, v20.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 + trn1 v11.4s, v21.4s, v27.4s // FINAL z3 + trn1 v13.4s, v28.4s, v26.4s // FINAL z3 mov v0.d[0], v11.d[1] // FINAL x2 mov v3.d[0], x17 // FINAL z2 mov v2.d[0], v13.d[1] // FINAL x2 From 183d8240d923a93a971e450364045a3efeb3ad93 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 6 Mar 2024 21:28:47 -0800 Subject: [PATCH 15/24] Remove one redundant instruction, tidy a few proofs with WORD_BLAST There was a pointless duplication of the first instruction of the x86 version of bignum_mul_p521_alt, moving one of the input arguments out of rdx. A number of explicit incantations expanding sums bitwise are now replaced by simple applications of WORD_BLAST, which is in effect automating the same sort of pattern, and more. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/aa9d9a71c5cc81e7f22e84adc839ccaeac21a5f1 --- x86_att/p521/bignum_mul_p521_alt.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/x86_att/p521/bignum_mul_p521_alt.S b/x86_att/p521/bignum_mul_p521_alt.S index f87546928a..a769fa0b3a 100644 --- a/x86_att/p521/bignum_mul_p521_alt.S +++ b/x86_att/p521/bignum_mul_p521_alt.S @@ -74,10 +74,6 @@ S2N_BN_SYMBOL(bignum_mul_p521_alt): movq %rdx, y -// Copy y into a safe register to start with - - mov %rdx, y - // Start doing a conventional columnwise multiplication, // temporarily storing the lower 9 digits to the stack. // Start with result term 0 From 8478655720b3c9417c1b8e2557f5666b35c0e793 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 7 Mar 2024 17:45:24 -0800 Subject: [PATCH 16/24] Bifurcate microarchitectural variants of Weierstrass doublings The Jacobian point doubling operations for the curves P-256, P-384, P-521, secp256k1 and SM2 now all have the usual two versions targeting different microarchitectures, one of them called "_alt", following the general s2n-bignum convention. The "_alt" forms for ARM now present are just renamed versions of the originals (which were based on "_alt" field operations), with the new code taking over the old non-alt name. For x86 the non-alt ones are the same as before and the "_alt" forms are new. 
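For callers the bifurcation is transparent: both entry points expose the same interface, so switching variants is only a matter of which symbol is called or linked. A minimal C sketch, assuming the non-alt prototype matches the one declared in the new p384_montjdouble_alt.S header comment; the wrapper and the USE_ALT_FIELD_OPS switch below are illustrative only, not s2n-bignum configuration knobs:

    #include <stdint.h>

    /* Prototypes as given in the s2n-bignum header comments: p3 := 2 * p1,
     * where p1 and p3 are Jacobian triples of 6-limb Montgomery-form
     * coordinates, i.e. 18 uint64_t words each. The non-alt prototype is
     * assumed to be identical to the _alt one quoted in this patch. */
    extern void p384_montjdouble(uint64_t p3[static 18], uint64_t p1[static 18]);
    extern void p384_montjdouble_alt(uint64_t p3[static 18], uint64_t p1[static 18]);

    /* Hypothetical wrapper: pick one variant at build time for the target
     * microarchitecture; the actual selection policy is outside this patch. */
    static void p384_double_point(uint64_t out[static 18], uint64_t in[static 18])
    {
    #ifdef USE_ALT_FIELD_OPS
        p384_montjdouble_alt(out, in);
    #else
        p384_montjdouble(out, in);
    #endif
    }
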
s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/38c65015fbb705641ed9696e8a1785060ecf38d2 --- arm/p384/Makefile | 1 + arm/p384/p384_montjdouble.S | 1107 ++++++++------- arm/p384/p384_montjdouble_alt.S | 951 +++++++++++++ arm/p521/Makefile | 1 + arm/p521/p521_jdouble.S | 1980 ++++++++++++++------------- arm/p521/p521_jdouble_alt.S | 1458 ++++++++++++++++++++ x86_att/p384/p384_montjdouble_alt.S | 1196 ++++++++++++++++ x86_att/p521/p521_jdouble_alt.S | 1865 +++++++++++++++++++++++++ 8 files changed, 7126 insertions(+), 1433 deletions(-) create mode 100644 arm/p384/p384_montjdouble_alt.S create mode 100644 arm/p521/p521_jdouble_alt.S create mode 100644 x86_att/p384/p384_montjdouble_alt.S create mode 100644 x86_att/p521/p521_jdouble_alt.S diff --git a/arm/p384/Makefile b/arm/p384/Makefile index 564b9dd93c..bfc0870b40 100644 --- a/arm/p384/Makefile +++ b/arm/p384/Makefile @@ -46,6 +46,7 @@ OBJ = bignum_add_p384.o \ bignum_triple_p384.o \ p384_montjadd.o \ p384_montjdouble.o \ + p384_montjdouble_alt.o \ p384_montjmixadd.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/arm/p384/p384_montjdouble.S b/arm/p384/p384_montjdouble.S index 7dfd9766f2..3f92103cad 100644 --- a/arm/p384/p384_montjdouble.S +++ b/arm/p384/p384_montjdouble.S @@ -26,8 +26,8 @@ // Stable homes for input arguments during main code sequence -#define input_z x23 -#define input_x x24 +#define input_z x25 +#define input_x x26 // Pointer-offset pairs for inputs and outputs @@ -56,501 +56,652 @@ #define d sp, #(NUMSIZE*6) #define x4p sp, #(NUMSIZE*6) -#define NSPACE (NUMSIZE*7) +#define NSPACE #(NUMSIZE*7) -// Corresponds exactly to bignum_montmul_p384_alt +// Corresponds exactly to bignum_montmul_p384 #define montmul_p384(P0,P1,P2) \ ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs 
x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ + ldp x5, x6, [P1+16]; \ + ldp x7, x8, [P1+32]; \ + ldp x9, x10, [P2]; \ + ldp x11, x12, [P2+16]; \ + ldp x13, x14, [P2+32]; \ + mul x15, x3, x9; \ + mul x21, x4, x10; \ + mul x22, x5, x11; \ + umulh x23, x3, x9; \ + umulh x24, x4, x10; \ + umulh x1, x5, x11; \ + adds x23, x23, x21; \ + adcs x24, x24, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x24, x23; \ + adcs x19, x1, x24; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x24; \ + adc x1, x1, xzr; \ + subs x24, x3, x4; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x3, x5; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x4, x5; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, 
xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ + sbc x15, x15, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ + sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ + sbc x17, x17, xzr; \ + stp x19, x20, [P0]; \ + stp x1, x15, [P0+16]; \ + stp x16, x17, [P0+32]; \ + mul x15, x6, x12; \ + mul x21, x7, x13; \ + mul x22, x8, x14; \ + umulh x23, x6, x12; \ + umulh x24, x7, x13; \ + umulh x1, x8, x14; \ + adds x23, x23, x21; \ + adcs x24, x24, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x24, x23; \ + adcs x19, x1, x24; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x24; \ + adc x1, x1, xzr; \ + subs x24, x6, x7; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x13, x12; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x6, x8; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x14, x12; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x7, x8; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x14, x13; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, 
x20, x22; \ + adc x1, x1, x23; \ + subs x6, x6, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x5; \ + ngc x3, xzr; \ + cmn x3, #1; \ + eor x6, x6, x3; \ + adcs x6, x6, xzr; \ + eor x7, x7, x3; \ + adcs x7, x7, xzr; \ + eor x8, x8, x3; \ + adc x8, x8, xzr; \ + subs x9, x9, x12; \ + sbcs x10, x10, x13; \ + sbcs x11, x11, x14; \ + ngc x14, xzr; \ + cmn x14, #1; \ + eor x9, x9, x14; \ + adcs x9, x9, xzr; \ + eor x10, x10, x14; \ + adcs x10, x10, xzr; \ + eor x11, x11, x14; \ + adc x11, x11, xzr; \ + eor x14, x3, x14; \ + ldp x21, x22, [P0]; \ + adds x15, x15, x21; \ + adcs x16, x16, x22; \ + ldp x21, x22, [P0+16]; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + ldp x21, x22, [P0+32]; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adc x2, xzr, xzr; \ + stp x15, x16, [P0]; \ + stp x17, x19, [P0+16]; \ + stp x20, x1, [P0+32]; \ + mul x15, x6, x9; \ + mul x21, x7, x10; \ + mul x22, x8, x11; \ + umulh x23, x6, x9; \ + umulh x24, x7, x10; \ + umulh x1, x8, x11; \ + adds x23, x23, x21; \ + adcs x24, x24, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x24, x23; \ + adcs x19, x1, x24; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x24; \ + adc x1, x1, xzr; \ + subs x24, x6, x7; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x6, x8; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x7, x8; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + ldp x3, x4, [P0]; \ + ldp x5, x6, [P0+16]; \ + ldp x7, x8, [P0+32]; \ + cmn x14, #1; \ + eor x15, x15, x14; \ + adcs x15, x15, x3; \ + eor x16, x16, x14; \ + adcs x16, x16, x4; \ + eor x17, x17, x14; \ + adcs x17, x17, x5; \ + eor x19, x19, x14; \ + adcs x19, x19, x6; \ + eor x20, x20, x14; \ + adcs x20, x20, x7; \ + eor x1, x1, x14; \ + adcs x1, x1, x8; \ + adcs x9, x14, x2; \ + adcs x10, x14, xzr; \ + adcs x11, x14, xzr; \ + adc x12, x14, xzr; \ + adds x19, x19, x3; \ + adcs x20, x20, x4; \ + adcs x1, x1, x5; \ + adcs x9, x9, x6; \ + adcs x10, x10, x7; \ + adcs x11, x11, x8; \ + adc x12, x12, x2; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ + lsl x23, x16, #32; \ + 
add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ sbcs x15, x15, xzr; \ sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ + adds x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adc x12, x12, xzr; \ + add x22, x12, #1; \ + lsl x21, x22, #32; \ + subs x24, x22, x21; \ + sbc x21, x21, xzr; \ + adds x19, x19, x24; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adcs x9, x9, xzr; \ adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt + adcs x11, x11, xzr; \ + csetm x22, lo; \ + mov x23, #4294967295; \ + and x23, x23, x22; \ + adds x19, x19, x23; \ + eor x23, x23, x22; \ + adcs x20, x20, x23; \ + mov x23, #-2; \ + and x23, x23, x22; \ + adcs x1, x1, x23; \ + adcs x9, x9, x22; \ + adcs x10, x10, x22; \ + adc x11, x11, x22; \ + stp x19, x20, [P0]; \ + stp x1, x9, [P0+16]; \ + stp x10, x11, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384 #define montsqr_p384(P0,P1) \ ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ + mul x14, x2, x3; \ + mul x15, x2, x4; \ + mul x16, x3, x4; \ + mul x8, x2, x2; \ + mul x10, x3, x3; \ + mul x12, x4, x4; \ + umulh x17, x2, x3; \ + adds x15, 
x15, x17; \ + umulh x17, x2, x4; \ + adcs x16, x16, x17; \ + umulh x17, x3, x4; \ adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ + umulh x9, x2, x2; \ + umulh x11, x3, x3; \ + umulh x13, x4, x4; \ + adds x14, x14, x14; \ adcs x15, x15, x15; \ adcs x16, x16, x16; \ adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x13, x13, xzr; \ + adds x9, x9, x14; \ + adcs x10, x10, x15; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x13, xzr; \ + lsl x16, x8, #32; \ + add x8, x16, x8; \ + lsr x16, x8, #32; \ + subs x16, x16, x8; \ + sbc x15, x8, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x8; \ + adc x14, xzr, xzr; \ + subs x9, x9, x16; \ + sbcs x10, x10, x15; \ + sbcs x11, x11, x14; \ sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x16, x9, #32; \ + add x9, x16, x9; \ + lsr x16, x9, #32; \ + subs x16, x16, x9; \ + sbc x15, x9, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x9; \ + adc x14, xzr, xzr; \ + subs x10, x10, x16; \ + sbcs x11, x11, x15; \ + sbcs x12, x12, x14; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x16, x10, #32; \ + add x10, x16, x10; \ + lsr x16, x10, #32; \ + subs x16, x16, x10; \ + sbc x15, x10, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x10; \ + adc x14, xzr, xzr; \ + subs x11, x11, x16; \ + sbcs x12, x12, x15; \ + sbcs x13, x13, x14; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, 
x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ + stp x11, x12, [P0]; \ + stp x13, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + mul x8, x2, x5; \ + mul x14, x3, x6; \ + mul x15, x4, x7; \ + umulh x16, x2, x5; \ + umulh x17, x3, x6; \ + umulh x1, x4, x7; \ + adds x16, x16, x14; \ + adcs x17, x17, x15; \ + adc x1, x1, xzr; \ + adds x9, x16, x8; \ + adcs x10, x17, x16; \ + adcs x11, x1, x17; \ + adc x12, x1, xzr; \ + adds x10, x10, x8; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x1, xzr; \ + subs x17, x2, x3; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x6, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x9, x9, x16; \ + adcs x10, x10, x15; \ + adcs x11, x11, x14; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x2, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x10, x10, x16; \ + adcs x11, x11, x15; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x3, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x6; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x11, x11, x16; \ + adcs x12, x12, x15; \ + adc x13, x13, x14; \ + adds x8, x8, x8; \ + adcs x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adc x17, xzr, xzr; \ + ldp x2, x3, [P0]; \ + adds x8, x8, x2; \ + adcs x9, x9, x3; \ + ldp x2, x3, [P0+16]; \ + adcs x10, x10, x2; \ + adcs x11, x11, x3; \ + ldp x2, x3, [P0+32]; \ + adcs x12, x12, x2; \ + adcs x13, x13, x3; \ + adc x17, x17, xzr; \ + lsl x4, x8, #32; \ + add x8, x4, x8; \ + lsr x4, x8, #32; \ + subs x4, x4, x8; \ + sbc x3, x8, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x8; \ + adc x2, xzr, xzr; \ + subs x9, x9, x4; \ sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ + sbcs x11, x11, x2; \ sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, 
[P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] + sbcs x13, x13, xzr; \ + sbc x8, x8, xzr; \ + lsl x4, x9, #32; \ + add x9, x4, x9; \ + lsr x4, x9, #32; \ + subs x4, x4, x9; \ + sbc x3, x9, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x9; \ + adc x2, xzr, xzr; \ + subs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, x2; \ + sbcs x13, x13, xzr; \ + sbcs x8, x8, xzr; \ + sbc x9, x9, xzr; \ + lsl x4, x10, #32; \ + add x10, x4, x10; \ + lsr x4, x10, #32; \ + subs x4, x4, x10; \ + sbc x3, x10, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x10; \ + adc x2, xzr, xzr; \ + subs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, x2; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + adds x17, x17, x8; \ + adcs x8, x9, xzr; \ + adcs x9, x10, xzr; \ + adcs x10, xzr, xzr; \ + mul x1, x5, x5; \ + adds x11, x11, x1; \ + mul x14, x6, x6; \ + mul x15, x7, x7; \ + umulh x1, x5, x5; \ + adcs x12, x12, x1; \ + umulh x1, x6, x6; \ + adcs x13, x13, x14; \ + adcs x17, x17, x1; \ + umulh x1, x7, x7; \ + adcs x8, x8, x15; \ + adcs x9, x9, x1; \ + adc x10, x10, xzr; \ + mul x1, x5, x6; \ + mul x14, x5, x7; \ + mul x15, x6, x7; \ + umulh x16, x5, x6; \ + adds x14, x14, x16; \ + umulh x16, x5, x7; \ + adcs x15, x15, x16; \ + umulh x16, x6, x7; \ + adc x16, x16, xzr; \ + adds x1, x1, x1; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adc x5, xzr, xzr; \ + adds x12, x12, x1; \ + adcs x13, x13, x14; \ + adcs x17, x17, x15; \ + adcs x8, x8, x16; \ + adcs x9, x9, x5; \ + adc x10, x10, xzr; \ + mov x1, #-4294967295; \ + mov x14, #4294967295; \ + mov x15, #1; \ + cmn x11, x1; \ + adcs xzr, x12, x14; \ + adcs xzr, x13, x15; \ + adcs xzr, x17, xzr; \ + adcs xzr, x8, xzr; \ + adcs xzr, x9, xzr; \ + adc x10, x10, xzr; \ + neg x10, x10; \ + and x1, x1, x10; \ + adds x11, x11, x1; \ + and x14, x14, x10; \ + adcs x12, x12, x14; \ + and x15, x15, x10; \ + adcs x13, x13, x15; \ + adcs x17, x17, xzr; \ + adcs x8, x8, xzr; \ + adc x9, x9, xzr; \ + stp x11, x12, [P0]; \ + stp x13, x17, [P0+16]; \ + stp x8, x9, [P0+32] // Corresponds exactly to bignum_sub_p384 @@ -876,10 +1027,11 @@ S2N_BN_SYMBOL(p384_montjdouble): // Save regs and make room on stack for temporary variables - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - sub sp, sp, NSPACE + sub sp, sp, NSPACE+64 + stp x19, x20, [sp, NSPACE] + stp x21, x22, [sp, NSPACE+16] + stp x23, x24, [sp, NSPACE+32] + stp x25, x26, [sp, NSPACE+48] // Move the input arguments to stable places @@ -938,12 +1090,11 @@ S2N_BN_SYMBOL(p384_montjdouble): // Restore stack and registers - add sp, sp, NSPACE - - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - + ldp x19, x20, [sp, NSPACE] + ldp x21, x22, [sp, NSPACE+16] + ldp x23, x24, [sp, NSPACE+32] + ldp x25, x26, [sp, NSPACE+48] + add sp, sp, NSPACE+64 ret #if defined(__linux__) && defined(__ELF__) diff --git a/arm/p384/p384_montjdouble_alt.S b/arm/p384/p384_montjdouble_alt.S new file mode 100644 index 0000000000..0e83ff4a98 --- /dev/null +++ b/arm/p384/p384_montjdouble_alt.S @@ -0,0 +1,951 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjdouble_alt +// (uint64_t p3[static 18],uint64_t p1[static 18]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x23 +#define input_x x24 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x12, x3, x5; \ + umulh x13, x3, x5; \ + mul x11, x3, x6; \ + umulh x14, x3, x6; \ + adds x13, x13, x11; \ + ldp x7, x8, [P2+16]; \ + mul x11, x3, x7; \ + umulh x15, x3, x7; \ + adcs x14, x14, x11; \ + mul x11, x3, x8; \ + umulh x16, x3, x8; \ + adcs x15, x15, x11; \ + ldp x9, x10, [P2+32]; \ + mul x11, x3, x9; \ + umulh x17, x3, x9; \ + adcs x16, x16, x11; \ + mul x11, x3, x10; \ + umulh x19, x3, x10; \ + adcs x17, x17, x11; \ + adc x19, x19, xzr; \ + mul x11, x4, x5; \ + adds x13, x13, x11; \ + mul x11, x4, x6; \ + adcs x14, x14, x11; \ + mul x11, x4, x7; \ + adcs x15, x15, x11; \ + mul x11, x4, x8; \ + adcs x16, x16, x11; \ + mul x11, x4, x9; \ + adcs x17, x17, x11; \ + mul x11, x4, x10; \ + adcs x19, x19, x11; \ + cset x20, cs; \ + umulh x11, x4, x5; \ + adds x14, x14, x11; \ + umulh x11, x4, x6; \ + adcs x15, x15, x11; \ + umulh x11, x4, x7; \ + adcs x16, x16, x11; \ + umulh x11, x4, x8; \ + adcs x17, x17, x11; \ + umulh x11, x4, x9; \ + adcs x19, x19, x11; \ + umulh x11, x4, x10; \ + adc x20, x20, x11; \ + ldp x3, x4, [P1+16]; \ + mul x11, x3, x5; \ + adds x14, x14, x11; \ + mul x11, x3, x6; \ + adcs x15, x15, x11; \ + mul x11, x3, x7; \ + adcs x16, x16, x11; \ + mul x11, x3, x8; \ + adcs x17, x17, x11; \ + mul x11, x3, x9; \ + adcs x19, x19, x11; \ + mul x11, x3, x10; \ + adcs x20, x20, x11; \ + cset x21, cs; \ + umulh x11, x3, x5; \ + adds x15, x15, x11; \ + umulh x11, x3, x6; \ + adcs x16, x16, x11; \ + umulh x11, x3, x7; \ + adcs x17, x17, x11; \ + umulh x11, x3, x8; \ + adcs x19, x19, x11; \ + umulh x11, x3, x9; \ + adcs x20, x20, x11; \ + umulh x11, x3, x10; \ + adc x21, x21, x11; \ + mul x11, x4, x5; \ + adds x15, x15, x11; \ + mul x11, x4, x6; \ + adcs x16, x16, x11; \ + mul x11, x4, 
x7; \ + adcs x17, x17, x11; \ + mul x11, x4, x8; \ + adcs x19, x19, x11; \ + mul x11, x4, x9; \ + adcs x20, x20, x11; \ + mul x11, x4, x10; \ + adcs x21, x21, x11; \ + cset x22, cs; \ + umulh x11, x4, x5; \ + adds x16, x16, x11; \ + umulh x11, x4, x6; \ + adcs x17, x17, x11; \ + umulh x11, x4, x7; \ + adcs x19, x19, x11; \ + umulh x11, x4, x8; \ + adcs x20, x20, x11; \ + umulh x11, x4, x9; \ + adcs x21, x21, x11; \ + umulh x11, x4, x10; \ + adc x22, x22, x11; \ + ldp x3, x4, [P1+32]; \ + mul x11, x3, x5; \ + adds x16, x16, x11; \ + mul x11, x3, x6; \ + adcs x17, x17, x11; \ + mul x11, x3, x7; \ + adcs x19, x19, x11; \ + mul x11, x3, x8; \ + adcs x20, x20, x11; \ + mul x11, x3, x9; \ + adcs x21, x21, x11; \ + mul x11, x3, x10; \ + adcs x22, x22, x11; \ + cset x2, cs; \ + umulh x11, x3, x5; \ + adds x17, x17, x11; \ + umulh x11, x3, x6; \ + adcs x19, x19, x11; \ + umulh x11, x3, x7; \ + adcs x20, x20, x11; \ + umulh x11, x3, x8; \ + adcs x21, x21, x11; \ + umulh x11, x3, x9; \ + adcs x22, x22, x11; \ + umulh x11, x3, x10; \ + adc x2, x2, x11; \ + mul x11, x4, x5; \ + adds x17, x17, x11; \ + mul x11, x4, x6; \ + adcs x19, x19, x11; \ + mul x11, x4, x7; \ + adcs x20, x20, x11; \ + mul x11, x4, x8; \ + adcs x21, x21, x11; \ + mul x11, x4, x9; \ + adcs x22, x22, x11; \ + mul x11, x4, x10; \ + adcs x2, x2, x11; \ + cset x1, cs; \ + umulh x11, x4, x5; \ + adds x19, x19, x11; \ + umulh x11, x4, x6; \ + adcs x20, x20, x11; \ + umulh x11, x4, x7; \ + adcs x21, x21, x11; \ + umulh x11, x4, x8; \ + adcs x22, x22, x11; \ + umulh x11, x4, x9; \ + adcs x2, x2, x11; \ + umulh x11, x4, x10; \ + adc x1, x1, x11; \ + lsl x7, x12, #32; \ + add x12, x7, x12; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x12; \ + mov x6, #0xffffffff; \ + mul x5, x6, x12; \ + umulh x6, x6, x12; \ + adds x7, x7, x5; \ + adcs x6, x6, x12; \ + adc x5, xzr, xzr; \ + subs x13, x13, x7; \ + sbcs x14, x14, x6; \ + sbcs x15, x15, x5; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x12, x12, xzr; \ + lsl x7, x13, #32; \ + add x13, x7, x13; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x13; \ + mov x6, #0xffffffff; \ + mul x5, x6, x13; \ + umulh x6, x6, x13; \ + adds x7, x7, x5; \ + adcs x6, x6, x13; \ + adc x5, xzr, xzr; \ + subs x14, x14, x7; \ + sbcs x15, x15, x6; \ + sbcs x16, x16, x5; \ + sbcs x17, x17, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + lsl x7, x14, #32; \ + add x14, x7, x14; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x14; \ + mov x6, #0xffffffff; \ + mul x5, x6, x14; \ + umulh x6, x6, x14; \ + adds x7, x7, x5; \ + adcs x6, x6, x14; \ + adc x5, xzr, xzr; \ + subs x15, x15, x7; \ + sbcs x16, x16, x6; \ + sbcs x17, x17, x5; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x14, x14, xzr; \ + lsl x7, x15, #32; \ + add x15, x7, x15; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x15; \ + mov x6, #0xffffffff; \ + mul x5, x6, x15; \ + umulh x6, x6, x15; \ + adds x7, x7, x5; \ + adcs x6, x6, x15; \ + adc x5, xzr, xzr; \ + subs x16, x16, x7; \ + sbcs x17, x17, x6; \ + sbcs x12, x12, x5; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + lsl x7, x16, #32; \ + add x16, x7, x16; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x16; \ + mov x6, #0xffffffff; \ + mul x5, x6, x16; \ + umulh x6, x6, x16; \ + adds x7, x7, x5; \ + adcs x6, x6, x16; \ + adc x5, xzr, xzr; \ + subs x17, x17, x7; \ + sbcs x12, x12, x6; \ + sbcs x13, x13, x5; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x7, x17, #32; \ + add x17, x7, x17; \ + mov x7, 
#0xffffffff00000001; \ + umulh x7, x7, x17; \ + mov x6, #0xffffffff; \ + mul x5, x6, x17; \ + umulh x6, x6, x17; \ + adds x7, x7, x5; \ + adcs x6, x6, x17; \ + adc x5, xzr, xzr; \ + subs x12, x12, x7; \ + sbcs x13, x13, x6; \ + sbcs x14, x14, x5; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbc x17, x17, xzr; \ + adds x12, x12, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x2; \ + adcs x17, x17, x1; \ + adc x10, xzr, xzr; \ + mov x11, #0xffffffff00000001; \ + adds x19, x12, x11; \ + mov x11, #0xffffffff; \ + adcs x20, x13, x11; \ + mov x11, #0x1; \ + adcs x21, x14, x11; \ + adcs x22, x15, xzr; \ + adcs x2, x16, xzr; \ + adcs x1, x17, xzr; \ + adcs x10, x10, xzr; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + csel x14, x14, x21, eq; \ + csel x15, x15, x22, eq; \ + csel x16, x16, x2, eq; \ + csel x17, x17, x1, eq; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16]; \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + 
umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + adc x6, xzr, xzr; \ + mov x8, #-4294967295; \ + adds x14, x2, x8; \ + mov x8, #4294967295; \ + adcs x15, x9, x8; \ + mov x8, #1; \ + adcs x16, x10, x8; \ + adcs x17, x11, xzr; \ + adcs x19, x12, xzr; \ + adcs x20, x13, xzr; \ + adcs x6, x6, xzr; \ + csel x2, x2, x14, eq; \ + csel x9, x9, x15, eq; \ + csel x10, x10, x16, eq; \ + csel x11, x11, x17, eq; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + csetm x3, lo; \ + mov x4, #4294967295; \ + and x4, x4, x3; \ + adds x5, x5, x4; \ + eor x4, x4, x3; \ + adcs x6, x6, x4; \ + mov x4, #-2; \ + and x4, x4, x3; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + adcs x9, x9, x3; \ + adc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +// Corresponds exactly to bignum_add_p384 + +#define add_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + adds x5, x5, x4; \ + adcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + adcs x9, x9, x4; \ + adcs x10, x10, x3; \ + adc x3, xzr, xzr; \ + mov x4, #0xffffffff; \ + cmp x5, x4; \ + mov x4, #0xffffffff00000000; \ + sbcs xzr, x6, x4; \ + mov x4, #0xfffffffffffffffe; \ + sbcs xzr, x7, x4; 
\ + adcs xzr, x8, xzr; \ + adcs xzr, x9, xzr; \ + adcs xzr, x10, xzr; \ + adcs x3, x3, xzr; \ + csetm x3, ne; \ + mov x4, #0xffffffff; \ + and x4, x4, x3; \ + subs x5, x5, x4; \ + eor x4, x4, x3; \ + sbcs x6, x6, x4; \ + mov x4, #0xfffffffffffffffe; \ + and x4, x4, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + sbcs x9, x9, x3; \ + sbc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +// P0 = 4 * P1 - P2 + +#define cmsub41_p384(P0,P1,P2) \ + ldp x1, x2, [P1]; \ + ldp x3, x4, [P1+16]; \ + ldp x5, x6, [P1+32]; \ + lsl x0, x1, #2; \ + ldp x7, x8, [P2]; \ + subs x0, x0, x7; \ + extr x1, x2, x1, #62; \ + sbcs x1, x1, x8; \ + ldp x7, x8, [P2+16]; \ + extr x2, x3, x2, #62; \ + sbcs x2, x2, x7; \ + extr x3, x4, x3, #62; \ + sbcs x3, x3, x8; \ + extr x4, x5, x4, #62; \ + ldp x7, x8, [P2+32]; \ + sbcs x4, x4, x7; \ + extr x5, x6, x5, #62; \ + sbcs x5, x5, x8; \ + lsr x6, x6, #62; \ + adc x6, x6, xzr; \ + lsl x7, x6, #32; \ + subs x8, x6, x7; \ + sbc x7, x7, xzr; \ + adds x0, x0, x8; \ + adcs x1, x1, x7; \ + adcs x2, x2, x6; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + csetm x8, cc; \ + mov x9, #0xffffffff; \ + and x9, x9, x8; \ + adds x0, x0, x9; \ + eor x9, x9, x8; \ + adcs x1, x1, x9; \ + mov x9, #0xfffffffffffffffe; \ + and x9, x9, x8; \ + adcs x2, x2, x9; \ + adcs x3, x3, x8; \ + adcs x4, x4, x8; \ + adc x5, x5, x8; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16]; \ + stp x4, x5, [P0+32] + +// P0 = C * P1 - D * P2 + +#define cmsub_p384(P0,C,P1,D,P2) \ + ldp x0, x1, [P2]; \ + mov x6, #0x00000000ffffffff; \ + subs x6, x6, x0; \ + mov x7, #0xffffffff00000000; \ + sbcs x7, x7, x1; \ + ldp x0, x1, [P2+16]; \ + mov x8, #0xfffffffffffffffe; \ + sbcs x8, x8, x0; \ + mov x13, #0xffffffffffffffff; \ + sbcs x9, x13, x1; \ + ldp x0, x1, [P2+32]; \ + sbcs x10, x13, x0; \ + sbc x11, x13, x1; \ + mov x12, D; \ + mul x0, x12, x6; \ + mul x1, x12, x7; \ + mul x2, x12, x8; \ + mul x3, x12, x9; \ + mul x4, x12, x10; \ + mul x5, x12, x11; \ + umulh x6, x12, x6; \ + umulh x7, x12, x7; \ + umulh x8, x12, x8; \ + umulh x9, x12, x9; \ + umulh x10, x12, x10; \ + umulh x12, x12, x11; \ + adds x1, x1, x6; \ + adcs x2, x2, x7; \ + adcs x3, x3, x8; \ + adcs x4, x4, x9; \ + adcs x5, x5, x10; \ + mov x6, #1; \ + adc x6, x12, x6; \ + ldp x8, x9, [P1]; \ + ldp x10, x11, [P1+16]; \ + ldp x12, x13, [P1+32]; \ + mov x14, C; \ + mul x15, x14, x8; \ + umulh x8, x14, x8; \ + adds x0, x0, x15; \ + mul x15, x14, x9; \ + umulh x9, x14, x9; \ + adcs x1, x1, x15; \ + mul x15, x14, x10; \ + umulh x10, x14, x10; \ + adcs x2, x2, x15; \ + mul x15, x14, x11; \ + umulh x11, x14, x11; \ + adcs x3, x3, x15; \ + mul x15, x14, x12; \ + umulh x12, x14, x12; \ + adcs x4, x4, x15; \ + mul x15, x14, x13; \ + umulh x13, x14, x13; \ + adcs x5, x5, x15; \ + adc x6, x6, xzr; \ + adds x1, x1, x8; \ + adcs x2, x2, x9; \ + adcs x3, x3, x10; \ + adcs x4, x4, x11; \ + adcs x5, x5, x12; \ + adcs x6, x6, x13; \ + lsl x7, x6, #32; \ + subs x8, x6, x7; \ + sbc x7, x7, xzr; \ + adds x0, x0, x8; \ + adcs x1, x1, x7; \ + adcs x2, x2, x6; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + csetm x6, cc; \ + mov x7, #0xffffffff; \ + and x7, x7, x6; \ + adds x0, x0, x7; \ + eor x7, x7, x6; \ + adcs x1, x1, x7; \ + mov x7, #0xfffffffffffffffe; \ + and x7, x7, x6; \ + adcs x2, x2, x7; \ + adcs x3, x3, x6; \ + adcs x4, x4, x6; \ + adc x5, x5, x6; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16]; \ + stp x4, x5, [P0+32] + +// A weak version of add that only guarantees sum in 6 digits + +#define 
weakadd_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + adds x5, x5, x4; \ + adcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + adcs x9, x9, x4; \ + adcs x10, x10, x3; \ + csetm x3, cs; \ + mov x4, #0xffffffff; \ + and x4, x4, x3; \ + subs x5, x5, x4; \ + eor x4, x4, x3; \ + sbcs x6, x6, x4; \ + mov x4, #0xfffffffffffffffe; \ + and x4, x4, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + sbcs x9, x9, x3; \ + sbc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +// P0 = 3 * P1 - 8 * P2 + +#define cmsub38_p384(P0,P1,P2) \ + ldp x0, x1, [P2]; \ + mov x6, #0x00000000ffffffff; \ + subs x6, x6, x0; \ + mov x7, #0xffffffff00000000; \ + sbcs x7, x7, x1; \ + ldp x0, x1, [P2+16]; \ + mov x8, #0xfffffffffffffffe; \ + sbcs x8, x8, x0; \ + mov x13, #0xffffffffffffffff; \ + sbcs x9, x13, x1; \ + ldp x0, x1, [P2+32]; \ + sbcs x10, x13, x0; \ + sbc x11, x13, x1; \ + lsl x0, x6, #3; \ + extr x1, x7, x6, #61; \ + extr x2, x8, x7, #61; \ + extr x3, x9, x8, #61; \ + extr x4, x10, x9, #61; \ + extr x5, x11, x10, #61; \ + lsr x6, x11, #61; \ + add x6, x6, #1; \ + ldp x8, x9, [P1]; \ + ldp x10, x11, [P1+16]; \ + ldp x12, x13, [P1+32]; \ + mov x14, 3; \ + mul x15, x14, x8; \ + umulh x8, x14, x8; \ + adds x0, x0, x15; \ + mul x15, x14, x9; \ + umulh x9, x14, x9; \ + adcs x1, x1, x15; \ + mul x15, x14, x10; \ + umulh x10, x14, x10; \ + adcs x2, x2, x15; \ + mul x15, x14, x11; \ + umulh x11, x14, x11; \ + adcs x3, x3, x15; \ + mul x15, x14, x12; \ + umulh x12, x14, x12; \ + adcs x4, x4, x15; \ + mul x15, x14, x13; \ + umulh x13, x14, x13; \ + adcs x5, x5, x15; \ + adc x6, x6, xzr; \ + adds x1, x1, x8; \ + adcs x2, x2, x9; \ + adcs x3, x3, x10; \ + adcs x4, x4, x11; \ + adcs x5, x5, x12; \ + adcs x6, x6, x13; \ + lsl x7, x6, #32; \ + subs x8, x6, x7; \ + sbc x7, x7, xzr; \ + adds x0, x0, x8; \ + adcs x1, x1, x7; \ + adcs x2, x2, x6; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + csetm x6, cc; \ + mov x7, #0xffffffff; \ + and x7, x7, x6; \ + adds x0, x0, x7; \ + eor x7, x7, x6; \ + adcs x1, x1, x7; \ + mov x7, #0xfffffffffffffffe; \ + and x7, x7, x6; \ + adcs x2, x2, x7; \ + adcs x3, x3, x6; \ + adcs x4, x4, x6; \ + adc x5, x5, x6; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16]; \ + stp x4, x5, [P0+32] + +S2N_BN_SYMBOL(p384_montjdouble_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p384(z2,z_1) + montsqr_p384(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + weakadd_p384(t1,x_1,z2) + sub_p384(t2,x_1,z2) + montmul_p384(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p384(t1,y_1,z_1) + montsqr_p384(x4p,x2p) + montmul_p384(xy2,x_1,y2) + +// t2 = (y + z)^2 + + montsqr_p384(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p384(d,12,xy2,9,x4p) + sub_p384(t1,t2,z2) + +// y4 = y^4 + + montsqr_p384(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p384(z_3,t1,y2) + montmul_p384(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p384(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p384(y_3,dx2,y4) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p521/Makefile b/arm/p521/Makefile index ae0d4f8d70..7231f2ad9f 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -46,6 +46,7 @@ OBJ = bignum_add_p521.o \ bignum_triple_p521.o \ p521_jadd.o \ p521_jdouble.o \ + p521_jdouble_alt.o \ p521_jmixadd.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/arm/p521/p521_jdouble.S b/arm/p521/p521_jdouble.S index 100f6d3e87..55a701ba92 100644 --- a/arm/p521/p521_jdouble.S +++ b/arm/p521/p521_jdouble.S @@ -61,616 +61,1052 @@ #define NSPACE (NUMSIZE*7+8) -// Corresponds exactly to bignum_mul_p521_alt +// Corresponds exactly to bignum_mul_p521 except that the +// destination buffer P0 is used as a temporary storage, +// also swapping some load/store orders to avoid aliasing +// troubles; also x0 is used in place of x26. 
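For readers cross-checking the rewritten macro against its specification: bignum_mul_p521 computes a full product reduced modulo p_521 = 2^521 - 1 on 9-limb little-endian operands. The short Python sketch below captures only that reference semantics, not the limb-level schedule or register allocation used in the macro; the helper names and the fixed 9-limb layout are illustrative and are not part of s2n-bignum.

# Reference semantics for a mod-p_521 product on 9 little-endian 64-bit limbs.
# Names and limb layout here are illustrative only, not s2n-bignum APIs.
P521 = 2**521 - 1

def from_limbs(limbs):
    # little-endian 64-bit limbs -> integer
    return sum(l << (64 * i) for i, l in enumerate(limbs))

def to_limbs(n, count=9):
    # integer -> little-endian 64-bit limbs
    return [(n >> (64 * i)) & ((1 << 64) - 1) for i in range(count)]

def mul_p521_ref(a, b):
    # a, b: 9 limbs each, assumed fully reduced mod p_521
    return to_limbs((from_limbs(a) * from_limbs(b)) % P521)

For example, mul_p521_ref([3] + [0]*8, [5] + [0]*8) returns 15 in the lowest limb and zero elsewhere; the macro below is expected to agree with this function on all fully reduced inputs.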
#define mul_p521(P0,P1,P2) \ ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ + ldp x5, x6, [P1+16]; \ + ldp x7, x8, [P2]; \ + ldp x9, x10, [P2+16]; \ + mul x11, x3, x7; \ + mul x15, x4, x8; \ + mul x16, x5, x9; \ + mul x17, x6, x10; \ umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ + adds x15, x15, x19; \ + umulh x19, x4, x8; \ + adcs x16, x16, x19; \ + umulh x19, x5, x9; \ + adcs x17, x17, x19; \ + umulh x19, x6, x10; \ + adc x19, x19, xzr; \ + adds x12, x15, x11; \ + adcs x15, x16, x15; \ + adcs x16, x17, x16; \ + adcs x17, x19, x17; \ + adc x19, xzr, x19; \ + adds x13, x15, x11; \ + adcs x14, x16, x12; \ + adcs x15, x17, x15; \ + adcs x16, x19, x16; \ + adcs x17, xzr, x17; \ + adc x19, xzr, x19; \ + subs x24, x5, x6; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x16, x16, x23; \ + eor x22, x22, x21; \ + adcs x17, x17, x22; \ + adc x19, x19, x21; \ + subs x24, x3, x4; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x8, x7; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x12, x12, x23; \ + eor x22, x22, x21; \ + adcs x13, x13, x22; \ + adcs x14, x14, x21; \ + adcs x15, x15, x21; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x4, x6; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x10, x8; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x15, x15, x23; \ + eor x22, x22, x21; \ + adcs x16, x16, x22; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x3, x5; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x9, x7; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x13, x13, x23; \ + eor x22, x22, x21; \ + adcs 
x14, x14, x22; \ + adcs x15, x15, x21; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x3, x6; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x10, x7; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x14, x14, x23; \ + eor x22, x22, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x4, x5; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x9, x8; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x14, x14, x23; \ + eor x22, x22, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + lsl x21, x11, #9; \ + extr x11, x12, x11, #55; \ + extr x12, x13, x12, #55; \ + extr x13, x14, x13, #55; \ + lsr x14, x14, #55; \ + ldp x3, x4, [P1+32]; \ + ldp x5, x6, [P1+48]; \ + ldp x7, x8, [P2+32]; \ + ldp x9, x10, [P2+48]; \ stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, 
x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ + stp x21, x11, [P0+32]; \ + stp x12, x13, [P0+48]; \ + str x14, [P0+64]; \ + mul x11, x3, x7; \ + mul x15, x4, x8; \ + mul x16, x5, x9; \ + mul x17, x6, x10; \ + umulh x19, x3, x7; \ + adds x15, x15, x19; \ + umulh x19, x4, x8; \ + adcs x16, x16, x19; \ + umulh x19, x5, x9; \ + adcs x17, x17, x19; \ + umulh x19, x6, x10; \ + adc x19, x19, xzr; \ + adds x12, x15, x11; \ + adcs x15, x16, x15; \ + adcs x16, x17, x16; \ + adcs x17, x19, x17; \ + adc x19, xzr, x19; \ + adds x13, x15, x11; \ + adcs x14, x16, x12; \ + adcs x15, x17, x15; \ + adcs x16, x19, x16; \ + adcs x17, xzr, x17; \ + adc x19, xzr, x19; \ + subs x24, x5, x6; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x16, x16, x23; \ + eor x22, x22, x21; \ + adcs x17, x17, x22; \ + adc x19, x19, x21; \ + subs x24, x3, x4; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x8, x7; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x12, x12, x23; \ + eor x22, x22, x21; \ + adcs x13, x13, x22; \ + adcs x14, x14, x21; \ + adcs x15, x15, x21; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x4, x6; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x10, x8; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x15, x15, x23; \ + eor x22, x22, x21; \ + adcs x16, x16, x22; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x3, x5; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x9, x7; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x13, x13, x23; \ + eor x22, x22, x21; \ + adcs x14, x14, x22; \ + adcs x15, x15, x21; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x3, x6; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x10, x7; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x14, x14, x23; \ + eor x22, x22, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x4, x5; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x9, x8; \ + cneg x22, x22, lo; \ 
+ mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x14, x14, x23; \ + eor x22, x22, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + ldp x23, x22, [P0]; \ + adds x11, x11, x23; \ + adcs x12, x12, x22; \ + stp x11, x12, [P0]; \ + ldp x23, x22, [P0+16]; \ + adcs x13, x13, x23; \ + adcs x14, x14, x22; \ + stp x13, x14, [P0+16]; \ + ldp x23, x22, [P0+32]; \ + adcs x15, x15, x23; \ + adcs x16, x16, x22; \ + stp x15, x16, [P0+32]; \ + ldp x23, x22, [P0+48]; \ + adcs x17, x17, x23; \ + adcs x19, x19, x22; \ + stp x17, x19, [P0+48]; \ + ldr x21, [P0+64]; \ + adc x21, x21, xzr; \ + str x21, [P0+64]; \ + ldp x23, x22, [P1]; \ + subs x3, x3, x23; \ + sbcs x4, x4, x22; \ + ldp x23, x22, [P1+16]; \ + sbcs x5, x5, x23; \ + sbcs x6, x6, x22; \ + csetm x24, lo; \ + ldp x23, x22, [P2]; \ + subs x7, x23, x7; \ + sbcs x8, x22, x8; \ + ldp x23, x22, [P2+16]; \ + sbcs x9, x23, x9; \ + sbcs x10, x22, x10; \ + csetm x25, lo; \ + eor x3, x3, x24; \ + subs x3, x3, x24; \ + eor x4, x4, x24; \ + sbcs x4, x4, x24; \ + eor x5, x5, x24; \ + sbcs x5, x5, x24; \ + eor x6, x6, x24; \ + sbc x6, x6, x24; \ + eor x7, x7, x25; \ + subs x7, x7, x25; \ + eor x8, x8, x25; \ + sbcs x8, x8, x25; \ + eor x9, x9, x25; \ + sbcs x9, x9, x25; \ + eor x10, x10, x25; \ + sbc x10, x10, x25; \ + eor x25, x25, x24; \ + mul x11, x3, x7; \ + mul x15, x4, x8; \ + mul x16, x5, x9; \ + mul x17, x6, x10; \ + umulh x19, x3, x7; \ + adds x15, x15, x19; \ + umulh x19, x4, x8; \ + adcs x16, x16, x19; \ + umulh x19, x5, x9; \ + adcs x17, x17, x19; \ + umulh x19, x6, x10; \ + adc x19, x19, xzr; \ + adds x12, x15, x11; \ + adcs x15, x16, x15; \ + adcs x16, x17, x16; \ + adcs x17, x19, x17; \ + adc x19, xzr, x19; \ + adds x13, x15, x11; \ + adcs x14, x16, x12; \ + adcs x15, x17, x15; \ + adcs x16, x19, x16; \ + adcs x17, xzr, x17; \ + adc x19, xzr, x19; \ + subs x24, x5, x6; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x16, x16, x23; \ + eor x22, x22, x21; \ + adcs x17, x17, x22; \ + adc x19, x19, x21; \ + subs x24, x3, x4; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x8, x7; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x12, x12, x23; \ + eor x22, x22, x21; \ + adcs x13, x13, x22; \ + adcs x14, x14, x21; \ + adcs x15, x15, x21; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x4, x6; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x10, x8; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x15, x15, x23; \ + eor x22, x22, x21; \ + adcs x16, x16, x22; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x3, x5; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x9, x7; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x13, x13, x23; \ + eor x22, x22, x21; \ + adcs x14, x14, x22; \ + adcs x15, x15, x21; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x3, x6; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x10, x7; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + 
cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x14, x14, x23; \ + eor x22, x22, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + subs x24, x4, x5; \ + cneg x24, x24, lo; \ + csetm x21, lo; \ + subs x22, x9, x8; \ + cneg x22, x22, lo; \ + mul x23, x24, x22; \ + umulh x22, x24, x22; \ + cinv x21, x21, lo; \ + cmn x21, #1; \ + eor x23, x23, x21; \ + adcs x14, x14, x23; \ + eor x22, x22, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x21; \ + adcs x17, x17, x21; \ + adc x19, x19, x21; \ + ldp x3, x4, [P0]; \ + ldp x5, x6, [P0+16]; \ + eor x11, x11, x25; \ + adds x11, x11, x3; \ + eor x12, x12, x25; \ + adcs x12, x12, x4; \ + eor x13, x13, x25; \ + adcs x13, x13, x5; \ + eor x14, x14, x25; \ + adcs x14, x14, x6; \ + eor x15, x15, x25; \ + ldp x7, x8, [P0+32]; \ + ldp x9, x10, [P0+48]; \ + ldr x20, [P0+64]; \ + adcs x15, x15, x7; \ + eor x16, x16, x25; \ + adcs x16, x16, x8; \ + eor x17, x17, x25; \ + adcs x17, x17, x9; \ + eor x19, x19, x25; \ + adcs x19, x19, x10; \ + adc x21, x20, xzr; \ + adds x15, x15, x3; \ + adcs x16, x16, x4; \ + adcs x17, x17, x5; \ + adcs x19, x19, x6; \ + and x25, x25, #0x1ff; \ + lsl x24, x11, #9; \ + orr x24, x24, x25; \ + adcs x7, x7, x24; \ + extr x24, x12, x11, #55; \ + adcs x8, x8, x24; \ + extr x24, x13, x12, #55; \ + adcs x9, x9, x24; \ + extr x24, x14, x13, #55; \ + adcs x10, x10, x24; \ + lsr x24, x14, #55; \ + adc x20, x24, x20; \ + ldr x6, [P2+64]; \ + ldp x3, x4, [P1]; \ + and x23, x3, #0xfffffffffffff; \ + mul x23, x6, x23; \ + ldr x14, [P1+64]; \ + ldp x11, x12, [P2]; \ + and x24, x11, #0xfffffffffffff; \ + mul x24, x14, x24; \ + add x23, x23, x24; \ + extr x24, x4, x3, #52; \ + and x24, x24, #0xfffffffffffff; \ + mul x22, x6, x24; \ + extr x24, x12, x11, #52; \ + and x24, x24, #0xfffffffffffff; \ + mul x24, x14, x24; \ + add x22, x22, x24; \ + lsr x24, x23, #52; \ + add x22, x22, x24; \ + lsl x23, x23, #12; \ + extr x24, x22, x23, #12; \ + adds x15, x15, x24; \ + ldp x5, x3, [P1+16]; \ + ldp x13, x11, [P2+16]; \ + extr x24, x5, x4, #40; \ + and x24, x24, #0xfffffffffffff; \ + mul x23, x6, x24; \ + extr x24, x13, x12, #40; \ + and x24, x24, #0xfffffffffffff; \ + mul x24, x14, x24; \ + add x23, x23, x24; \ + lsr x24, x22, #52; \ + add x23, x23, x24; \ + lsl x22, x22, #12; \ + extr x24, x23, x22, #24; \ + adcs x16, x16, x24; \ + extr x24, x3, x5, #28; \ + and x24, x24, #0xfffffffffffff; \ + mul x22, x6, x24; \ + extr x24, x11, x13, #28; \ + and x24, x24, #0xfffffffffffff; \ + mul x24, x14, x24; \ + add x22, x22, x24; \ + lsr x24, x23, #52; \ + add x22, x22, x24; \ + lsl x23, x23, #12; \ + extr x24, x22, x23, #36; \ + adcs x17, x17, x24; \ + and x0, x16, x17; \ + ldp x4, x5, [P1+32]; \ + ldp x12, x13, [P2+32]; \ + extr x24, x4, x3, #16; \ + and x24, x24, #0xfffffffffffff; \ + mul x23, x6, x24; \ + extr x24, x12, x11, #16; \ + and x24, x24, #0xfffffffffffff; \ + mul x24, x14, x24; \ + add x23, x23, x24; \ + lsl x21, x21, #48; \ + add x23, x23, x21; \ + lsr x24, x22, #52; \ + add x23, x23, x24; \ + lsl x22, x22, #12; \ + extr x24, x23, x22, #48; \ + adcs x19, x19, x24; \ + and x0, x0, x19; \ + lsr x24, x4, #4; \ + and x24, x24, #0xfffffffffffff; \ + mul x22, x6, x24; \ + lsr x24, x12, #4; \ + and x24, x24, #0xfffffffffffff; \ + mul x24, x14, x24; \ + add x22, x22, x24; \ + lsr x24, x23, #52; \ + add x22, x22, x24; \ + lsl x23, x23, #12; \ + extr x25, x22, x23, #60; \ + extr x24, x5, x4, #56; \ + and x24, x24, #0xfffffffffffff; \ + mul x23, x6, x24; \ + extr x24, x13, x12, #56; \ + 
and x24, x24, #0xfffffffffffff; \ + mul x24, x14, x24; \ + add x23, x23, x24; \ + lsr x24, x22, #52; \ + add x23, x23, x24; \ + lsl x25, x25, #8; \ + extr x24, x23, x25, #8; \ + adcs x7, x7, x24; \ + and x0, x0, x7; \ ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + extr x24, x3, x5, #44; \ + and x24, x24, #0xfffffffffffff; \ + mul x22, x6, x24; \ + extr x24, x11, x13, #44; \ + and x24, x24, #0xfffffffffffff; \ + mul x24, x14, x24; \ + add x22, x22, x24; \ + lsr x24, x23, #52; \ + add x22, x22, x24; \ + lsl x23, x23, #12; \ + extr x24, x22, x23, #20; \ + adcs x8, x8, x24; \ + and x0, x0, x8; \ + extr x24, x4, x3, #32; \ + and x24, x24, #0xfffffffffffff; \ + mul x23, x6, x24; \ + extr x24, x12, x11, #32; \ + and x24, x24, #0xfffffffffffff; \ + mul x24, x14, x24; \ + add x23, x23, x24; \ + lsr x24, x22, #52; \ + add x23, x23, x24; \ + lsl x22, x22, #12; \ + extr x24, x23, x22, #32; \ + adcs x9, x9, x24; \ + and x0, x0, x9; \ + lsr x24, x4, #20; \ + mul x22, x6, x24; \ + lsr x24, x12, #20; \ + mul x24, x14, x24; \ + add 
x22, x22, x24; \ + lsr x24, x23, #52; \ + add x22, x22, x24; \ + lsl x23, x23, #12; \ + extr x24, x22, x23, #44; \ + adcs x10, x10, x24; \ + and x0, x0, x10; \ + mul x24, x6, x14; \ + lsr x22, x22, #44; \ + add x24, x24, x22; \ + adc x20, x20, x24; \ + lsr x22, x20, #9; \ + orr x20, x20, #0xfffffffffffffe00; \ cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] + adcs xzr, x15, x22; \ + adcs xzr, x0, xzr; \ + adcs xzr, x20, xzr; \ + adcs x15, x15, x22; \ + adcs x16, x16, xzr; \ + adcs x17, x17, xzr; \ + adcs x19, x19, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x20, x20, xzr; \ + and x22, x15, #0x1ff; \ + extr x15, x16, x15, #9; \ + extr x16, x17, x16, #9; \ + stp x15, x16, [P0]; \ + extr x17, x19, x17, #9; \ + extr x19, x7, x19, #9; \ + stp x17, x19, [P0+16]; \ + extr x7, x8, x7, #9; \ + extr x8, x9, x8, #9; \ + stp x7, x8, [P0+32]; \ + extr x9, x10, x9, #9; \ + extr x10, x20, x10, #9; \ + stp x9, x10, [P0+48]; \ + str x22, [P0+64] -// Corresponds exactly to bignum_sqr_p521_alt +// Corresponds exactly to bignum_sqr_p521 #define sqr_p521(P0,P1) \ ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, 
x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ + mul x12, x6, x8; \ + mul x17, x7, x9; \ + umulh x22, x6, x8; \ + subs x23, x6, x7; \ + cneg x23, x23, lo; \ + csetm x11, lo; \ + subs x10, x9, x8; \ + cneg x10, x10, lo; \ + mul x16, x23, x10; \ + umulh x10, x23, x10; \ + cinv x11, x11, lo; \ + eor x16, x16, x11; \ + eor x10, x10, x11; \ + adds x13, x12, x22; \ + adc x22, x22, xzr; \ + umulh x23, x7, x9; \ + adds x13, x13, x17; \ + adcs x22, x22, x23; \ adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ + adds x22, x22, x17; \ + adc x23, x23, xzr; \ + cmn x11, #1; \ + adcs x13, x13, x16; \ adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ + adc x23, x23, x11; \ + adds x12, x12, x12; \ adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ adcs x22, x22, x22; \ adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ + adc x19, xzr, xzr; \ mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ + mul x16, x7, x7; \ + mul x21, x6, x7; \ + umulh x11, x6, x6; \ + umulh x17, x7, x7; \ + umulh x20, x6, x7; \ + adds x11, x11, x21; \ + adcs x16, x16, x20; \ + adc x17, x17, xzr; \ + adds x11, x11, x21; \ + adcs x16, x16, x20; \ + adc x17, x17, xzr; \ + adds x12, x12, x16; \ + adcs x13, x13, x17; \ + adcs x22, x22, xzr; \ + adcs x23, x23, xzr; \ + adc x19, 
x19, xzr; \ + mul x14, x8, x8; \ + mul x16, x9, x9; \ + mul x21, x8, x9; \ + umulh x15, x8, x8; \ + umulh x17, x9, x9; \ + umulh x20, x8, x9; \ + adds x15, x15, x21; \ + adcs x16, x16, x20; \ + adc x17, x17, xzr; \ + adds x15, x15, x21; \ + adcs x16, x16, x20; \ + adc x17, x17, xzr; \ + adds x14, x14, x22; \ + adcs x15, x15, x23; \ + adcs x16, x16, x19; \ + adc x17, x17, xzr; \ + ldr x19, [P1+64]; \ + add x23, x19, x19; \ + mul x19, x19, x19; \ + and x21, x2, #0xfffffffffffff; \ + mul x21, x23, x21; \ + extr x20, x3, x2, #52; \ + and x20, x20, #0xfffffffffffff; \ + mul x20, x23, x20; \ + lsr x22, x21, #52; \ + add x20, x20, x22; \ + lsl x21, x21, #12; \ + extr x22, x20, x21, #12; \ + adds x10, x10, x22; \ + extr x21, x4, x3, #40; \ + and x21, x21, #0xfffffffffffff; \ + mul x21, x23, x21; \ + lsr x22, x20, #52; \ + add x21, x21, x22; \ + lsl x20, x20, #12; \ + extr x22, x21, x20, #24; \ + adcs x11, x11, x22; \ + extr x20, x5, x4, #28; \ + and x20, x20, #0xfffffffffffff; \ + mul x20, x23, x20; \ + lsr x22, x21, #52; \ + add x20, x20, x22; \ + lsl x21, x21, #12; \ + extr x22, x20, x21, #36; \ + adcs x12, x12, x22; \ + extr x21, x6, x5, #16; \ + and x21, x21, #0xfffffffffffff; \ + mul x21, x23, x21; \ + lsr x22, x20, #52; \ + add x21, x21, x22; \ + lsl x20, x20, #12; \ + extr x22, x21, x20, #48; \ + adcs x13, x13, x22; \ + lsr x20, x6, #4; \ + and x20, x20, #0xfffffffffffff; \ + mul x20, x23, x20; \ + lsr x22, x21, #52; \ + add x20, x20, x22; \ + lsl x21, x21, #12; \ + extr x24, x20, x21, #60; \ + extr x21, x7, x6, #56; \ + and x21, x21, #0xfffffffffffff; \ + mul x21, x23, x21; \ + lsr x22, x20, #52; \ + add x21, x21, x22; \ + lsl x24, x24, #8; \ + extr x22, x21, x24, #8; \ + adcs x14, x14, x22; \ + extr x20, x8, x7, #44; \ + and x20, x20, #0xfffffffffffff; \ + mul x20, x23, x20; \ + lsr x22, x21, #52; \ + add x20, x20, x22; \ + lsl x21, x21, #12; \ + extr x22, x20, x21, #20; \ + adcs x15, x15, x22; \ + extr x21, x9, x8, #32; \ + and x21, x21, #0xfffffffffffff; \ + mul x21, x23, x21; \ + lsr x22, x20, #52; \ + add x21, x21, x22; \ + lsl x20, x20, #12; \ + extr x22, x21, x20, #32; \ + adcs x16, x16, x22; \ + lsr x20, x9, #20; \ + mul x20, x23, x20; \ + lsr x22, x21, #52; \ + add x20, x20, x22; \ + lsl x21, x21, #12; \ + extr x22, x20, x21, #44; \ + adcs x17, x17, x22; \ + lsr x20, x20, #44; \ + adc x19, x19, x20; \ + extr x21, x11, x10, #9; \ + extr x20, x12, x11, #9; \ + stp x21, x20, [P0]; \ + extr x21, x13, x12, #9; \ + extr x20, x14, x13, #9; \ + stp x21, x20, [P0+16]; \ + extr x21, x15, x14, #9; \ + extr x20, x16, x15, #9; \ + stp x21, x20, [P0+32]; \ + extr x21, x17, x16, #9; \ + extr x20, x19, x17, #9; \ + stp x21, x20, [P0+48]; \ + and x22, x10, #0x1ff; \ + lsr x19, x19, #9; \ + add x22, x22, x19; \ + str x22, [P0+64]; \ + mul x12, x2, x4; \ + mul x17, x3, x5; \ + umulh x22, x2, x4; \ + subs x23, x2, x3; \ + cneg x23, x23, lo; \ + csetm x11, lo; \ + subs x10, x5, x4; \ + cneg x10, x10, lo; \ + mul x16, x23, x10; \ + umulh x10, x23, x10; \ + cinv x11, x11, lo; \ + eor x16, x16, x11; \ + eor x10, x10, x11; \ + adds x13, x12, x22; \ + adc x22, x22, xzr; \ + umulh x23, x3, x5; \ + adds x13, x13, x17; \ + adcs x22, x22, x23; \ + adc x23, x23, xzr; \ + adds x22, x22, x17; \ + adc x23, x23, xzr; \ + cmn x11, #1; \ + adcs x13, x13, x16; \ adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, 
x2, x2; \ + adc x23, x23, x11; \ + adds x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adc x19, xzr, xzr; \ + mul x10, x2, x2; \ + mul x16, x3, x3; \ + mul x21, x2, x3; \ + umulh x11, x2, x2; \ + umulh x17, x3, x3; \ + umulh x20, x2, x3; \ + adds x11, x11, x21; \ + adcs x16, x16, x20; \ + adc x17, x17, xzr; \ + adds x11, x11, x21; \ + adcs x16, x16, x20; \ + adc x17, x17, xzr; \ + adds x12, x12, x16; \ + adcs x13, x13, x17; \ + adcs x22, x22, xzr; \ + adcs x23, x23, xzr; \ + adc x19, x19, xzr; \ + mul x14, x4, x4; \ + mul x16, x5, x5; \ + mul x21, x4, x5; \ + umulh x15, x4, x4; \ + umulh x17, x5, x5; \ + umulh x20, x4, x5; \ + adds x15, x15, x21; \ + adcs x16, x16, x20; \ + adc x17, x17, xzr; \ + adds x15, x15, x21; \ + adcs x16, x16, x20; \ + adc x17, x17, xzr; \ + adds x14, x14, x22; \ + adcs x15, x15, x23; \ + adcs x16, x16, x19; \ + adc x17, x17, xzr; \ + ldp x21, x20, [P0]; \ + adds x21, x21, x10; \ + adcs x20, x20, x11; \ + stp x21, x20, [P0]; \ + ldp x21, x20, [P0+16]; \ + adcs x21, x21, x12; \ + adcs x20, x20, x13; \ + stp x21, x20, [P0+16]; \ + ldp x21, x20, [P0+32]; \ + adcs x21, x21, x14; \ + adcs x20, x20, x15; \ + stp x21, x20, [P0+32]; \ + ldp x21, x20, [P0+48]; \ + adcs x21, x21, x16; \ + adcs x20, x20, x17; \ + stp x21, x20, [P0+48]; \ + ldr x22, [P0+64]; \ + adc x22, x22, xzr; \ + str x22, [P0+64]; \ + mul x10, x2, x6; \ + mul x14, x3, x7; \ + mul x15, x4, x8; \ + mul x16, x5, x9; \ + umulh x17, x2, x6; \ + adds x14, x14, x17; \ + umulh x17, x3, x7; \ + adcs x15, x15, x17; \ + umulh x17, x4, x8; \ + adcs x16, x16, x17; \ + umulh x17, x5, x9; \ + adc x17, x17, xzr; \ + adds x11, x14, x10; \ + adcs x14, x15, x14; \ + adcs x15, x16, x15; \ + adcs x16, x17, x16; \ + adc x17, xzr, x17; \ + adds x12, x14, x10; \ + adcs x13, x15, x11; \ + adcs x14, x16, x14; \ + adcs x15, x17, x15; \ + adcs x16, xzr, x16; \ + adc x17, xzr, x17; \ + subs x22, x4, x5; \ + cneg x22, x22, lo; \ + csetm x19, lo; \ + subs x20, x9, x8; \ + cneg x20, x20, lo; \ + mul x21, x22, x20; \ + umulh x20, x22, x20; \ + cinv x19, x19, lo; \ + cmn x19, #1; \ + eor x21, x21, x19; \ + adcs x15, x15, x21; \ + eor x20, x20, x19; \ + adcs x16, x16, x20; \ + adc x17, x17, x19; \ + subs x22, x2, x3; \ + cneg x22, x22, lo; \ + csetm x19, lo; \ + subs x20, x7, x6; \ + cneg x20, x20, lo; \ + mul x21, x22, x20; \ + umulh x20, x22, x20; \ + cinv x19, x19, lo; \ + cmn x19, #1; \ + eor x21, x21, x19; \ + adcs x11, x11, x21; \ + eor x20, x20, x19; \ + adcs x12, x12, x20; \ + adcs x13, x13, x19; \ + adcs x14, x14, x19; \ + adcs x15, x15, x19; \ + adcs x16, x16, x19; \ + adc x17, x17, x19; \ + subs x22, x3, x5; \ + cneg x22, x22, lo; \ + csetm x19, lo; \ + subs x20, x9, x7; \ + cneg x20, x20, lo; \ + mul x21, x22, x20; \ + umulh x20, x22, x20; \ + cinv x19, x19, lo; \ + cmn x19, #1; \ + eor x21, x21, x19; \ + adcs x14, x14, x21; \ + eor x20, x20, x19; \ + adcs x15, x15, x20; \ + adcs x16, x16, x19; \ + adc x17, x17, x19; \ + subs x22, x2, x4; \ + cneg x22, x22, lo; \ + csetm x19, lo; \ + subs x20, x8, x6; \ + cneg x20, x20, lo; \ + mul x21, x22, x20; \ + umulh x20, x22, x20; \ + cinv x19, x19, lo; \ + cmn x19, #1; \ + eor x21, x21, x19; \ + adcs x12, x12, x21; \ + eor x20, x20, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x19; \ + adcs x15, x15, x19; \ + adcs x16, x16, x19; \ + adc x17, x17, x19; \ + subs x22, x2, x5; \ + cneg x22, x22, lo; \ + csetm x19, lo; \ + subs x20, x9, x6; \ + cneg x20, x20, lo; \ + mul x21, x22, x20; \ + umulh x20, x22, x20; \ + cinv x19, x19, lo; \ + cmn x19, 
#1; \ + eor x21, x21, x19; \ + adcs x13, x13, x21; \ + eor x20, x20, x19; \ + adcs x14, x14, x20; \ + adcs x15, x15, x19; \ + adcs x16, x16, x19; \ + adc x17, x17, x19; \ + subs x22, x3, x4; \ + cneg x22, x22, lo; \ + csetm x19, lo; \ + subs x20, x8, x7; \ + cneg x20, x20, lo; \ + mul x21, x22, x20; \ + umulh x20, x22, x20; \ + cinv x19, x19, lo; \ + cmn x19, #1; \ + eor x21, x21, x19; \ + adcs x13, x13, x21; \ + eor x20, x20, x19; \ + adcs x14, x14, x20; \ + adcs x15, x15, x19; \ + adcs x16, x16, x19; \ + adc x17, x17, x19; \ + ldp x21, x20, [P0]; \ + extr x2, x15, x14, #8; \ + adds x2, x2, x21; \ + extr x3, x16, x15, #8; \ + adcs x3, x3, x20; \ + ldp x21, x20, [P0+16]; \ + extr x4, x17, x16, #8; \ + adcs x4, x4, x21; \ + and x22, x3, x4; \ + lsr x5, x17, #8; \ + adcs x5, x5, x20; \ + and x22, x22, x5; \ + ldp x21, x20, [P0+32]; \ + lsl x6, x10, #1; \ + adcs x6, x6, x21; \ + and x22, x22, x6; \ + extr x7, x11, x10, #63; \ + adcs x7, x7, x20; \ + and x22, x22, x7; \ + ldp x21, x20, [P0+48]; \ + extr x8, x12, x11, #63; \ + adcs x8, x8, x21; \ + and x22, x22, x8; \ + extr x9, x13, x12, #63; \ + adcs x9, x9, x20; \ + and x22, x22, x9; \ + ldr x21, [P0+64]; \ + extr x10, x14, x13, #63; \ + and x10, x10, #0x1ff; \ + adc x10, x21, x10; \ + lsr x20, x10, #9; \ + orr x10, x10, #0xfffffffffffffe00; \ cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] + adcs xzr, x2, x20; \ + adcs xzr, x22, xzr; \ + adcs xzr, x10, xzr; \ + adcs x2, x2, x20; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adc x10, x10, xzr; \ + and x10, x10, #0x1ff; \ + stp x2, x3, [P0]; \ + stp x4, x5, [P0+16]; \ + stp x6, x7, [P0+32]; \ + stp x8, x9, [P0+48]; \ + str x10, [P0+64] // Corresponds exactly to bignum_add_p521 @@ -751,372 +1187,6 @@ stp x11, x12, [P0+48]; \ str x13, [P0+64] -// Weak multiplication not fully reducing - -#define weakmul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, 
x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, 
x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adds x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - 
adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - and x13, x24, #0x1ff; \ - lsr x14, x21, #9; \ - adc x13, x13, x14; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - // P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) #define cmsub_p521(P0,C,P1,D,P2) \ @@ -1411,7 +1481,7 @@ S2N_BN_SYMBOL(p521_jdouble): add_p521(t1,y_1,z_1) sqr_p521(x4p,x2p) - weakmul_p521(xy2,x_1,y2) + mul_p521(xy2,x_1,y2) // t2 = (y + z)^2 @@ -1431,7 +1501,7 @@ S2N_BN_SYMBOL(p521_jdouble): // dx2 = d * x2p sub_p521(z_3,t1,y2) - weakmul_p521(dx2,d,x2p) + mul_p521(dx2,d,x2p) // x' = 4 * xy2 - d diff --git a/arm/p521/p521_jdouble_alt.S b/arm/p521/p521_jdouble_alt.S new file mode 100644 index 0000000000..fa61dcf8d9 --- /dev/null +++ b/arm/p521/p521_jdouble_alt.S @@ -0,0 +1,1458 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jdouble_alt +// (uint64_t p3[static 27],uint64_t p1[static 27]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_521 and that the z coordinate is not zero. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs 
x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, 
x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + cmp xzr, xzr; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adcs x5, x5, x14; \ + extr x14, x0, 
x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + orr x13, x24, #0xfffffffffffffe00; \ + lsr x14, x21, #9; \ + adcs x13, x13, x14; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x11, x2, x3; \ + umulh x12, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x10, x2, x4; \ + umulh x13, x2, x4; \ + adds x12, x12, x10; \ + ldp x6, x7, [P1+32]; \ + mul x10, x2, x5; \ + umulh x14, x2, x5; \ + adcs x13, x13, x10; \ + ldp x8, x9, [P1+48]; \ + mul x10, x2, x6; \ + umulh x15, x2, x6; \ + adcs x14, x14, x10; \ + mul x10, x2, x7; \ + umulh x16, x2, x7; \ + adcs x15, x15, x10; \ + mul x10, x2, x8; \ + umulh x17, x2, x8; \ + adcs x16, x16, x10; \ + mul x10, x2, x9; \ + umulh x19, x2, x9; \ + adcs x17, x17, x10; \ + adc x19, x19, xzr; \ + mul x10, x3, x4; \ + adds x13, x13, x10; \ + mul x10, x3, x5; \ + adcs x14, x14, x10; \ + mul x10, x3, x6; \ + adcs x15, x15, x10; \ + mul x10, x3, x7; \ + adcs x16, x16, x10; \ + mul x10, x3, x8; \ + adcs x17, x17, x10; \ + mul x10, x3, x9; \ + adcs x19, x19, x10; \ + cset x20, hs; \ + umulh x10, x3, x4; \ + adds x14, x14, x10; \ + umulh x10, x3, x5; \ + adcs x15, x15, x10; \ + umulh x10, x3, x6; \ + adcs x16, x16, x10; \ + umulh x10, x3, x7; \ + adcs x17, x17, x10; \ + umulh x10, x3, x8; \ + adcs x19, x19, x10; \ + umulh x10, x3, x9; \ + adc x20, x20, x10; \ + mul x10, x6, x7; \ + umulh x21, x6, x7; \ + adds x20, x20, x10; \ + adc x21, x21, xzr; \ + mul x10, x4, x5; \ + adds x15, x15, x10; \ + mul x10, x4, x6; \ + adcs x16, x16, x10; \ + mul x10, x4, x7; \ + adcs x17, x17, x10; \ + mul x10, x4, x8; \ + adcs x19, x19, x10; \ + mul x10, x4, x9; \ + adcs x20, x20, x10; \ + mul x10, x6, x8; \ + adcs x21, x21, x10; \ + cset x22, hs; \ + umulh x10, x4, x5; \ + adds x16, x16, x10; \ + umulh x10, x4, x6; \ + adcs x17, x17, x10; \ + umulh x10, x4, x7; \ + adcs x19, x19, x10; \ + umulh x10, x4, x8; \ + adcs x20, x20, x10; \ + umulh x10, x4, x9; \ + adcs x21, x21, x10; \ + umulh x10, x6, x8; \ + adc x22, x22, x10; \ + mul x10, x7, x8; \ + umulh x23, x7, x8; \ + adds x22, x22, x10; \ + adc x23, x23, xzr; \ + mul x10, x5, x6; \ + adds x17, x17, x10; \ + mul x10, x5, x7; \ + adcs x19, x19, x10; \ + mul x10, x5, x8; \ + adcs x20, x20, x10; \ + mul x10, x5, x9; \ + adcs x21, x21, x10; \ + mul x10, x6, x9; \ + adcs x22, x22, x10; \ + mul x10, x7, x9; \ + adcs x23, x23, x10; \ + cset x24, hs; \ + umulh x10, x5, x6; \ + adds x19, x19, x10; \ + umulh x10, x5, x7; \ + adcs x20, x20, x10; \ + umulh x10, x5, x8; \ + adcs x21, x21, x10; \ + umulh x10, x5, x9; \ + adcs x22, x22, x10; \ + umulh x10, x6, x9; \ + adcs x23, x23, x10; \ + umulh x10, x7, x9; \ + adc x24, x24, x10; \ + mul x10, x8, x9; \ + umulh x25, x8, x9; \ + adds x24, x24, x10; \ + adc x25, x25, xzr; \ + adds x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + 
adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + adcs x20, x20, x20; \ + adcs x21, x21, x21; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adcs x24, x24, x24; \ + adcs x25, x25, x25; \ + cset x0, hs; \ + umulh x10, x2, x2; \ + adds x11, x11, x10; \ + mul x10, x3, x3; \ + adcs x12, x12, x10; \ + umulh x10, x3, x3; \ + adcs x13, x13, x10; \ + mul x10, x4, x4; \ + adcs x14, x14, x10; \ + umulh x10, x4, x4; \ + adcs x15, x15, x10; \ + mul x10, x5, x5; \ + adcs x16, x16, x10; \ + umulh x10, x5, x5; \ + adcs x17, x17, x10; \ + mul x10, x6, x6; \ + adcs x19, x19, x10; \ + umulh x10, x6, x6; \ + adcs x20, x20, x10; \ + mul x10, x7, x7; \ + adcs x21, x21, x10; \ + umulh x10, x7, x7; \ + adcs x22, x22, x10; \ + mul x10, x8, x8; \ + adcs x23, x23, x10; \ + umulh x10, x8, x8; \ + adcs x24, x24, x10; \ + mul x10, x9, x9; \ + adcs x25, x25, x10; \ + umulh x10, x9, x9; \ + adc x0, x0, x10; \ + ldr x1, [P1+64]; \ + add x1, x1, x1; \ + mul x10, x1, x2; \ + adds x19, x19, x10; \ + umulh x10, x1, x2; \ + adcs x20, x20, x10; \ + mul x10, x1, x4; \ + adcs x21, x21, x10; \ + umulh x10, x1, x4; \ + adcs x22, x22, x10; \ + mul x10, x1, x6; \ + adcs x23, x23, x10; \ + umulh x10, x1, x6; \ + adcs x24, x24, x10; \ + mul x10, x1, x8; \ + adcs x25, x25, x10; \ + umulh x10, x1, x8; \ + adcs x0, x0, x10; \ + lsr x4, x1, #1; \ + mul x4, x4, x4; \ + adc x4, x4, xzr; \ + mul x10, x1, x3; \ + adds x20, x20, x10; \ + umulh x10, x1, x3; \ + adcs x21, x21, x10; \ + mul x10, x1, x5; \ + adcs x22, x22, x10; \ + umulh x10, x1, x5; \ + adcs x23, x23, x10; \ + mul x10, x1, x7; \ + adcs x24, x24, x10; \ + umulh x10, x1, x7; \ + adcs x25, x25, x10; \ + mul x10, x1, x9; \ + adcs x0, x0, x10; \ + umulh x10, x1, x9; \ + adc x4, x4, x10; \ + mul x2, x2, x2; \ + cmp xzr, xzr; \ + extr x10, x20, x19, #9; \ + adcs x2, x2, x10; \ + extr x10, x21, x20, #9; \ + adcs x11, x11, x10; \ + extr x10, x22, x21, #9; \ + adcs x12, x12, x10; \ + extr x10, x23, x22, #9; \ + adcs x13, x13, x10; \ + extr x10, x24, x23, #9; \ + adcs x14, x14, x10; \ + extr x10, x25, x24, #9; \ + adcs x15, x15, x10; \ + extr x10, x0, x25, #9; \ + adcs x16, x16, x10; \ + extr x10, x4, x0, #9; \ + adcs x17, x17, x10; \ + orr x19, x19, #0xfffffffffffffe00; \ + lsr x10, x4, #9; \ + adcs x19, x19, x10; \ + sbcs x2, x2, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x19, x19, xzr; \ + and x19, x19, #0x1ff; \ + stp x2, x11, [P0]; \ + stp x12, x13, [P0+16]; \ + stp x14, x15, [P0+32]; \ + stp x16, x17, [P0+48]; \ + str x19, [P0+64] + +// Corresponds exactly to bignum_add_p521 + +#define add_p521(P0,P1,P2) \ + cmp xzr, xzr; \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + adcs x5, x5, x4; \ + adcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + adcs x9, x9, x4; \ + adcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + adcs x11, x11, x4; \ + adcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + adc x13, x13, x4; \ + subs x4, x13, #512; \ + csetm x4, hs; \ + sbcs x5, x5, xzr; \ + and x4, x4, #0x200; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, x4; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, 
x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + sbcs x13, x13, x4; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Weak multiplication not fully reducing + +#define weakmul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + 
mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs 
x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adds x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + and x13, x24, #0x1ff; \ + lsr x14, x21, #9; \ + adc x13, x13, x14; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) + +#define cmsub_p521(P0,C,P1,D,P2) \ + ldp x6, x7, [P1]; \ + mov x1, #(C); \ + mul x3, x1, x6; \ + mul x4, x1, x7; \ + umulh x6, x1, x6; \ + adds x4, x4, x6; \ + umulh x7, x1, x7; \ + ldp x8, x9, [P1+16]; \ + mul x5, x1, x8; \ + mul x6, x1, x9; \ + umulh x8, x1, x8; \ + adcs x5, x5, x7; \ + umulh x9, x1, x9; \ + adcs x6, x6, x8; \ + ldp x10, x11, [P1+32]; \ + mul x7, x1, x10; \ + mul x8, x1, x11; \ + umulh x10, x1, x10; \ + adcs x7, x7, x9; \ + umulh x11, x1, x11; \ + adcs x8, x8, x10; \ + ldp x12, x13, [P1+48]; \ + mul x9, x1, x12; \ + mul x10, x1, x13; \ + umulh x12, x1, x12; \ + adcs x9, x9, x11; \ + umulh x13, x1, x13; \ + adcs x10, x10, x12; \ + ldr x14, [P1+64]; \ + mul x11, x1, x14; \ + adc x11, x11, x13; \ + mov x1, #(D); \ + ldp x20, x21, [P2]; \ + mvn x20, x20; \ + mul x0, x1, x20; \ + umulh x20, x1, x20; \ + adds x3, x3, x0; \ + mvn x21, x21; \ + mul x0, x1, x21; \ + umulh x21, x1, x21; \ + adcs x4, x4, x0; \ + ldp x22, x23, [P2+16]; \ + mvn x22, x22; \ + mul x0, x1, x22; \ + umulh x22, x1, x22; \ + adcs x5, x5, x0; \ + mvn x23, x23; \ + mul x0, x1, x23; \ + umulh x23, x1, x23; \ + adcs x6, x6, x0; \ + ldp x17, x19, [P2+32]; \ + mvn x17, x17; \ + mul x0, x1, x17; \ + umulh x17, x1, x17; \ + adcs x7, x7, x0; \ + mvn x19, 
x19; \ + mul x0, x1, x19; \ + umulh x19, x1, x19; \ + adcs x8, x8, x0; \ + ldp x2, x16, [P2+48]; \ + mvn x2, x2; \ + mul x0, x1, x2; \ + umulh x2, x1, x2; \ + adcs x9, x9, x0; \ + mvn x16, x16; \ + mul x0, x1, x16; \ + umulh x16, x1, x16; \ + adcs x10, x10, x0; \ + ldr x0, [P2+64]; \ + eor x0, x0, #0x1ff; \ + mul x0, x1, x0; \ + adc x11, x11, x0; \ + adds x4, x4, x20; \ + adcs x5, x5, x21; \ + and x15, x4, x5; \ + adcs x6, x6, x22; \ + and x15, x15, x6; \ + adcs x7, x7, x23; \ + and x15, x15, x7; \ + adcs x8, x8, x17; \ + and x15, x15, x8; \ + adcs x9, x9, x19; \ + and x15, x15, x9; \ + adcs x10, x10, x2; \ + and x15, x15, x10; \ + adc x11, x11, x16; \ + lsr x12, x11, #9; \ + orr x11, x11, #0xfffffffffffffe00; \ + cmp xzr, xzr; \ + adcs xzr, x3, x12; \ + adcs xzr, x15, xzr; \ + adcs xzr, x11, xzr; \ + adcs x3, x3, x12; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x11, x11, xzr; \ + and x11, x11, #0x1ff; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16]; \ + stp x7, x8, [P0+32]; \ + stp x9, x10, [P0+48]; \ + str x11, [P0+64] + +// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) + +#define cmsub38_p521(P0,P1,P2) \ + ldp x6, x7, [P1]; \ + lsl x3, x6, #1; \ + adds x3, x3, x6; \ + extr x4, x7, x6, #63; \ + adcs x4, x4, x7; \ + ldp x8, x9, [P1+16]; \ + extr x5, x8, x7, #63; \ + adcs x5, x5, x8; \ + extr x6, x9, x8, #63; \ + adcs x6, x6, x9; \ + ldp x10, x11, [P1+32]; \ + extr x7, x10, x9, #63; \ + adcs x7, x7, x10; \ + extr x8, x11, x10, #63; \ + adcs x8, x8, x11; \ + ldp x12, x13, [P1+48]; \ + extr x9, x12, x11, #63; \ + adcs x9, x9, x12; \ + extr x10, x13, x12, #63; \ + adcs x10, x10, x13; \ + ldr x14, [P1+64]; \ + extr x11, x14, x13, #63; \ + adc x11, x11, x14; \ + ldp x20, x21, [P2]; \ + mvn x20, x20; \ + lsl x0, x20, #3; \ + adds x3, x3, x0; \ + mvn x21, x21; \ + extr x0, x21, x20, #61; \ + adcs x4, x4, x0; \ + ldp x22, x23, [P2+16]; \ + mvn x22, x22; \ + extr x0, x22, x21, #61; \ + adcs x5, x5, x0; \ + and x15, x4, x5; \ + mvn x23, x23; \ + extr x0, x23, x22, #61; \ + adcs x6, x6, x0; \ + and x15, x15, x6; \ + ldp x20, x21, [P2+32]; \ + mvn x20, x20; \ + extr x0, x20, x23, #61; \ + adcs x7, x7, x0; \ + and x15, x15, x7; \ + mvn x21, x21; \ + extr x0, x21, x20, #61; \ + adcs x8, x8, x0; \ + and x15, x15, x8; \ + ldp x22, x23, [P2+48]; \ + mvn x22, x22; \ + extr x0, x22, x21, #61; \ + adcs x9, x9, x0; \ + and x15, x15, x9; \ + mvn x23, x23; \ + extr x0, x23, x22, #61; \ + adcs x10, x10, x0; \ + and x15, x15, x10; \ + ldr x0, [P2+64]; \ + eor x0, x0, #0x1ff; \ + extr x0, x0, x23, #61; \ + adc x11, x11, x0; \ + lsr x12, x11, #9; \ + orr x11, x11, #0xfffffffffffffe00; \ + cmp xzr, xzr; \ + adcs xzr, x3, x12; \ + adcs xzr, x15, xzr; \ + adcs xzr, x11, xzr; \ + adcs x3, x3, x12; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x11, x11, xzr; \ + and x11, x11, #0x1ff; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16]; \ + stp x7, x8, [P0+32]; \ + stp x9, x10, [P0+48]; \ + str x11, [P0+64] + +// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) + +#define cmsub41_p521(P0,P1,P2) \ + ldp x6, x7, [P1]; \ + lsl x3, x6, #2; \ + extr x4, x7, x6, #62; \ + ldp x8, x9, [P1+16]; \ + extr x5, x8, x7, #62; \ + extr x6, x9, x8, #62; \ + ldp x10, x11, [P1+32]; \ + extr x7, x10, x9, #62; \ + extr x8, x11, x10, #62; \ + ldp x12, x13, [P1+48]; \ + extr x9, x12, x11, #62; \ + extr x10, x13, x12, #62; 
\ + ldr x14, [P1+64]; \ + extr x11, x14, x13, #62; \ + ldp x0, x1, [P2]; \ + mvn x0, x0; \ + adds x3, x3, x0; \ + sbcs x4, x4, x1; \ + ldp x0, x1, [P2+16]; \ + sbcs x5, x5, x0; \ + and x15, x4, x5; \ + sbcs x6, x6, x1; \ + and x15, x15, x6; \ + ldp x0, x1, [P2+32]; \ + sbcs x7, x7, x0; \ + and x15, x15, x7; \ + sbcs x8, x8, x1; \ + and x15, x15, x8; \ + ldp x0, x1, [P2+48]; \ + sbcs x9, x9, x0; \ + and x15, x15, x9; \ + sbcs x10, x10, x1; \ + and x15, x15, x10; \ + ldr x0, [P2+64]; \ + eor x0, x0, #0x1ff; \ + adc x11, x11, x0; \ + lsr x12, x11, #9; \ + orr x11, x11, #0xfffffffffffffe00; \ + cmp xzr, xzr; \ + adcs xzr, x3, x12; \ + adcs xzr, x15, xzr; \ + adcs xzr, x11, xzr; \ + adcs x3, x3, x12; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x11, x11, xzr; \ + and x11, x11, #0x1ff; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16]; \ + stp x7, x8, [P0+32]; \ + stp x9, x10, [P0+48]; \ + str x11, [P0+64] + +S2N_BN_SYMBOL(p521_jdouble_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + sqr_p521(z2,z_1) + sqr_p521(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + add_p521(t1,x_1,z2) + sub_p521(t2,x_1,z2) + mul_p521(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p521(t1,y_1,z_1) + sqr_p521(x4p,x2p) + weakmul_p521(xy2,x_1,y2) + +// t2 = (y + z)^2 + + sqr_p521(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p521(d,12,xy2,9,x4p) + sub_p521(t1,t2,z2) + +// y4 = y^4 + + sqr_p521(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p521(z_3,t1,y2) + weakmul_p521(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p521(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p521(y_3,dx2,y4) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p384/p384_montjdouble_alt.S b/x86_att/p384/p384_montjdouble_alt.S new file mode 100644 index 0000000000..8258e35267 --- /dev/null +++ b/x86_att/p384/p384_montjdouble_alt.S @@ -0,0 +1,1196 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjdouble_alt +// (uint64_t p3[static 18],uint64_t p1[static 18]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1. The latter stays true +// but montsqr below modifies %rdi as well. Thus, we need +// to save %rdi and restore it before the writes to outputs. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y2 (NUMSIZE*1)(%rsp) +#define x2p (NUMSIZE*2)(%rsp) +#define xy2 (NUMSIZE*3)(%rsp) + +#define y4 (NUMSIZE*4)(%rsp) +#define t2 (NUMSIZE*4)(%rsp) + +#define dx2 (NUMSIZE*5)(%rsp) +#define t1 (NUMSIZE*5)(%rsp) + +#define d (NUMSIZE*6)(%rsp) +#define x4p (NUMSIZE*6)(%rsp) + +// Safe place for pointer to the output + +#define input_z (NUMSIZE*7)(%rsp) + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + xorl %r13d, %r13d ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + xorl %r14d, %r14d ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + xorl %r15d, %r15d ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r8, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x8+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r8, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r8, %r8 ; \ + negq %r8; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r9, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, 
%ebp ; \ + subq %rax, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbp, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r9, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r9, %r9 ; \ + negq %r9; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r10, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rbp, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r10, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r10, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r10, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r10, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r10, %r10 ; \ + negq %r10; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r11, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rbp, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r11, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r11, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r11, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r11, %r11 ; \ + negq %r11; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ 
+ movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r12, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r13 ; \ + sbbq %rdx, %r14 ; \ + sbbq %rbp, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r12, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r12, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r12, %r12 ; \ + negq %r12; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r13, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r14 ; \ + sbbq %rdx, %r15 ; \ + sbbq %rbp, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorl %edx, %edx ; \ + xorl %ebp, %ebp ; \ + xorl %r13d, %r13d ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %ebx ; \ + adcq %r15, %rbx ; \ + movl $0x1, %ecx ; \ + adcq %r8, %rcx ; \ + adcq %r9, %rdx ; \ + adcq %r10, %rbp ; \ + adcq %r11, %r13 ; \ + adcq $0x0, %r12 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %rbx, %r15 ; \ + cmovneq %rcx, %r8 ; \ + cmovneq %rdx, %r9 ; \ + cmovneq %rbp, %r10 ; \ + cmovneq %r13, %r11 ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %r8, 0x10+P0 ; \ + movq %r9, 0x18+P0 ; \ + movq %r10, 0x20+P0 ; \ + movq %r11, 0x28+P0 + +// Corresponds exactly to bignum_montsqr_p384_alt except %rsi -> %rdi + +#define montsqr_p384(P0,P1) \ + movq P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + movq %rax, %r15 ; \ + movq %rdx, %rcx ; \ + movq 0x10+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %rcx ; \ + movq 0x20+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq 
%rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rcx ; \ + sbbq %rbp, %rbp ; \ + xorl %ebx, %ebx ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + subq %rbp, %rdx ; \ + xorl %ebp, %ebp ; \ + addq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + xorl %r8d, %r8d ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %r15, %r15 ; \ + adcq %rcx, %rcx ; \ + adcq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rax; \ + movq %r8, P0 ; \ + movq %rax, %r8 ; \ + movq 0x8+P1, %rax ; \ + movq %rbp, 0x8+P0 ; \ + addq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq 0x8+P0, %rax ; \ + adcq P0, %rdx ; \ + movq %rax, %rbp ; \ + movq %rdx, %rdi ; \ + movq %rbx, P0 ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r8, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rbx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r9, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rax, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rbx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r10, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rax, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rbx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r11, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rax, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rbx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r12, %r13 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rax, %r9 ; \ + sbbq 
$0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rbx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r13, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rax, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rbx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rdi ; \ + movl $0x0, %r8d ; \ + adcq %r8, %r8 ; \ + xorq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + xorq %r13, %r13 ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %r9d ; \ + adcq %r15, %r9 ; \ + movl $0x1, %r10d ; \ + adcq %rcx, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq %rdi, %r13 ; \ + adcq $0x0, %r8 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %r9, %r15 ; \ + cmovneq %r10, %rcx ; \ + cmovneq %r11, %rbx ; \ + cmovneq %r12, %rbp ; \ + cmovneq %r13, %rdi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rdi, 0x28+P0 + +#define sub_p384(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %ebx ; \ + andq %rbx, %rcx ; \ + xorq %rbx, %rbx ; \ + subq %rcx, %rbx ; \ + subq %rbx, %rax ; \ + movq %rax, P0 ; \ + sbbq %rcx, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq %rax, %rax ; \ + andq %rbx, %rcx ; \ + negq %rax; \ + sbbq %rcx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// Simplified bignum_add_p384, without carry chain suspension + +#define add_p384(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + adcq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + adcq 0x28+P2, %r11 ; \ + movl $0x0, %edx ; \ + adcq %rdx, %rdx ; \ + movq $0xffffffff00000001, %rbp ; \ + addq %rbp, %rax ; \ + movl $0xffffffff, %ebp ; \ + adcq %rbp, %rcx ; \ + adcq $0x1, %r8 ; \ + adcq $0x0, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq $0x0, %r11 ; \ + adcq $0xffffffffffffffff, %rdx ; \ + movl $1, %ebx ; \ + andq %rdx, %rbx ; \ + andq %rbp, %rdx ; \ + xorq %rbp, %rbp ; \ + subq %rdx, %rbp ; \ + subq %rbp, %rax ; \ + movq %rax, P0 ; \ + sbbq %rdx, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq %rbx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// P0 = 4 * P1 - P2 + +#define cmsub41_p384(P0,P1,P2) \ + movq 40+P1, %rcx ; \ + movq %rcx, %r13 ; \ + shrq $62, %rcx ; \ + movq 32+P1, %r12 ; \ + shldq $2, %r12, %r13 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + addq $1, %rcx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, 
%r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq 0x20+P2, %r12 ; \ + sbbq 0x28+P2, %r13 ; \ + sbbq $0, %rcx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + movq %rcx, %rax ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %edx ; \ + negq %rcx; \ + mulq %rdx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rcx, %r11 ; \ + adcq $0x0, %r12 ; \ + adcq $0x0, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rcx; \ + movl $0xffffffff, %edx ; \ + xorq %rax, %rax ; \ + andq %rcx, %rdx ; \ + subq %rdx, %rax ; \ + andq $0x1, %rcx ; \ + subq %rax, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rcx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 + +// P0 = C * P1 - D * P2 + +#define cmsub_p384(P0,C,P1,D,P2) \ + movq $0x00000000ffffffff, %r9 ; \ + subq P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq 8+P2, %r10 ; \ + movq $0xfffffffffffffffe, %r11 ; \ + sbbq 16+P2, %r11 ; \ + movq $0xffffffffffffffff, %r12 ; \ + sbbq 24+P2, %r12 ; \ + movq $0xffffffffffffffff, %r13 ; \ + sbbq 32+P2, %r13 ; \ + movq $0xffffffffffffffff, %r14 ; \ + sbbq 40+P2, %r14 ; \ + movq $D, %rcx ; \ + movq %r9, %rax ; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq %r10, %rax ; \ + xorl %r10d, %r10d ; \ + mulq %rcx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq %r11, %rax ; \ + xorl %r11d, %r11d ; \ + mulq %rcx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r12, %rax ; \ + xorl %r12d, %r12d ; \ + mulq %rcx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq %r13, %rax ; \ + xorl %r13d, %r13d ; \ + mulq %rcx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq %r14, %rax ; \ + movl $1, %r14d ; \ + mulq %rcx; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + movl $C, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbx, %rbx ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbx, %rbx ; \ + movq 0x28+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %r14; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r14, %r10 ; \ + movq %r14, %rax ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %edx ; \ + negq %rcx; \ + mulq %rdx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rcx, %r11 ; \ + adcq $0x0, %r12 ; \ + adcq $0x0, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rcx; \ + movl $0xffffffff, %edx ; \ + xorq %rax, %rax ; \ + andq %rcx, %rdx ; \ + subq %rdx, %rax ; \ + andq $0x1, %rcx ; \ + subq %rax, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rcx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 + +// A weak version of add that only guarantees sum in 6 digits + +#define weakadd_p384(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 
0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + adcq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + adcq 0x28+P2, %r11 ; \ + sbbq %rdx, %rdx ; \ + movl $1, %ebx ; \ + andq %rdx, %rbx ; \ + movl $0xffffffff, %ebp ; \ + andq %rbp, %rdx ; \ + xorq %rbp, %rbp ; \ + subq %rdx, %rbp ; \ + addq %rbp, %rax ; \ + movq %rax, P0 ; \ + adcq %rdx, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %rbx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + adcq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// P0 = 3 * P1 - 8 * P2 + +#define cmsub38_p384(P0,P1,P2) \ + movq $0x00000000ffffffff, %r8 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xfffffffffffffffe, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0xffffffffffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + movq $0xffffffffffffffff, %r13 ; \ + sbbq 40+P2, %r13 ; \ + movq %r13, %r14 ; \ + shrq $61, %r14 ; \ + shldq $3, %r12, %r13 ; \ + shldq $3, %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + addq $1, %r14 ; \ + movl $3, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbx, %rbx ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbx, %rbx ; \ + movq 0x28+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %r14; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r14, %r10 ; \ + movq %r14, %rax ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %edx ; \ + negq %rcx; \ + mulq %rdx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rcx, %r11 ; \ + adcq $0x0, %r12 ; \ + adcq $0x0, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rcx; \ + movl $0xffffffff, %edx ; \ + xorq %rax, %rax ; \ + andq %rcx, %rdx ; \ + subq %rdx, %rax ; \ + andq $0x1, %rcx ; \ + subq %rax, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rcx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 + +S2N_BN_SYMBOL(p384_montjdouble_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables +// Save the output pointer %rdi which gets overwritten in earlier +// operations before it is used. 
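+//
+// The main code that follows is a straight-line sequence of field operations
+// computing the doubling formulas annotated inline below: z2 = z^2, y2 = y^2,
+// x2p = (x + z^2) * (x - z^2), x4p = x2p^2, xy2 = x * y^2,
+// d = 12 * xy2 - 9 * x4p, z' = 2 * y * z, x' = 4 * xy2 - d and
+// y' = 3 * (d * x2p) - 8 * y^4, all modulo p_384.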
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdi, input_z + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p384(z2,z_1) + montsqr_p384(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + weakadd_p384(t1,x_1,z2) + sub_p384(t2,x_1,z2) + montmul_p384(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p384(t1,y_1,z_1) + montsqr_p384(x4p,x2p) + montmul_p384(xy2,x_1,y2) + +// t2 = (y + z)^2 + + montsqr_p384(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p384(d,12,xy2,9,x4p) + sub_p384(t1,t2,z2) + +// y4 = y^4 + + montsqr_p384(y4,y2) + +// Restore the output pointer to write to x_3, y_3 and z_3. + + movq input_z, %rdi + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p384(z_3,t1,y2) + montmul_p384(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p384(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p384(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p521/p521_jdouble_alt.S b/x86_att/p521/p521_jdouble_alt.S new file mode 100644 index 0000000000..2dc6c32120 --- /dev/null +++ b/x86_att/p521/p521_jdouble_alt.S @@ -0,0 +1,1865 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jdouble_alt +// (uint64_t p3[static 27],uint64_t p1[static 27]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_521 and that the z coordinate is not zero. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence +// This is actually where they come in anyway and they stay there. + +#define input_z %rdi +#define input_x %rsi + +// Pointer-offset pairs for inputs and outputs + +#define x_1 0(input_x) +#define y_1 NUMSIZE(input_x) +#define z_1 (2*NUMSIZE)(input_x) + +#define x_3 0(input_z) +#define y_3 NUMSIZE(input_z) +#define z_3 (2*NUMSIZE)(input_z) + +// Pointer-offset pairs for temporaries, with some aliasing +// The tmp field is internal storage for field mul and sqr. 
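+// Concretely, tmp is the eighth NUMSIZE-byte slot at offset NUMSIZE*7 = 504,
+// where the mul_p521/sqr_p521 macros below spill the low nine 64-bit words of
+// their double-length products (504(%rsp) through 568(%rsp)); the aliased
+// pairs of temporaries are y4/t2, dx2/t1 and d/x4p.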
+// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y2 (NUMSIZE*1)(%rsp) +#define x2p (NUMSIZE*2)(%rsp) +#define xy2 (NUMSIZE*3)(%rsp) + +#define y4 (NUMSIZE*4)(%rsp) +#define t2 (NUMSIZE*4)(%rsp) + +#define dx2 (NUMSIZE*5)(%rsp) +#define t1 (NUMSIZE*5)(%rsp) + +#define d (NUMSIZE*6)(%rsp) +#define x4p (NUMSIZE*6)(%rsp) + +#define tmp (NUMSIZE*7)(%rsp) + +#define NSPACE (NUMSIZE*7+72) + +// Corresponds to bignum_mul_p521_alt except temp storage location + +#define mul_p521(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r9, 512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + xorq %r14, %r14 ; \ + movq P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) ; \ + xorq %r15, %r15 ; \ + movq P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + xorq %r8, %r8 ; \ + movq P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x28+P1, %rax 
; \ + mulq 0x8+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; \ + xorq %r9, %r9 ; \ + movq P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x38+P1, %rax ; \ + mulq P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + xorq %r10, %r10 ; \ + movq P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x40+P1, %rax ; \ + mulq P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + xorq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; 
\ + movq 0x38+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq 0x40+P2, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 
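
The tail of the mul_p521 macro above (the shrdq-by-9 chain followed by the
stc/adcq/cmc/sbbq sequence) performs reduction modulo p_521 = 2^521 - 1 using
the identity 2^521 == 1 (mod p_521): the double-length product is split at bit
521 and the two halves are added, with at most one conditional correction.
A minimal Python sketch of that arithmetic, for reference only (the names and
structure below are illustrative and not taken from the patch):

    P_521 = (1 << 521) - 1

    def reduce_p521(product):
        # Reduce a product of two values below p_521 modulo p_521 = 2^521 - 1.
        lo = product & P_521   # low 521 bits (words 0..7 plus 9 bits of word 8)
        hi = product >> 521    # high part; the macro forms this with shrdq $9
        s = lo + hi            # 2^521 == 1 (mod p_521), so product == lo + hi
        return s - P_521 if s >= P_521 else s   # single conditional correction

    # Cross-check against plain big-integer arithmetic:
    import random
    for _ in range(1000):
        a, b = random.randrange(P_521), random.randrange(P_521)
        assert reduce_p521(a * b) == (a * b) % P_521
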
+ +// Corresponds to bignum_sqr_p521_alt except temp storage location + +#define sqr_p521(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq %r9, 512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rcx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x20+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rcx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x28+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rcx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x30+P1; \ + xorq %r8, %r8 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r8 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r8, %r8 ; \ + addq %rbx, %r14 ; \ + adcq %rcx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x38+P1; \ + xorq %r9, %r9 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r9 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r9, %r9 ; \ + addq %rbx, %r15 ; \ + adcq %rcx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r10, %r10 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r10 ; \ + 
movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r10 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r10, %r10 ; \ + addq %rbx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r11, %r11 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r11 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r11, %r11 ; \ + addq %rbx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r12, %r12 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r12 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r12, %r12 ; \ + addq %rbx, %r10 ; \ + adcq %rcx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rcx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rcx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rcx, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r8 ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq %rax, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, 
%r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 ; \ + +// Corresponds exactly to bignum_add_p521 + +#define add_p521(P0,P1,P2) \ + stc; \ + movq P1, %rax ; \ + adcq P2, %rax ; \ + movq 0x8+P1, %rbx ; \ + adcq 0x8+P2, %rbx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + adcq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + adcq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + adcq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + adcq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + adcq 0x40+P2, %r14 ; \ + movq $0x200, %rdx ; \ + andq %r14, %rdx ; \ + cmpq $0x200, %rdx ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rbx ; \ + movq %rbx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq %rdx, %r14 ; \ + movq %r14, 0x40+P0 + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + sbbq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + sbbq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + sbbq 0x40+P2, %r14 ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq $0x0, %r14 ; \ + andq $0x1ff, %r14 ; \ + movq %r14, 0x40+P0 + +// Weak multiplication not fully reducing + +#define weakmul_p521(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r9, 512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, 
%r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + xorq %r14, %r14 ; \ + movq P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) ; \ + xorq %r15, %r15 ; \ + movq P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + xorq %r8, %r8 ; \ + movq P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; \ + xorq %r9, %r9 ; \ + movq P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x38+P1, %rax ; \ + mulq P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + xorq %r10, %r10 ; \ + movq P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, 
%r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x40+P1, %rax ; \ + mulq P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + xorq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r12 
; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq 0x40+P2, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + addq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0, %rdx ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 ; \ + movq %r13, 0x28+P0 ; \ + movq %r14, 0x30+P0 ; \ + movq %r15, 0x38+P0 ; \ + movq %rdx, 0x40+P0 + +// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) + +#define cmsub_p521(P0,C,P1,D,P2) \ + movq $D, %rcx ; \ + movq P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 8+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 16+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 24+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 32+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r13d, %r13d ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq 40+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r14d, %r14d ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + movq 48+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r15d, %r15d ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq 56+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %ebx, %ebx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rbx ; \ + movq 64+P2, %rax ; \ + xorq $0x1FF, %rax ; \ + imulq %rcx, %rax ; \ + addq %rax, %rbx ; \ + xorl %eax, %eax ; \ + movl $C, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ 
+ mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x30+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x38+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rbx ; \ + movq 0x40+P1, %rax ; \ + imulq %rcx, %rax ; \ + addq %rax, %rbx ; \ + movq %r9, %rax ; \ + andq %r10, %rax ; \ + andq %r11, %rax ; \ + andq %r12, %rax ; \ + andq %r13, %rax ; \ + andq %r14, %rax ; \ + andq %r15, %rax ; \ + movq %rbx, %rdx ; \ + shrq $9, %rdx ; \ + orq $~0x1FF, %rbx ; \ + leaq 1(%rdx), %rcx ; \ + addq %r8, %rcx ; \ + movl $0, %ecx ; \ + adcq %rcx, %rax ; \ + movq %rbx, %rax ; \ + adcq %rcx, %rax ; \ + adcq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rcx, %r9 ; \ + movq %r9, 8+P0 ; \ + adcq %rcx, %r10 ; \ + movq %r10, 16+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 24+P0 ; \ + adcq %rcx, %r12 ; \ + movq %r12, 32+P0 ; \ + adcq %rcx, %r13 ; \ + movq %r13, 40+P0 ; \ + adcq %rcx, %r14 ; \ + movq %r14, 48+P0 ; \ + adcq %rcx, %r15 ; \ + movq %r15, 56+P0 ; \ + adcq %rcx, %rbx ; \ + andq $0x1FF, %rbx ; \ + movq %rbx, 64+P0 + +// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) + +#define cmsub38_p521(P0,P1,P2) \ + movq 64+P2, %rbx ; \ + xorq $0x1FF, %rbx ; \ + movq 56+P2, %r15 ; \ + notq %r15; \ + shldq $3, %r15, %rbx ; \ + movq 48+P2, %r14 ; \ + notq %r14; \ + shldq $3, %r14, %r15 ; \ + movq 40+P2, %r13 ; \ + notq %r13; \ + shldq $3, %r13, %r14 ; \ + movq 32+P2, %r12 ; \ + notq %r12; \ + shldq $3, %r12, %r13 ; \ + movq 24+P2, %r11 ; \ + notq %r11; \ + shldq $3, %r11, %r12 ; \ + movq 16+P2, %r10 ; \ + notq %r10; \ + shldq $3, %r10, %r11 ; \ + movq 8+P2, %r9 ; \ + notq %r9; \ + shldq $3, %r9, %r10 ; \ + movq P2, %r8 ; \ + notq %r8; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + movl $3, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x30+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x38+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rbx ; \ + movq 0x40+P1, %rax ; \ + imulq %rcx, %rax ; \ + addq %rax, %rbx ; \ + movq %r9, %rax ; \ + andq %r10, %rax ; \ + andq %r11, %rax ; \ + andq %r12, %rax ; \ + andq %r13, %rax ; \ + andq %r14, %rax ; \ + andq %r15, %rax ; \ + movq %rbx, %rdx ; \ + shrq $9, %rdx ; \ + orq $~0x1FF, %rbx ; \ + leaq 1(%rdx), %rcx ; \ + addq %r8, %rcx ; \ + movl $0, %ecx ; \ + adcq %rcx, %rax ; \ + movq %rbx, %rax ; \ + adcq %rcx, %rax ; \ + adcq %rdx, %r8 ; \ + movq %r8, P0 ; \ + 
adcq %rcx, %r9 ; \ + movq %r9, 8+P0 ; \ + adcq %rcx, %r10 ; \ + movq %r10, 16+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 24+P0 ; \ + adcq %rcx, %r12 ; \ + movq %r12, 32+P0 ; \ + adcq %rcx, %r13 ; \ + movq %r13, 40+P0 ; \ + adcq %rcx, %r14 ; \ + movq %r14, 48+P0 ; \ + adcq %rcx, %r15 ; \ + movq %r15, 56+P0 ; \ + adcq %rcx, %rbx ; \ + andq $0x1FF, %rbx ; \ + movq %rbx, 64+P0 + +// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) + +#define cmsub41_p521(P0,P1,P2) \ + movq 64+P1, %rbx ; \ + movq 56+P1, %r15 ; \ + shldq $2, %r15, %rbx ; \ + movq 48+P1, %r14 ; \ + shldq $2, %r14, %r15 ; \ + movq 40+P1, %r13 ; \ + shldq $2, %r13, %r14 ; \ + movq 32+P1, %r12 ; \ + shldq $2, %r12, %r13 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + movq 64+P2, %rcx ; \ + xorq $0x1FF, %rcx ; \ + movq P2, %rax ; \ + notq %rax; \ + addq %rax, %r8 ; \ + movq 8+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r9 ; \ + movq 16+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r10 ; \ + movq 24+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r11 ; \ + movq 32+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r12 ; \ + movq 40+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r13 ; \ + movq 48+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r14 ; \ + movq 56+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r15 ; \ + adcq %rcx, %rbx ; \ + movq %r9, %rax ; \ + andq %r10, %rax ; \ + andq %r11, %rax ; \ + andq %r12, %rax ; \ + andq %r13, %rax ; \ + andq %r14, %rax ; \ + andq %r15, %rax ; \ + movq %rbx, %rdx ; \ + shrq $9, %rdx ; \ + orq $~0x1FF, %rbx ; \ + leaq 1(%rdx), %rcx ; \ + addq %r8, %rcx ; \ + movl $0, %ecx ; \ + adcq %rcx, %rax ; \ + movq %rbx, %rax ; \ + adcq %rcx, %rax ; \ + adcq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rcx, %r9 ; \ + movq %r9, 8+P0 ; \ + adcq %rcx, %r10 ; \ + movq %r10, 16+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 24+P0 ; \ + adcq %rcx, %r12 ; \ + movq %r12, 32+P0 ; \ + adcq %rcx, %r13 ; \ + movq %r13, 40+P0 ; \ + adcq %rcx, %r14 ; \ + movq %r14, 48+P0 ; \ + adcq %rcx, %r15 ; \ + movq %r15, 56+P0 ; \ + adcq %rcx, %rbx ; \ + andq $0x1FF, %rbx ; \ + movq %rbx, 64+P0 + +S2N_BN_SYMBOL(p521_jdouble_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + sqr_p521(z2,z_1) + sqr_p521(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + add_p521(t1,x_1,z2) + sub_p521(t2,x_1,z2) + mul_p521(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p521(t1,y_1,z_1) + sqr_p521(x4p,x2p) + weakmul_p521(xy2,x_1,y2) + +// t2 = (y + z)^2 + + sqr_p521(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p521(d,12,xy2,9,x4p) + sub_p521(t1,t2,z2) + +// y4 = y^4 + + sqr_p521(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p521(z_3,t1,y2) + weakmul_p521(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p521(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p521(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif From 
d5c288c512726e49dbbd8d4a442768d16cc6df9c Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 3 Apr 2024 21:02:18 -0700 Subject: [PATCH 17/24] Switch inlining to subroutines in non-alt ARM P-521 point doubling The previous version with the field operations inlined led to both a very long-running proof and rather extravagant code size. The big two field operations (modular multiplication and squaring) now call local subroutines instead, which solves both problems and seems to have minimal impact on performance. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/8c279779881e79e65c2438edb4586f67cd0184ff --- arm/p521/p521_jdouble.S | 2104 ++++++++++++++++++++------------------- 1 file changed, 1061 insertions(+), 1043 deletions(-) diff --git a/arm/p521/p521_jdouble.S b/arm/p521/p521_jdouble.S index 55a701ba92..3eb0250b33 100644 --- a/arm/p521/p521_jdouble.S +++ b/arm/p521/p521_jdouble.S @@ -28,8 +28,8 @@ // Stable homes for input arguments during main code sequence -#define input_z x26 -#define input_x x27 +#define input_z x27 +#define input_x x28 // Pointer-offset pairs for inputs and outputs @@ -61,1052 +61,21 @@ #define NSPACE (NUMSIZE*7+8) -// Corresponds exactly to bignum_mul_p521 except that the -// destination buffer P0 is used as a temporary storage, -// also swapping some load/store orders to avoid aliasing -// troubles; also x0 is used in place of x26. +// For the two "big" field operations, we use subroutines not inlining. +// Call local code very close to bignum_mul_p521 and bignum_sqr_p521. #define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2]; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x7; \ - mul x15, x4, x8; \ - mul x16, x5, x9; \ - mul x17, x6, x10; \ - umulh x19, x3, x7; \ - adds x15, x15, x19; \ - umulh x19, x4, x8; \ - adcs x16, x16, x19; \ - umulh x19, x5, x9; \ - adcs x17, x17, x19; \ - umulh x19, x6, x10; \ - adc x19, x19, xzr; \ - adds x12, x15, x11; \ - adcs x15, x16, x15; \ - adcs x16, x17, x16; \ - adcs x17, x19, x17; \ - adc x19, xzr, x19; \ - adds x13, x15, x11; \ - adcs x14, x16, x12; \ - adcs x15, x17, x15; \ - adcs x16, x19, x16; \ - adcs x17, xzr, x17; \ - adc x19, xzr, x19; \ - subs x24, x5, x6; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x10, x9; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x16, x16, x23; \ - eor x22, x22, x21; \ - adcs x17, x17, x22; \ - adc x19, x19, x21; \ - subs x24, x3, x4; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x8, x7; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x12, x12, x23; \ - eor x22, x22, x21; \ - adcs x13, x13, x22; \ - adcs x14, x14, x21; \ - adcs x15, x15, x21; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x4, x6; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x10, x8; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x15, x15, x23; \ - eor x22, x22, x21; \ - adcs x16, x16, x22; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x3, x5; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x9, x7; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x13, x13, x23; \ - eor x22, x22, x21; \ - adcs x14, 
x14, x22; \ - adcs x15, x15, x21; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x3, x6; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x10, x7; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x14, x14, x23; \ - eor x22, x22, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x4, x5; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x9, x8; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x14, x14, x23; \ - eor x22, x22, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - lsl x21, x11, #9; \ - extr x11, x12, x11, #55; \ - extr x12, x13, x12, #55; \ - extr x13, x14, x13, #55; \ - lsr x14, x14, #55; \ - ldp x3, x4, [P1+32]; \ - ldp x5, x6, [P1+48]; \ - ldp x7, x8, [P2+32]; \ - ldp x9, x10, [P2+48]; \ - stp x15, x16, [P0]; \ - stp x17, x19, [P0+16]; \ - stp x21, x11, [P0+32]; \ - stp x12, x13, [P0+48]; \ - str x14, [P0+64]; \ - mul x11, x3, x7; \ - mul x15, x4, x8; \ - mul x16, x5, x9; \ - mul x17, x6, x10; \ - umulh x19, x3, x7; \ - adds x15, x15, x19; \ - umulh x19, x4, x8; \ - adcs x16, x16, x19; \ - umulh x19, x5, x9; \ - adcs x17, x17, x19; \ - umulh x19, x6, x10; \ - adc x19, x19, xzr; \ - adds x12, x15, x11; \ - adcs x15, x16, x15; \ - adcs x16, x17, x16; \ - adcs x17, x19, x17; \ - adc x19, xzr, x19; \ - adds x13, x15, x11; \ - adcs x14, x16, x12; \ - adcs x15, x17, x15; \ - adcs x16, x19, x16; \ - adcs x17, xzr, x17; \ - adc x19, xzr, x19; \ - subs x24, x5, x6; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x10, x9; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x16, x16, x23; \ - eor x22, x22, x21; \ - adcs x17, x17, x22; \ - adc x19, x19, x21; \ - subs x24, x3, x4; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x8, x7; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x12, x12, x23; \ - eor x22, x22, x21; \ - adcs x13, x13, x22; \ - adcs x14, x14, x21; \ - adcs x15, x15, x21; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x4, x6; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x10, x8; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x15, x15, x23; \ - eor x22, x22, x21; \ - adcs x16, x16, x22; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x3, x5; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x9, x7; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x13, x13, x23; \ - eor x22, x22, x21; \ - adcs x14, x14, x22; \ - adcs x15, x15, x21; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x3, x6; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x10, x7; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x14, x14, x23; \ - eor x22, x22, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, 
x4, x5; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x9, x8; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x14, x14, x23; \ - eor x22, x22, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - ldp x23, x22, [P0]; \ - adds x11, x11, x23; \ - adcs x12, x12, x22; \ - stp x11, x12, [P0]; \ - ldp x23, x22, [P0+16]; \ - adcs x13, x13, x23; \ - adcs x14, x14, x22; \ - stp x13, x14, [P0+16]; \ - ldp x23, x22, [P0+32]; \ - adcs x15, x15, x23; \ - adcs x16, x16, x22; \ - stp x15, x16, [P0+32]; \ - ldp x23, x22, [P0+48]; \ - adcs x17, x17, x23; \ - adcs x19, x19, x22; \ - stp x17, x19, [P0+48]; \ - ldr x21, [P0+64]; \ - adc x21, x21, xzr; \ - str x21, [P0+64]; \ - ldp x23, x22, [P1]; \ - subs x3, x3, x23; \ - sbcs x4, x4, x22; \ - ldp x23, x22, [P1+16]; \ - sbcs x5, x5, x23; \ - sbcs x6, x6, x22; \ - csetm x24, lo; \ - ldp x23, x22, [P2]; \ - subs x7, x23, x7; \ - sbcs x8, x22, x8; \ - ldp x23, x22, [P2+16]; \ - sbcs x9, x23, x9; \ - sbcs x10, x22, x10; \ - csetm x25, lo; \ - eor x3, x3, x24; \ - subs x3, x3, x24; \ - eor x4, x4, x24; \ - sbcs x4, x4, x24; \ - eor x5, x5, x24; \ - sbcs x5, x5, x24; \ - eor x6, x6, x24; \ - sbc x6, x6, x24; \ - eor x7, x7, x25; \ - subs x7, x7, x25; \ - eor x8, x8, x25; \ - sbcs x8, x8, x25; \ - eor x9, x9, x25; \ - sbcs x9, x9, x25; \ - eor x10, x10, x25; \ - sbc x10, x10, x25; \ - eor x25, x25, x24; \ - mul x11, x3, x7; \ - mul x15, x4, x8; \ - mul x16, x5, x9; \ - mul x17, x6, x10; \ - umulh x19, x3, x7; \ - adds x15, x15, x19; \ - umulh x19, x4, x8; \ - adcs x16, x16, x19; \ - umulh x19, x5, x9; \ - adcs x17, x17, x19; \ - umulh x19, x6, x10; \ - adc x19, x19, xzr; \ - adds x12, x15, x11; \ - adcs x15, x16, x15; \ - adcs x16, x17, x16; \ - adcs x17, x19, x17; \ - adc x19, xzr, x19; \ - adds x13, x15, x11; \ - adcs x14, x16, x12; \ - adcs x15, x17, x15; \ - adcs x16, x19, x16; \ - adcs x17, xzr, x17; \ - adc x19, xzr, x19; \ - subs x24, x5, x6; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x10, x9; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x16, x16, x23; \ - eor x22, x22, x21; \ - adcs x17, x17, x22; \ - adc x19, x19, x21; \ - subs x24, x3, x4; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x8, x7; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x12, x12, x23; \ - eor x22, x22, x21; \ - adcs x13, x13, x22; \ - adcs x14, x14, x21; \ - adcs x15, x15, x21; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x4, x6; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x10, x8; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x15, x15, x23; \ - eor x22, x22, x21; \ - adcs x16, x16, x22; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x3, x5; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x9, x7; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x13, x13, x23; \ - eor x22, x22, x21; \ - adcs x14, x14, x22; \ - adcs x15, x15, x21; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x3, x6; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ 
- subs x22, x10, x7; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x14, x14, x23; \ - eor x22, x22, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - subs x24, x4, x5; \ - cneg x24, x24, lo; \ - csetm x21, lo; \ - subs x22, x9, x8; \ - cneg x22, x22, lo; \ - mul x23, x24, x22; \ - umulh x22, x24, x22; \ - cinv x21, x21, lo; \ - cmn x21, #1; \ - eor x23, x23, x21; \ - adcs x14, x14, x23; \ - eor x22, x22, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x21; \ - adcs x17, x17, x21; \ - adc x19, x19, x21; \ - ldp x3, x4, [P0]; \ - ldp x5, x6, [P0+16]; \ - eor x11, x11, x25; \ - adds x11, x11, x3; \ - eor x12, x12, x25; \ - adcs x12, x12, x4; \ - eor x13, x13, x25; \ - adcs x13, x13, x5; \ - eor x14, x14, x25; \ - adcs x14, x14, x6; \ - eor x15, x15, x25; \ - ldp x7, x8, [P0+32]; \ - ldp x9, x10, [P0+48]; \ - ldr x20, [P0+64]; \ - adcs x15, x15, x7; \ - eor x16, x16, x25; \ - adcs x16, x16, x8; \ - eor x17, x17, x25; \ - adcs x17, x17, x9; \ - eor x19, x19, x25; \ - adcs x19, x19, x10; \ - adc x21, x20, xzr; \ - adds x15, x15, x3; \ - adcs x16, x16, x4; \ - adcs x17, x17, x5; \ - adcs x19, x19, x6; \ - and x25, x25, #0x1ff; \ - lsl x24, x11, #9; \ - orr x24, x24, x25; \ - adcs x7, x7, x24; \ - extr x24, x12, x11, #55; \ - adcs x8, x8, x24; \ - extr x24, x13, x12, #55; \ - adcs x9, x9, x24; \ - extr x24, x14, x13, #55; \ - adcs x10, x10, x24; \ - lsr x24, x14, #55; \ - adc x20, x24, x20; \ - ldr x6, [P2+64]; \ - ldp x3, x4, [P1]; \ - and x23, x3, #0xfffffffffffff; \ - mul x23, x6, x23; \ - ldr x14, [P1+64]; \ - ldp x11, x12, [P2]; \ - and x24, x11, #0xfffffffffffff; \ - mul x24, x14, x24; \ - add x23, x23, x24; \ - extr x24, x4, x3, #52; \ - and x24, x24, #0xfffffffffffff; \ - mul x22, x6, x24; \ - extr x24, x12, x11, #52; \ - and x24, x24, #0xfffffffffffff; \ - mul x24, x14, x24; \ - add x22, x22, x24; \ - lsr x24, x23, #52; \ - add x22, x22, x24; \ - lsl x23, x23, #12; \ - extr x24, x22, x23, #12; \ - adds x15, x15, x24; \ - ldp x5, x3, [P1+16]; \ - ldp x13, x11, [P2+16]; \ - extr x24, x5, x4, #40; \ - and x24, x24, #0xfffffffffffff; \ - mul x23, x6, x24; \ - extr x24, x13, x12, #40; \ - and x24, x24, #0xfffffffffffff; \ - mul x24, x14, x24; \ - add x23, x23, x24; \ - lsr x24, x22, #52; \ - add x23, x23, x24; \ - lsl x22, x22, #12; \ - extr x24, x23, x22, #24; \ - adcs x16, x16, x24; \ - extr x24, x3, x5, #28; \ - and x24, x24, #0xfffffffffffff; \ - mul x22, x6, x24; \ - extr x24, x11, x13, #28; \ - and x24, x24, #0xfffffffffffff; \ - mul x24, x14, x24; \ - add x22, x22, x24; \ - lsr x24, x23, #52; \ - add x22, x22, x24; \ - lsl x23, x23, #12; \ - extr x24, x22, x23, #36; \ - adcs x17, x17, x24; \ - and x0, x16, x17; \ - ldp x4, x5, [P1+32]; \ - ldp x12, x13, [P2+32]; \ - extr x24, x4, x3, #16; \ - and x24, x24, #0xfffffffffffff; \ - mul x23, x6, x24; \ - extr x24, x12, x11, #16; \ - and x24, x24, #0xfffffffffffff; \ - mul x24, x14, x24; \ - add x23, x23, x24; \ - lsl x21, x21, #48; \ - add x23, x23, x21; \ - lsr x24, x22, #52; \ - add x23, x23, x24; \ - lsl x22, x22, #12; \ - extr x24, x23, x22, #48; \ - adcs x19, x19, x24; \ - and x0, x0, x19; \ - lsr x24, x4, #4; \ - and x24, x24, #0xfffffffffffff; \ - mul x22, x6, x24; \ - lsr x24, x12, #4; \ - and x24, x24, #0xfffffffffffff; \ - mul x24, x14, x24; \ - add x22, x22, x24; \ - lsr x24, x23, #52; \ - add x22, x22, x24; \ - lsl x23, x23, #12; \ - extr x25, x22, x23, #60; \ - extr x24, x5, x4, 
#56; \ - and x24, x24, #0xfffffffffffff; \ - mul x23, x6, x24; \ - extr x24, x13, x12, #56; \ - and x24, x24, #0xfffffffffffff; \ - mul x24, x14, x24; \ - add x23, x23, x24; \ - lsr x24, x22, #52; \ - add x23, x23, x24; \ - lsl x25, x25, #8; \ - extr x24, x23, x25, #8; \ - adcs x7, x7, x24; \ - and x0, x0, x7; \ - ldp x3, x4, [P1+48]; \ - ldp x11, x12, [P2+48]; \ - extr x24, x3, x5, #44; \ - and x24, x24, #0xfffffffffffff; \ - mul x22, x6, x24; \ - extr x24, x11, x13, #44; \ - and x24, x24, #0xfffffffffffff; \ - mul x24, x14, x24; \ - add x22, x22, x24; \ - lsr x24, x23, #52; \ - add x22, x22, x24; \ - lsl x23, x23, #12; \ - extr x24, x22, x23, #20; \ - adcs x8, x8, x24; \ - and x0, x0, x8; \ - extr x24, x4, x3, #32; \ - and x24, x24, #0xfffffffffffff; \ - mul x23, x6, x24; \ - extr x24, x12, x11, #32; \ - and x24, x24, #0xfffffffffffff; \ - mul x24, x14, x24; \ - add x23, x23, x24; \ - lsr x24, x22, #52; \ - add x23, x23, x24; \ - lsl x22, x22, #12; \ - extr x24, x23, x22, #32; \ - adcs x9, x9, x24; \ - and x0, x0, x9; \ - lsr x24, x4, #20; \ - mul x22, x6, x24; \ - lsr x24, x12, #20; \ - mul x24, x14, x24; \ - add x22, x22, x24; \ - lsr x24, x23, #52; \ - add x22, x22, x24; \ - lsl x23, x23, #12; \ - extr x24, x22, x23, #44; \ - adcs x10, x10, x24; \ - and x0, x0, x10; \ - mul x24, x6, x14; \ - lsr x22, x22, #44; \ - add x24, x24, x22; \ - adc x20, x20, x24; \ - lsr x22, x20, #9; \ - orr x20, x20, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x15, x22; \ - adcs xzr, x0, xzr; \ - adcs xzr, x20, xzr; \ - adcs x15, x15, x22; \ - adcs x16, x16, xzr; \ - adcs x17, x17, xzr; \ - adcs x19, x19, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x20, x20, xzr; \ - and x22, x15, #0x1ff; \ - extr x15, x16, x15, #9; \ - extr x16, x17, x16, #9; \ - stp x15, x16, [P0]; \ - extr x17, x19, x17, #9; \ - extr x19, x7, x19, #9; \ - stp x17, x19, [P0+16]; \ - extr x7, x8, x7, #9; \ - extr x8, x9, x8, #9; \ - stp x7, x8, [P0+32]; \ - extr x9, x10, x9, #9; \ - extr x10, x20, x10, #9; \ - stp x9, x10, [P0+48]; \ - str x22, [P0+64] + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_mul_p521 -// Corresponds exactly to bignum_sqr_p521 +// Call local code equivalent to bignum_sqr_p521 #define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - ldp x4, x5, [P1+16]; \ - ldp x6, x7, [P1+32]; \ - ldp x8, x9, [P1+48]; \ - mul x12, x6, x8; \ - mul x17, x7, x9; \ - umulh x22, x6, x8; \ - subs x23, x6, x7; \ - cneg x23, x23, lo; \ - csetm x11, lo; \ - subs x10, x9, x8; \ - cneg x10, x10, lo; \ - mul x16, x23, x10; \ - umulh x10, x23, x10; \ - cinv x11, x11, lo; \ - eor x16, x16, x11; \ - eor x10, x10, x11; \ - adds x13, x12, x22; \ - adc x22, x22, xzr; \ - umulh x23, x7, x9; \ - adds x13, x13, x17; \ - adcs x22, x22, x23; \ - adc x23, x23, xzr; \ - adds x22, x22, x17; \ - adc x23, x23, xzr; \ - cmn x11, #1; \ - adcs x13, x13, x16; \ - adcs x22, x22, x10; \ - adc x23, x23, x11; \ - adds x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adc x19, xzr, xzr; \ - mul x10, x6, x6; \ - mul x16, x7, x7; \ - mul x21, x6, x7; \ - umulh x11, x6, x6; \ - umulh x17, x7, x7; \ - umulh x20, x6, x7; \ - adds x11, x11, x21; \ - adcs x16, x16, x20; \ - adc x17, x17, xzr; \ - adds x11, x11, x21; \ - adcs x16, x16, x20; \ - adc x17, x17, xzr; \ - adds x12, x12, x16; \ - adcs x13, x13, x17; \ - adcs x22, x22, xzr; \ - adcs x23, x23, xzr; \ - adc x19, x19, xzr; \ - mul x14, x8, x8; \ - mul x16, x9, x9; \ - mul x21, x8, x9; \ - umulh x15, x8, x8; \ 
- umulh x17, x9, x9; \ - umulh x20, x8, x9; \ - adds x15, x15, x21; \ - adcs x16, x16, x20; \ - adc x17, x17, xzr; \ - adds x15, x15, x21; \ - adcs x16, x16, x20; \ - adc x17, x17, xzr; \ - adds x14, x14, x22; \ - adcs x15, x15, x23; \ - adcs x16, x16, x19; \ - adc x17, x17, xzr; \ - ldr x19, [P1+64]; \ - add x23, x19, x19; \ - mul x19, x19, x19; \ - and x21, x2, #0xfffffffffffff; \ - mul x21, x23, x21; \ - extr x20, x3, x2, #52; \ - and x20, x20, #0xfffffffffffff; \ - mul x20, x23, x20; \ - lsr x22, x21, #52; \ - add x20, x20, x22; \ - lsl x21, x21, #12; \ - extr x22, x20, x21, #12; \ - adds x10, x10, x22; \ - extr x21, x4, x3, #40; \ - and x21, x21, #0xfffffffffffff; \ - mul x21, x23, x21; \ - lsr x22, x20, #52; \ - add x21, x21, x22; \ - lsl x20, x20, #12; \ - extr x22, x21, x20, #24; \ - adcs x11, x11, x22; \ - extr x20, x5, x4, #28; \ - and x20, x20, #0xfffffffffffff; \ - mul x20, x23, x20; \ - lsr x22, x21, #52; \ - add x20, x20, x22; \ - lsl x21, x21, #12; \ - extr x22, x20, x21, #36; \ - adcs x12, x12, x22; \ - extr x21, x6, x5, #16; \ - and x21, x21, #0xfffffffffffff; \ - mul x21, x23, x21; \ - lsr x22, x20, #52; \ - add x21, x21, x22; \ - lsl x20, x20, #12; \ - extr x22, x21, x20, #48; \ - adcs x13, x13, x22; \ - lsr x20, x6, #4; \ - and x20, x20, #0xfffffffffffff; \ - mul x20, x23, x20; \ - lsr x22, x21, #52; \ - add x20, x20, x22; \ - lsl x21, x21, #12; \ - extr x24, x20, x21, #60; \ - extr x21, x7, x6, #56; \ - and x21, x21, #0xfffffffffffff; \ - mul x21, x23, x21; \ - lsr x22, x20, #52; \ - add x21, x21, x22; \ - lsl x24, x24, #8; \ - extr x22, x21, x24, #8; \ - adcs x14, x14, x22; \ - extr x20, x8, x7, #44; \ - and x20, x20, #0xfffffffffffff; \ - mul x20, x23, x20; \ - lsr x22, x21, #52; \ - add x20, x20, x22; \ - lsl x21, x21, #12; \ - extr x22, x20, x21, #20; \ - adcs x15, x15, x22; \ - extr x21, x9, x8, #32; \ - and x21, x21, #0xfffffffffffff; \ - mul x21, x23, x21; \ - lsr x22, x20, #52; \ - add x21, x21, x22; \ - lsl x20, x20, #12; \ - extr x22, x21, x20, #32; \ - adcs x16, x16, x22; \ - lsr x20, x9, #20; \ - mul x20, x23, x20; \ - lsr x22, x21, #52; \ - add x20, x20, x22; \ - lsl x21, x21, #12; \ - extr x22, x20, x21, #44; \ - adcs x17, x17, x22; \ - lsr x20, x20, #44; \ - adc x19, x19, x20; \ - extr x21, x11, x10, #9; \ - extr x20, x12, x11, #9; \ - stp x21, x20, [P0]; \ - extr x21, x13, x12, #9; \ - extr x20, x14, x13, #9; \ - stp x21, x20, [P0+16]; \ - extr x21, x15, x14, #9; \ - extr x20, x16, x15, #9; \ - stp x21, x20, [P0+32]; \ - extr x21, x17, x16, #9; \ - extr x20, x19, x17, #9; \ - stp x21, x20, [P0+48]; \ - and x22, x10, #0x1ff; \ - lsr x19, x19, #9; \ - add x22, x22, x19; \ - str x22, [P0+64]; \ - mul x12, x2, x4; \ - mul x17, x3, x5; \ - umulh x22, x2, x4; \ - subs x23, x2, x3; \ - cneg x23, x23, lo; \ - csetm x11, lo; \ - subs x10, x5, x4; \ - cneg x10, x10, lo; \ - mul x16, x23, x10; \ - umulh x10, x23, x10; \ - cinv x11, x11, lo; \ - eor x16, x16, x11; \ - eor x10, x10, x11; \ - adds x13, x12, x22; \ - adc x22, x22, xzr; \ - umulh x23, x3, x5; \ - adds x13, x13, x17; \ - adcs x22, x22, x23; \ - adc x23, x23, xzr; \ - adds x22, x22, x17; \ - adc x23, x23, xzr; \ - cmn x11, #1; \ - adcs x13, x13, x16; \ - adcs x22, x22, x10; \ - adc x23, x23, x11; \ - adds x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adc x19, xzr, xzr; \ - mul x10, x2, x2; \ - mul x16, x3, x3; \ - mul x21, x2, x3; \ - umulh x11, x2, x2; \ - umulh x17, x3, x3; \ - umulh x20, x2, x3; \ - adds x11, x11, x21; \ - adcs x16, x16, x20; \ - adc x17, 
x17, xzr; \ - adds x11, x11, x21; \ - adcs x16, x16, x20; \ - adc x17, x17, xzr; \ - adds x12, x12, x16; \ - adcs x13, x13, x17; \ - adcs x22, x22, xzr; \ - adcs x23, x23, xzr; \ - adc x19, x19, xzr; \ - mul x14, x4, x4; \ - mul x16, x5, x5; \ - mul x21, x4, x5; \ - umulh x15, x4, x4; \ - umulh x17, x5, x5; \ - umulh x20, x4, x5; \ - adds x15, x15, x21; \ - adcs x16, x16, x20; \ - adc x17, x17, xzr; \ - adds x15, x15, x21; \ - adcs x16, x16, x20; \ - adc x17, x17, xzr; \ - adds x14, x14, x22; \ - adcs x15, x15, x23; \ - adcs x16, x16, x19; \ - adc x17, x17, xzr; \ - ldp x21, x20, [P0]; \ - adds x21, x21, x10; \ - adcs x20, x20, x11; \ - stp x21, x20, [P0]; \ - ldp x21, x20, [P0+16]; \ - adcs x21, x21, x12; \ - adcs x20, x20, x13; \ - stp x21, x20, [P0+16]; \ - ldp x21, x20, [P0+32]; \ - adcs x21, x21, x14; \ - adcs x20, x20, x15; \ - stp x21, x20, [P0+32]; \ - ldp x21, x20, [P0+48]; \ - adcs x21, x21, x16; \ - adcs x20, x20, x17; \ - stp x21, x20, [P0+48]; \ - ldr x22, [P0+64]; \ - adc x22, x22, xzr; \ - str x22, [P0+64]; \ - mul x10, x2, x6; \ - mul x14, x3, x7; \ - mul x15, x4, x8; \ - mul x16, x5, x9; \ - umulh x17, x2, x6; \ - adds x14, x14, x17; \ - umulh x17, x3, x7; \ - adcs x15, x15, x17; \ - umulh x17, x4, x8; \ - adcs x16, x16, x17; \ - umulh x17, x5, x9; \ - adc x17, x17, xzr; \ - adds x11, x14, x10; \ - adcs x14, x15, x14; \ - adcs x15, x16, x15; \ - adcs x16, x17, x16; \ - adc x17, xzr, x17; \ - adds x12, x14, x10; \ - adcs x13, x15, x11; \ - adcs x14, x16, x14; \ - adcs x15, x17, x15; \ - adcs x16, xzr, x16; \ - adc x17, xzr, x17; \ - subs x22, x4, x5; \ - cneg x22, x22, lo; \ - csetm x19, lo; \ - subs x20, x9, x8; \ - cneg x20, x20, lo; \ - mul x21, x22, x20; \ - umulh x20, x22, x20; \ - cinv x19, x19, lo; \ - cmn x19, #1; \ - eor x21, x21, x19; \ - adcs x15, x15, x21; \ - eor x20, x20, x19; \ - adcs x16, x16, x20; \ - adc x17, x17, x19; \ - subs x22, x2, x3; \ - cneg x22, x22, lo; \ - csetm x19, lo; \ - subs x20, x7, x6; \ - cneg x20, x20, lo; \ - mul x21, x22, x20; \ - umulh x20, x22, x20; \ - cinv x19, x19, lo; \ - cmn x19, #1; \ - eor x21, x21, x19; \ - adcs x11, x11, x21; \ - eor x20, x20, x19; \ - adcs x12, x12, x20; \ - adcs x13, x13, x19; \ - adcs x14, x14, x19; \ - adcs x15, x15, x19; \ - adcs x16, x16, x19; \ - adc x17, x17, x19; \ - subs x22, x3, x5; \ - cneg x22, x22, lo; \ - csetm x19, lo; \ - subs x20, x9, x7; \ - cneg x20, x20, lo; \ - mul x21, x22, x20; \ - umulh x20, x22, x20; \ - cinv x19, x19, lo; \ - cmn x19, #1; \ - eor x21, x21, x19; \ - adcs x14, x14, x21; \ - eor x20, x20, x19; \ - adcs x15, x15, x20; \ - adcs x16, x16, x19; \ - adc x17, x17, x19; \ - subs x22, x2, x4; \ - cneg x22, x22, lo; \ - csetm x19, lo; \ - subs x20, x8, x6; \ - cneg x20, x20, lo; \ - mul x21, x22, x20; \ - umulh x20, x22, x20; \ - cinv x19, x19, lo; \ - cmn x19, #1; \ - eor x21, x21, x19; \ - adcs x12, x12, x21; \ - eor x20, x20, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x19; \ - adcs x15, x15, x19; \ - adcs x16, x16, x19; \ - adc x17, x17, x19; \ - subs x22, x2, x5; \ - cneg x22, x22, lo; \ - csetm x19, lo; \ - subs x20, x9, x6; \ - cneg x20, x20, lo; \ - mul x21, x22, x20; \ - umulh x20, x22, x20; \ - cinv x19, x19, lo; \ - cmn x19, #1; \ - eor x21, x21, x19; \ - adcs x13, x13, x21; \ - eor x20, x20, x19; \ - adcs x14, x14, x20; \ - adcs x15, x15, x19; \ - adcs x16, x16, x19; \ - adc x17, x17, x19; \ - subs x22, x3, x4; \ - cneg x22, x22, lo; \ - csetm x19, lo; \ - subs x20, x8, x7; \ - cneg x20, x20, lo; \ - mul x21, x22, x20; \ - umulh x20, x22, x20; \ - cinv x19, 
x19, lo; \ - cmn x19, #1; \ - eor x21, x21, x19; \ - adcs x13, x13, x21; \ - eor x20, x20, x19; \ - adcs x14, x14, x20; \ - adcs x15, x15, x19; \ - adcs x16, x16, x19; \ - adc x17, x17, x19; \ - ldp x21, x20, [P0]; \ - extr x2, x15, x14, #8; \ - adds x2, x2, x21; \ - extr x3, x16, x15, #8; \ - adcs x3, x3, x20; \ - ldp x21, x20, [P0+16]; \ - extr x4, x17, x16, #8; \ - adcs x4, x4, x21; \ - and x22, x3, x4; \ - lsr x5, x17, #8; \ - adcs x5, x5, x20; \ - and x22, x22, x5; \ - ldp x21, x20, [P0+32]; \ - lsl x6, x10, #1; \ - adcs x6, x6, x21; \ - and x22, x22, x6; \ - extr x7, x11, x10, #63; \ - adcs x7, x7, x20; \ - and x22, x22, x7; \ - ldp x21, x20, [P0+48]; \ - extr x8, x12, x11, #63; \ - adcs x8, x8, x21; \ - and x22, x22, x8; \ - extr x9, x13, x12, #63; \ - adcs x9, x9, x20; \ - and x22, x22, x9; \ - ldr x21, [P0+64]; \ - extr x10, x14, x13, #63; \ - and x10, x10, #0x1ff; \ - adc x10, x21, x10; \ - lsr x20, x10, #9; \ - orr x10, x10, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x2, x20; \ - adcs xzr, x22, xzr; \ - adcs xzr, x10, xzr; \ - adcs x2, x2, x20; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adc x10, x10, xzr; \ - and x10, x10, #0x1ff; \ - stp x2, x3, [P0]; \ - stp x4, x5, [P0+16]; \ - stp x6, x7, [P0+32]; \ - stp x8, x9, [P0+48]; \ - str x10, [P0+64] + add x0, P0; \ + add x1, P1; \ + bl local_sqr_p521 // Corresponds exactly to bignum_add_p521 @@ -1454,6 +423,7 @@ S2N_BN_SYMBOL(p521_jdouble): stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! sub sp, sp, NSPACE // Move the input arguments to stable places @@ -1515,12 +485,1060 @@ S2N_BN_SYMBOL(p521_jdouble): add sp, sp, NSPACE + ldp x29, x30, [sp], 16 ldp x27, x28, [sp], 16 ldp x25, x26, [sp], 16 ldp x23, x24, [sp], 16 ldp x21, x22, [sp], 16 ldp x19, x20, [sp], 16 + ret + +// Local versions of the two "big" field operations, almost identical to +// bignum_mul_p521 and bignum_sqr_p521 except for avoiding the intial +// register save-restore, and in the case of local_mul_p521, using the +// output buffer as temporary storage, slightly reordering a few loads +// and stores to make it aliasing-proof. 
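As an aside before the routines themselves: the aliasing discipline the comment describes can be sketched in C. The function name, the 2-limb size and the GCC/Clang `unsigned __int128` type are assumptions of this sketch only, not s2n-bignum code; the point is simply that every input word is read before anything is stored through the possibly aliasing output pointer, which is the load/store ordering local_mul_p521 preserves while reusing the output buffer as scratch.

#include <stdint.h>

/* Sketch: z = x * y (mod 2^128), safe even when z aliases x or y,
 * because all loads from x and y complete before any store to z. */
static void mul2_alias_safe(uint64_t z[2],
                            const uint64_t x[2], const uint64_t y[2])
{
    unsigned __int128 p0 = (unsigned __int128)x[0] * y[0];
    unsigned __int128 p1 = (unsigned __int128)x[0] * y[1]
                         + (unsigned __int128)x[1] * y[0]
                         + (uint64_t)(p0 >> 64);
    z[0] = (uint64_t)p0;   /* inputs fully consumed: only now write the output */
    z[1] = (uint64_t)p1;
}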
+ +local_mul_p521: + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + lsl x21, x11, #9 + extr x11, x12, x11, #55 + extr x12, x13, x12, #55 + extr x13, x14, x13, #55 + lsr x14, x14, #55 + ldp x3, x4, [x1, #32] + ldp x5, x6, [x1, #48] + ldp x7, x8, [x2, #32] + ldp x9, x10, [x2, #48] + stp x15, x16, [x0] + stp x17, x19, [x0, #16] + stp x21, x11, [x0, #32] + stp x12, x13, [x0, #48] + str x14, [x0, #64] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + 
cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x23, x22, [x0] + adds x11, x11, x23 + adcs x12, x12, x22 + stp x11, x12, [x0] + ldp x23, x22, [x0, #16] + adcs x13, x13, x23 + adcs x14, x14, x22 + stp x13, x14, [x0, #16] + ldp x23, x22, [x0, #32] + adcs x15, x15, x23 + adcs x16, x16, x22 + stp x15, x16, [x0, #32] + ldp x23, x22, [x0, #48] + adcs x17, x17, x23 + adcs x19, x19, x22 + stp x17, x19, [x0, #48] + ldr x21, [x0, #64] + adc x21, x21, xzr + str x21, [x0, #64] + ldp x23, x22, [x1] + subs x3, x3, x23 + sbcs x4, x4, x22 + ldp x23, x22, [x1, #16] + sbcs x5, x5, x23 + sbcs x6, x6, x22 + csetm x24, lo + ldp x23, x22, [x2] + subs x7, x23, x7 + sbcs x8, x22, x8 + ldp x23, x22, [x2, #16] + sbcs x9, x23, x9 + sbcs x10, x22, x10 + csetm x25, lo + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + eor x7, x7, x25 + subs x7, x7, x25 + eor x8, x8, x25 + sbcs x8, x8, x25 + eor x9, x9, x25 + sbcs x9, x9, x25 + eor x10, x10, x25 + sbc x10, x10, x25 + eor x25, x25, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs 
x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x3, x4, [x0] + ldp x5, x6, [x0, #16] + eor x11, x11, x25 + adds x11, x11, x3 + eor x12, x12, x25 + adcs x12, x12, x4 + eor x13, x13, x25 + adcs x13, x13, x5 + eor x14, x14, x25 + adcs x14, x14, x6 + eor x15, x15, x25 + ldp x7, x8, [x0, #32] + ldp x9, x10, [x0, #48] + ldr x20, [x0, #64] + adcs x15, x15, x7 + eor x16, x16, x25 + adcs x16, x16, x8 + eor x17, x17, x25 + adcs x17, x17, x9 + eor x19, x19, x25 + adcs x19, x19, x10 + adc x21, x20, xzr + adds x15, x15, x3 + adcs x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + and x25, x25, #0x1ff + lsl x24, x11, #9 + orr x24, x24, x25 + adcs x7, x7, x24 + extr x24, x12, x11, #55 + adcs x8, x8, x24 + extr x24, x13, x12, #55 + adcs x9, x9, x24 + extr x24, x14, x13, #55 + adcs x10, x10, x24 + lsr x24, x14, #55 + adc x20, x24, x20 + ldr x6, [x2, #64] + ldp x3, x4, [x1] + and x23, x3, #0xfffffffffffff + mul x23, x6, x23 + ldr x14, [x1, #64] + ldp x11, x12, [x2] + and x24, x11, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + extr x24, x4, x3, #52 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x12, x11, #52 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #12 + adds x15, x15, x24 + ldp x5, x3, [x1, #16] + ldp x13, x11, [x2, #16] + extr x24, x5, x4, #40 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #40 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #24 + adcs x16, x16, x24 + extr x24, x3, x5, #28 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #28 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #36 + adcs x17, x17, x24 + and x26, x16, x17 + ldp x4, x5, [x1, #32] + ldp x12, x13, [x2, #32] + extr x24, x4, x3, #16 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #16 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 
+ lsl x21, x21, #48 + add x23, x23, x21 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #48 + adcs x19, x19, x24 + and x26, x26, x19 + lsr x24, x4, #4 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + lsr x24, x12, #4 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x25, x22, x23, #60 + extr x24, x5, x4, #56 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #56 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x25, x25, #8 + extr x24, x23, x25, #8 + adcs x7, x7, x24 + and x26, x26, x7 + ldp x3, x4, [x1, #48] + ldp x11, x12, [x2, #48] + extr x24, x3, x5, #44 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #44 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #20 + adcs x8, x8, x24 + and x26, x26, x8 + extr x24, x4, x3, #32 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #32 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #32 + adcs x9, x9, x24 + and x26, x26, x9 + lsr x24, x4, #20 + mul x22, x6, x24 + lsr x24, x12, #20 + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #44 + adcs x10, x10, x24 + and x26, x26, x10 + mul x24, x6, x14 + lsr x22, x22, #44 + add x24, x24, x22 + adc x20, x20, x24 + lsr x22, x20, #9 + orr x20, x20, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x15, x22 + adcs xzr, x26, xzr + adcs xzr, x20, xzr + adcs x15, x15, x22 + adcs x16, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x20, x20, xzr + and x22, x15, #0x1ff + extr x15, x16, x15, #9 + extr x16, x17, x16, #9 + stp x15, x16, [x0] + extr x17, x19, x17, #9 + extr x19, x7, x19, #9 + stp x17, x19, [x0, #16] + extr x7, x8, x7, #9 + extr x8, x9, x8, #9 + stp x7, x8, [x0, #32] + extr x9, x10, x9, #9 + extr x10, x20, x10, #9 + stp x9, x10, [x0, #48] + str x22, [x0, #64] + ret +local_sqr_p521: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] + ldp x8, x9, [x1, #48] + mul x12, x6, x8 + mul x17, x7, x9 + umulh x22, x6, x8 + subs x23, x6, x7 + cneg x23, x23, cc + csetm x11, cc + subs x10, x9, x8 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x7, x9 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x6, x6 + mul x16, x7, x7 + mul x21, x6, x7 + umulh x11, x6, x6 + umulh x17, x7, x7 + umulh x20, x6, x7 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x8, x8 + mul x16, x9, x9 + mul x21, x8, x9 + umulh x15, x8, x8 + umulh x17, x9, x9 + umulh x20, x8, x9 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, 
x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldr x19, [x1, #64] + add x23, x19, x19 + mul x19, x19, x19 + and x21, x2, #0xfffffffffffff + mul x21, x23, x21 + extr x20, x3, x2, #52 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #12 + adds x10, x10, x22 + extr x21, x4, x3, #40 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #24 + adcs x11, x11, x22 + extr x20, x5, x4, #28 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #36 + adcs x12, x12, x22 + extr x21, x6, x5, #16 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #48 + adcs x13, x13, x22 + lsr x20, x6, #4 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x24, x20, x21, #60 + extr x21, x7, x6, #56 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x24, x24, #8 + extr x22, x21, x24, #8 + adcs x14, x14, x22 + extr x20, x8, x7, #44 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #20 + adcs x15, x15, x22 + extr x21, x9, x8, #32 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #32 + adcs x16, x16, x22 + lsr x20, x9, #20 + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #44 + adcs x17, x17, x22 + lsr x20, x20, #44 + adc x19, x19, x20 + extr x21, x11, x10, #9 + extr x20, x12, x11, #9 + stp x21, x20, [x0] + extr x21, x13, x12, #9 + extr x20, x14, x13, #9 + stp x21, x20, [x0, #16] + extr x21, x15, x14, #9 + extr x20, x16, x15, #9 + stp x21, x20, [x0, #32] + extr x21, x17, x16, #9 + extr x20, x19, x17, #9 + stp x21, x20, [x0, #48] + and x22, x10, #0x1ff + lsr x19, x19, #9 + add x22, x22, x19 + str x22, [x0, #64] + mul x12, x2, x4 + mul x17, x3, x5 + umulh x22, x2, x4 + subs x23, x2, x3 + cneg x23, x23, cc + csetm x11, cc + subs x10, x5, x4 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x3, x5 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x2, x2 + mul x16, x3, x3 + mul x21, x2, x3 + umulh x11, x2, x2 + umulh x17, x3, x3 + umulh x20, x2, x3 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x4, x4 + mul x16, x5, x5 + mul x21, x4, x5 + umulh x15, x4, x4 + umulh x17, x5, x5 + umulh x20, x4, x5 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldp x21, x20, [x0] + adds x21, 
x21, x10 + adcs x20, x20, x11 + stp x21, x20, [x0] + ldp x21, x20, [x0, #16] + adcs x21, x21, x12 + adcs x20, x20, x13 + stp x21, x20, [x0, #16] + ldp x21, x20, [x0, #32] + adcs x21, x21, x14 + adcs x20, x20, x15 + stp x21, x20, [x0, #32] + ldp x21, x20, [x0, #48] + adcs x21, x21, x16 + adcs x20, x20, x17 + stp x21, x20, [x0, #48] + ldr x22, [x0, #64] + adc x22, x22, xzr + str x22, [x0, #64] + mul x10, x2, x6 + mul x14, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + umulh x17, x4, x8 + adcs x16, x16, x17 + umulh x17, x5, x9 + adc x17, x17, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x8 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc + csetm x19, cc + subs x20, x7, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + ldp x21, x20, [x0] + extr x2, x15, x14, #8 + adds x2, x2, x21 + extr x3, x16, x15, #8 + adcs x3, x3, x20 + ldp x21, x20, [x0, #16] + extr x4, x17, x16, #8 + adcs x4, x4, x21 + and x22, x3, x4 + lsr x5, x17, #8 + adcs x5, x5, x20 + and x22, x22, x5 + ldp x21, x20, [x0, #32] + lsl x6, x10, #1 + adcs x6, x6, x21 + and x22, x22, x6 + extr x7, x11, x10, #63 + adcs x7, x7, x20 + and x22, x22, x7 + ldp x21, x20, [x0, #48] + extr x8, x12, x11, #63 + adcs x8, x8, x21 + and x22, x22, x8 + extr x9, x13, x12, #63 + adcs x9, x9, x20 + and x22, x22, x9 + ldr x21, [x0, #64] + extr x10, x14, x13, #63 + and x10, x10, #0x1ff + adc x10, x21, x10 + lsr x20, x10, #9 + orr x10, x10, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x2, x20 + adcs xzr, x22, xzr + adcs xzr, x10, xzr + adcs x2, x2, x20 + adcs x3, 
x3, xzr + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + and x10, x10, #0x1ff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + str x10, [x0, #64] ret #if defined(__linux__) && defined(__ELF__) From ee7f834b16d3be2193406a17e39e834086763a02 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 19 Apr 2024 23:46:40 -0700 Subject: [PATCH 18/24] Liberalize aliasing requirements for Weierstrass point additions This is done simply by accumulating the final values first in local stack variables (re-using fields already used for earlier intermediates, so not changing total stack usage) and then copying these values to the output buffers right at the end. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/f06ec79a1ec40ae7c88a3c5fc3d5cb0293ca33c8 --- arm/p384/p384_montjadd.S | 36 +++++++++++++++++++++---- arm/p384/p384_montjmixadd.S | 36 +++++++++++++++++++++---- arm/p521/p521_jadd.S | 48 +++++++++++++++++++++++++++++---- arm/p521/p521_jmixadd.S | 48 +++++++++++++++++++++++++++++---- x86_att/p384/p384_montjadd.S | 33 +++++++++++++++++++---- x86_att/p384/p384_montjmixadd.S | 33 +++++++++++++++++++---- x86_att/p521/p521_jadd.S | 39 +++++++++++++++++++++++---- x86_att/p521/p521_jmixadd.S | 39 +++++++++++++++++++++++---- 8 files changed, 272 insertions(+), 40 deletions(-) diff --git a/arm/p384/p384_montjadd.S b/arm/p384/p384_montjadd.S index 9c0e1ecb99..dc5893f0f2 100644 --- a/arm/p384/p384_montjadd.S +++ b/arm/p384/p384_montjadd.S @@ -49,6 +49,7 @@ #define z1sq sp, #(NUMSIZE*0) #define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) #define yd sp, #(NUMSIZE*1) #define y2a sp, #(NUMSIZE*1) @@ -62,9 +63,11 @@ #define t2 sp, #(NUMSIZE*4) #define x1a sp, #(NUMSIZE*4) #define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) #define xd sp, #(NUMSIZE*5) #define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) #define y1a sp, #(NUMSIZE*6) @@ -850,20 +853,43 @@ S2N_BN_SYMBOL(p384_montjadd): montmul_p384(zzx1,zz,x1a) montmul_p384(zzx2,zz,x2a) - sub_p384(x_3,ww,zzx1) + sub_p384(resx,ww,zzx1) sub_p384(t1,zzx2,zzx1) montmul_p384(xd,xd,z_1) - sub_p384(x_3,x_3,zzx2) + sub_p384(resx,resx,zzx2) - sub_p384(t2,zzx1,x_3) + sub_p384(t2,zzx1,resx) montmul_p384(t1,t1,y1a) - montmul_p384(z_3,xd,z_2) + montmul_p384(resz,xd,z_2) montmul_p384(t2,yd,t2) - sub_p384(y_3,t2,t1) + sub_p384(resy,t2,t1) + +// Copy from staging area to actual outputs + + ldp x0, x1, [resx] + ldp x2, x3, [resx+16] + ldp x4, x5, [resx+32] + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + + ldp x0, x1, [resy] + ldp x2, x3, [resy+16] + ldp x4, x5, [resy+32] + stp x0, x1, [y_3] + stp x2, x3, [y_3+16] + stp x4, x5, [y_3+32] + + ldp x0, x1, [resz] + ldp x2, x3, [resz+16] + ldp x4, x5, [resz+32] + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] // Restore stack and registers diff --git a/arm/p384/p384_montjmixadd.S b/arm/p384/p384_montjmixadd.S index 1b0165ab8c..7082533ef0 100644 --- a/arm/p384/p384_montjmixadd.S +++ b/arm/p384/p384_montjmixadd.S @@ -50,6 +50,7 @@ #define zp2 sp, #(NUMSIZE*0) #define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) #define yd sp, #(NUMSIZE*1) #define y2a sp, #(NUMSIZE*1) @@ -62,8 +63,10 @@ #define t2 sp, #(NUMSIZE*4) #define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) #define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) #define NSPACE (NUMSIZE*6) @@ -842,19 +845,42 @@ S2N_BN_SYMBOL(p384_montjmixadd): 
montmul_p384(zzx1,zz,x_1) montmul_p384(zzx2,zz,x2a) - sub_p384(x_3,ww,zzx1) + sub_p384(resx,ww,zzx1) sub_p384(t1,zzx2,zzx1) - montmul_p384(z_3,xd,z_1) + montmul_p384(resz,xd,z_1) - sub_p384(x_3,x_3,zzx2) + sub_p384(resx,resx,zzx2) - sub_p384(t2,zzx1,x_3) + sub_p384(t2,zzx1,resx) montmul_p384(t1,t1,y_1) montmul_p384(t2,yd,t2) - sub_p384(y_3,t2,t1) + sub_p384(resy,t2,t1) + +// Copy from staging area to actual outputs + + ldp x0, x1, [resx] + ldp x2, x3, [resx+16] + ldp x4, x5, [resx+32] + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + + ldp x0, x1, [resy] + ldp x2, x3, [resy+16] + ldp x4, x5, [resy+32] + stp x0, x1, [y_3] + stp x2, x3, [y_3+16] + stp x4, x5, [y_3+32] + + ldp x0, x1, [resz] + ldp x2, x3, [resz+16] + ldp x4, x5, [resz+32] + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] // Restore stack and registers diff --git a/arm/p521/p521_jadd.S b/arm/p521/p521_jadd.S index 1d6b196c8c..0aa55a2385 100644 --- a/arm/p521/p521_jadd.S +++ b/arm/p521/p521_jadd.S @@ -53,6 +53,7 @@ #define z1sq sp, #(NUMSIZE*0) #define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) #define yd sp, #(NUMSIZE*1) #define y2a sp, #(NUMSIZE*1) @@ -66,9 +67,11 @@ #define t2 sp, #(NUMSIZE*4) #define x1a sp, #(NUMSIZE*4) #define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) #define xd sp, #(NUMSIZE*5) #define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) #define y1a sp, #(NUMSIZE*6) @@ -764,20 +767,55 @@ S2N_BN_SYMBOL(p521_jadd): mul_p521(zzx1,zz,x1a) mul_p521(zzx2,zz,x2a) - sub_p521(x_3,ww,zzx1) + sub_p521(resx,ww,zzx1) sub_p521(t1,zzx2,zzx1) mul_p521(xd,xd,z_1) - sub_p521(x_3,x_3,zzx2) + sub_p521(resx,resx,zzx2) - sub_p521(t2,zzx1,x_3) + sub_p521(t2,zzx1,resx) mul_p521(t1,t1,y1a) - mul_p521(z_3,xd,z_2) + mul_p521(resz,xd,z_2) mul_p521(t2,yd,t2) - sub_p521(y_3,t2,t1) + sub_p521(resy,t2,t1) + +// Copy from staging area to actual outputs + + ldp x0, x1, [resx] + ldp x2, x3, [resx+16] + ldp x4, x5, [resx+32] + ldp x6, x7, [resx+48] + ldr x8, [resx+64] + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + + ldp x0, x1, [resy] + ldp x2, x3, [resy+16] + ldp x4, x5, [resy+32] + ldp x6, x7, [resy+48] + ldr x8, [resy+64] + stp x0, x1, [y_3] + stp x2, x3, [y_3+16] + stp x4, x5, [y_3+32] + stp x6, x7, [y_3+48] + str x8, [y_3+64] + + ldp x0, x1, [resz] + ldp x2, x3, [resz+16] + ldp x4, x5, [resz+32] + ldp x6, x7, [resz+48] + ldr x8, [resz+64] + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] // Restore stack and registers diff --git a/arm/p521/p521_jmixadd.S b/arm/p521/p521_jmixadd.S index c9b62a9aa1..5bc5a0c5af 100644 --- a/arm/p521/p521_jmixadd.S +++ b/arm/p521/p521_jmixadd.S @@ -54,6 +54,7 @@ #define zp2 sp, #(NUMSIZE*0) #define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) #define yd sp, #(NUMSIZE*1) #define y2a sp, #(NUMSIZE*1) @@ -66,8 +67,10 @@ #define t2 sp, #(NUMSIZE*4) #define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) #define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) #define NSPACE (NUMSIZE*6) @@ -754,19 +757,54 @@ S2N_BN_SYMBOL(p521_jmixadd): mul_p521(zzx1,zz,x_1) mul_p521(zzx2,zz,x2a) - sub_p521(x_3,ww,zzx1) + sub_p521(resx,ww,zzx1) sub_p521(t1,zzx2,zzx1) - mul_p521(z_3,xd,z_1) + mul_p521(resz,xd,z_1) - sub_p521(x_3,x_3,zzx2) + sub_p521(resx,resx,zzx2) - sub_p521(t2,zzx1,x_3) + sub_p521(t2,zzx1,resx) mul_p521(t1,t1,y_1) mul_p521(t2,yd,t2) - sub_p521(y_3,t2,t1) + sub_p521(resy,t2,t1) + +// Copy from staging area to actual outputs + 
+ ldp x0, x1, [resx] + ldp x2, x3, [resx+16] + ldp x4, x5, [resx+32] + ldp x6, x7, [resx+48] + ldr x8, [resx+64] + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + + ldp x0, x1, [resy] + ldp x2, x3, [resy+16] + ldp x4, x5, [resy+32] + ldp x6, x7, [resy+48] + ldr x8, [resy+64] + stp x0, x1, [y_3] + stp x2, x3, [y_3+16] + stp x4, x5, [y_3+32] + stp x6, x7, [y_3+48] + str x8, [y_3+64] + + ldp x0, x1, [resz] + ldp x2, x3, [resz+16] + ldp x4, x5, [resz+32] + ldp x6, x7, [resz+48] + ldr x8, [resz+64] + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] // Restore stack and registers diff --git a/x86_att/p384/p384_montjadd.S b/x86_att/p384/p384_montjadd.S index 27b58bfc14..2312ec8edd 100644 --- a/x86_att/p384/p384_montjadd.S +++ b/x86_att/p384/p384_montjadd.S @@ -52,6 +52,7 @@ #define z1sq (NUMSIZE*0)(%rsp) #define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) #define yd (NUMSIZE*1)(%rsp) #define y2a (NUMSIZE*1)(%rsp) @@ -65,9 +66,11 @@ #define t2 (NUMSIZE*4)(%rsp) #define x1a (NUMSIZE*4)(%rsp) #define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) #define xd (NUMSIZE*5)(%rsp) #define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) #define y1a (NUMSIZE*6)(%rsp) @@ -843,6 +846,20 @@ sbbq $0x0, %r11 ; \ movq %r11, 0x28+P0 +#define copy_p384(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 + S2N_BN_SYMBOL(p384_montjadd): #if WINDOWS_ABI @@ -900,27 +917,33 @@ S2N_BN_SYMBOL(p384_montjadd): montmul_p384(zzx2,zz,x2a) movq input_z, %rdi - sub_p384(x_3,ww,zzx1) + sub_p384(resx,ww,zzx1) sub_p384(t1,zzx2,zzx1) movq input_x, %rsi montmul_p384(xd,xd,z_1) movq input_z, %rdi - sub_p384(x_3,x_3,zzx2) + sub_p384(resx,resx,zzx2) movq input_z, %rdi - sub_p384(t2,zzx1,x_3) + sub_p384(t2,zzx1,resx) montmul_p384(t1,t1,y1a) movq input_z, %rdi movq input_y, %rcx - montmul_p384(z_3,xd,z_2) + montmul_p384(resz,xd,z_2) montmul_p384(t2,yd,t2) movq input_z, %rdi - sub_p384(y_3,t2,t1) + sub_p384(resy,t2,t1) + +// Copy from staging area to actual outputs + + copy_p384(x_3,resx) + copy_p384(y_3,resy) + copy_p384(z_3,resz) // Restore stack and registers diff --git a/x86_att/p384/p384_montjmixadd.S b/x86_att/p384/p384_montjmixadd.S index 0d456464b9..91a90ddd59 100644 --- a/x86_att/p384/p384_montjmixadd.S +++ b/x86_att/p384/p384_montjmixadd.S @@ -48,6 +48,7 @@ #define zp2 (NUMSIZE*0)(%rsp) #define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) #define yd (NUMSIZE*1)(%rsp) #define y2a (NUMSIZE*1)(%rsp) @@ -60,8 +61,10 @@ #define t2 (NUMSIZE*4)(%rsp) #define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) #define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) // Temporaries for the actual input pointers @@ -835,6 +838,20 @@ sbbq $0x0, %r11 ; \ movq %r11, 0x28+P0 +#define copy_p384(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 + S2N_BN_SYMBOL(p384_montjmixadd): #if WINDOWS_ABI @@ -888,25 +905,31 @@ S2N_BN_SYMBOL(p384_montjmixadd): montmul_p384(zzx2,zz,x2a) movq input_z, %rdi - sub_p384(x_3,ww,zzx1) + sub_p384(resx,ww,zzx1) 
sub_p384(t1,zzx2,zzx1) movq input_z, %rdi movq input_x, %rsi - montmul_p384(z_3,xd,z_1) + montmul_p384(resz,xd,z_1) movq input_z, %rdi - sub_p384(x_3,x_3,zzx2) + sub_p384(resx,resx,zzx2) movq input_z, %rdi - sub_p384(t2,zzx1,x_3) + sub_p384(t2,zzx1,resx) movq input_x, %rsi montmul_p384(t1,t1,y_1) montmul_p384(t2,yd,t2) movq input_z, %rdi - sub_p384(y_3,t2,t1) + sub_p384(resy,t2,t1) + +// Copy from staging area to actual outputs + + copy_p384(x_3,resx) + copy_p384(y_3,resy) + copy_p384(z_3,resz) // Restore stack and registers diff --git a/x86_att/p521/p521_jadd.S b/x86_att/p521/p521_jadd.S index 807a7c5472..33ef178382 100644 --- a/x86_att/p521/p521_jadd.S +++ b/x86_att/p521/p521_jadd.S @@ -55,6 +55,7 @@ #define z1sq (NUMSIZE*0)(%rsp) #define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) #define yd (NUMSIZE*1)(%rsp) #define y2a (NUMSIZE*1)(%rsp) @@ -68,9 +69,11 @@ #define t2 (NUMSIZE*4)(%rsp) #define x1a (NUMSIZE*4)(%rsp) #define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) #define xd (NUMSIZE*5)(%rsp) #define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) #define y1a (NUMSIZE*6)(%rsp) @@ -670,6 +673,26 @@ andq $0x1ff, %r14 ; \ movq %r14, 0x40+P0 +#define copy_p521(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+P1, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+P1, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+P1, %rax ; \ + movq %rax, 64+P0 + S2N_BN_SYMBOL(p521_jadd): #if WINDOWS_ABI @@ -717,20 +740,26 @@ S2N_BN_SYMBOL(p521_jadd): mul_p521(zzx1,zz,x1a) mul_p521(zzx2,zz,x2a) - sub_p521(x_3,ww,zzx1) + sub_p521(resx,ww,zzx1) sub_p521(t1,zzx2,zzx1) mul_p521(xd,xd,z_1) - sub_p521(x_3,x_3,zzx2) + sub_p521(resx,resx,zzx2) - sub_p521(t2,zzx1,x_3) + sub_p521(t2,zzx1,resx) mul_p521(t1,t1,y1a) - mul_p521(z_3,xd,z_2) + mul_p521(resz,xd,z_2) mul_p521(t2,yd,t2) - sub_p521(y_3,t2,t1) + sub_p521(resy,t2,t1) + +// Copy from staging area to actual outputs + + copy_p521(x_3,resx) + copy_p521(y_3,resy) + copy_p521(z_3,resz) // Restore stack and registers diff --git a/x86_att/p521/p521_jmixadd.S b/x86_att/p521/p521_jmixadd.S index 702b63f560..be81c38f84 100644 --- a/x86_att/p521/p521_jmixadd.S +++ b/x86_att/p521/p521_jmixadd.S @@ -56,6 +56,7 @@ #define zp2 (NUMSIZE*0)(%rsp) #define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) #define yd (NUMSIZE*1)(%rsp) #define y2a (NUMSIZE*1)(%rsp) @@ -68,8 +69,10 @@ #define t2 (NUMSIZE*4)(%rsp) #define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) #define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) #define tmp (NUMSIZE*6)(%rsp) @@ -667,6 +670,26 @@ andq $0x1ff, %r14 ; \ movq %r14, 0x40+P0 +#define copy_p521(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+P1, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+P1, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+P1, %rax ; \ + movq %rax, 64+P0 + S2N_BN_SYMBOL(p521_jmixadd): #if WINDOWS_ABI @@ -709,19 +732,25 @@ S2N_BN_SYMBOL(p521_jmixadd): mul_p521(zzx1,zz,x_1) mul_p521(zzx2,zz,x2a) - sub_p521(x_3,ww,zzx1) + sub_p521(resx,ww,zzx1) sub_p521(t1,zzx2,zzx1) - mul_p521(z_3,xd,z_1) + mul_p521(resz,xd,z_1) - sub_p521(x_3,x_3,zzx2) + 
sub_p521(resx,resx,zzx2) - sub_p521(t2,zzx1,x_3) + sub_p521(t2,zzx1,resx) mul_p521(t1,t1,y_1) mul_p521(t2,yd,t2) - sub_p521(y_3,t2,t1) + sub_p521(resy,t2,t1) + +// Copy from staging area to actual outputs + + copy_p521(x_3,resx) + copy_p521(y_3,resy) + copy_p521(z_3,resz) // Restore stack and registers From e34ecc159df831b7f2970464850436765ae65678 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 25 Apr 2024 08:04:06 -0700 Subject: [PATCH 19/24] Handle zero inputs in Weierstrass mixed additions The code now handles specially the case where P1 = (x,y,z) is the point at infinity, i.e. has z = 0. It then returns the other point P2 augmented (since that is in affine coordinates, this being mixed addition) with z = 1 or its Montgomery equivalent to give the more desirable result 0 + P2 = P2. The selection is constant-time as usual with a single code path. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/72ccfda537acac4e7d242d2a283a84094a00ba55 --- arm/p384/p384_montjmixadd.S | 73 +++++++++++++++++----- arm/p521/p521_jmixadd.S | 83 +++++++++++++++++++++---- x86_att/p384/p384_montjmixadd.S | 93 ++++++++++++++++++++++------ x86_att/p521/p521_jmixadd.S | 105 ++++++++++++++++++++++++++++++-- 4 files changed, 305 insertions(+), 49 deletions(-) diff --git a/arm/p384/p384_montjmixadd.S b/arm/p384/p384_montjmixadd.S index 7082533ef0..faf9fd65a9 100644 --- a/arm/p384/p384_montjmixadd.S +++ b/arm/p384/p384_montjmixadd.S @@ -859,28 +859,71 @@ S2N_BN_SYMBOL(p384_montjmixadd): sub_p384(resy,t2,t1) -// Copy from staging area to actual outputs +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + orr x6, x0, x1 + orr x7, x2, x3 + orr x8, x4, x5 + orr x6, x6, x7 + orr x6, x6, x8 + cmp x6, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. 
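A C model of the selection that follows may help (illustrative sketch only, assuming little-endian 64-bit limbs; the helper names are invented here and are not library API). The replacement z coordinate is 2^384 mod p_384 = 2^384 - p_384, whose limbs are exactly the immediates selected by the csel sequence below, and the choice is driven by an all-ones/all-zero mask derived from the orr/cmp test on z_1 above.

#include <stdint.h>

/* 2^384 mod p_384, i.e. 1 in Montgomery form */
static const uint64_t mont_one_p384[6] =
    { 0xffffffff00000001ULL, 0x00000000ffffffffULL, 1, 0, 0, 0 };

/* All-ones when some limb of z is nonzero, all-zero when z = 0 */
static uint64_t nonzero_mask6(const uint64_t z[6])
{
    uint64_t t = z[0] | z[1] | z[2] | z[3] | z[4] | z[5];
    return (uint64_t)0 - ((t | (0 - t)) >> 63);
}

/* out = mask ? a : b, limb by limb, with no data-dependent branch */
static void mux6(uint64_t out[6], uint64_t mask,
                 const uint64_t a[6], const uint64_t b[6])
{
    for (int i = 0; i < 6; i++)
        out[i] = (a[i] & mask) | (b[i] & ~mask);
}

Usage corresponding to the csel sequence: with m = nonzero_mask6(z1), the outputs are mux6(x3, m, resx, x2), mux6(y3, m, resy, y2) and mux6(z3, m, resz, mont_one_p384).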
ldp x0, x1, [resx] + ldp x19, x20, [x_2] + csel x0, x0, x19, ne + csel x1, x1, x20, ne ldp x2, x3, [resx+16] + ldp x19, x20, [x_2+16] + csel x2, x2, x19, ne + csel x3, x3, x20, ne ldp x4, x5, [resx+32] + ldp x19, x20, [x_2+32] + csel x4, x4, x19, ne + csel x5, x5, x20, ne + + ldp x6, x7, [resy] + ldp x19, x20, [y_2] + csel x6, x6, x19, ne + csel x7, x7, x20, ne + ldp x8, x9, [resy+16] + ldp x19, x20, [y_2+16] + csel x8, x8, x19, ne + csel x9, x9, x20, ne + ldp x10, x11, [resy+32] + ldp x19, x20, [y_2+32] + csel x10, x10, x19, ne + csel x11, x11, x20, ne + + ldp x12, x13, [resz] + mov x19, #0xffffffff00000001 + mov x20, #0x00000000ffffffff + csel x12, x12, x19, ne + csel x13, x13, x20, ne + ldp x14, x15, [resz+16] + mov x19, #1 + csel x14, x14, x19, ne + csel x15, x15, xzr, ne + ldp x16, x17, [resz+32] + csel x16, x16, xzr, ne + csel x17, x17, xzr, ne + stp x0, x1, [x_3] stp x2, x3, [x_3+16] stp x4, x5, [x_3+32] - - ldp x0, x1, [resy] - ldp x2, x3, [resy+16] - ldp x4, x5, [resy+32] - stp x0, x1, [y_3] - stp x2, x3, [y_3+16] - stp x4, x5, [y_3+32] - - ldp x0, x1, [resz] - ldp x2, x3, [resz+16] - ldp x4, x5, [resz+32] - stp x0, x1, [z_3] - stp x2, x3, [z_3+16] - stp x4, x5, [z_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] // Restore stack and registers diff --git a/arm/p521/p521_jmixadd.S b/arm/p521/p521_jmixadd.S index 5bc5a0c5af..082f77f809 100644 --- a/arm/p521/p521_jmixadd.S +++ b/arm/p521/p521_jmixadd.S @@ -771,35 +771,94 @@ S2N_BN_SYMBOL(p521_jmixadd): sub_p521(resy,t2,t1) -// Copy from staging area to actual outputs +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + orr x0, x0, x1 + ldp x2, x3, [z_1+16] + orr x2, x2, x3 + ldp x4, x5, [z_1+32] + orr x4, x4, x5 + ldp x6, x7, [z_1+48] + orr x6, x6, x7 + ldr x8, [z_1+64] + orr x0, x0, x2 + orr x4, x4, x6 + orr x0, x0, x4 + orr x0, x0, x8 + cmp x0, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. 
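The P-521 code is not in Montgomery form, so here the replacement z coordinate is the literal 1 = {1, 0, ..., 0}; only the zero test widens to nine limbs. A sketch of that test, mirroring the pairwise orr reduction above (helper name and limb layout are assumptions of the sketch):

#include <stdint.h>

/* Returns 1 iff every limb of z is zero; the OR tree matches the
 * pairwise orr reduction in the assembly, which folds the final
 * comparison into the flags rather than a boolean return value. */
static int is_zero9(const uint64_t z[9])
{
    uint64_t t = (z[0] | z[1]) | (z[2] | z[3]);
    t |= (z[4] | z[5]) | (z[6] | z[7]);
    t |= z[8];
    return t == 0;
}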
ldp x0, x1, [resx] + ldp x20, x21, [x_2] + csel x0, x0, x20, ne + csel x1, x1, x21, ne ldp x2, x3, [resx+16] + ldp x20, x21, [x_2+16] + csel x2, x2, x20, ne + csel x3, x3, x21, ne ldp x4, x5, [resx+32] + ldp x20, x21, [x_2+32] + csel x4, x4, x20, ne + csel x5, x5, x21, ne ldp x6, x7, [resx+48] + ldp x20, x21, [x_2+48] + csel x6, x6, x20, ne + csel x7, x7, x21, ne ldr x8, [resx+64] + ldr x20, [x_2+64] + csel x8, x8, x20, ne + + ldp x10, x11, [resy] + ldp x20, x21, [y_2] + csel x10, x10, x20, ne + csel x11, x11, x21, ne + ldp x12, x13, [resy+16] + ldp x20, x21, [y_2+16] + csel x12, x12, x20, ne + csel x13, x13, x21, ne + ldp x14, x15, [resy+32] + ldp x20, x21, [y_2+32] + csel x14, x14, x20, ne + csel x15, x15, x21, ne + ldp x16, x17, [resy+48] + ldp x20, x21, [y_2+48] + csel x16, x16, x20, ne + csel x17, x17, x21, ne + ldr x19, [resy+64] + ldr x20, [y_2+64] + csel x19, x19, x20, ne + stp x0, x1, [x_3] stp x2, x3, [x_3+16] stp x4, x5, [x_3+32] stp x6, x7, [x_3+48] str x8, [x_3+64] - - ldp x0, x1, [resy] - ldp x2, x3, [resy+16] - ldp x4, x5, [resy+32] - ldp x6, x7, [resy+48] - ldr x8, [resy+64] - stp x0, x1, [y_3] - stp x2, x3, [y_3+16] - stp x4, x5, [y_3+32] - stp x6, x7, [y_3+48] - str x8, [y_3+64] + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] ldp x0, x1, [resz] + mov x20, #1 + csel x0, x0, x20, ne + csel x1, x1, xzr, ne ldp x2, x3, [resz+16] + csel x2, x2, xzr, ne + csel x3, x3, xzr, ne ldp x4, x5, [resz+32] + csel x4, x4, xzr, ne + csel x5, x5, xzr, ne ldp x6, x7, [resz+48] + csel x6, x6, xzr, ne + csel x7, x7, xzr, ne ldr x8, [resz+64] + csel x8, x8, xzr, ne + stp x0, x1, [z_3] stp x2, x3, [z_3+16] stp x4, x5, [z_3+32] diff --git a/x86_att/p384/p384_montjmixadd.S b/x86_att/p384/p384_montjmixadd.S index 91a90ddd59..0126f747ae 100644 --- a/x86_att/p384/p384_montjmixadd.S +++ b/x86_att/p384/p384_montjmixadd.S @@ -838,19 +838,52 @@ sbbq $0x0, %r11 ; \ movq %r11, 0x28+P0 -#define copy_p384(P0,P1) \ - movq P1, %rax ; \ - movq %rax, P0 ; \ - movq 8+P1, %rax ; \ - movq %rax, 8+P0 ; \ - movq 16+P1, %rax ; \ - movq %rax, 16+P0 ; \ - movq 24+P1, %rax ; \ - movq %rax, 24+P0 ; \ - movq 32+P1, %rax ; \ - movq %rax, 32+P0 ; \ - movq 40+P1, %rax ; \ - movq %rax, 40+P0 +// Additional macros to help with final multiplexing + +#define testzero6(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq 32+P, %rax ; \ + orq 40+P, %rdx ; \ + orq %rdx, %rax + +#define mux6(r0,r1,r2,r3,r4,r5,PNE,PEQ) \ + movq PEQ, %rax ; \ + movq PNE, r0 ; \ + cmovzq %rax, r0 ; \ + movq 8+PEQ, %rax ; \ + movq 8+PNE, r1 ; \ + cmovzq %rax, r1 ; \ + movq 16+PEQ, %rax ; \ + movq 16+PNE, r2 ; \ + cmovzq %rax, r2 ; \ + movq 24+PEQ, %rax ; \ + movq 24+PNE, r3 ; \ + cmovzq %rax, r3 ; \ + movq 32+PEQ, %rax ; \ + movq 32+PNE, r4 ; \ + cmovzq %rax, r4 ; \ + movq 40+PEQ, %rax ; \ + movq 40+PNE, r5 ; \ + cmovzq %rax, r5 + +#define load6(r0,r1,r2,r3,r4,r5,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 + +#define store6(P,r0,r1,r2,r3,r4,r5) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P S2N_BN_SYMBOL(p384_montjmixadd): @@ -925,11 +958,37 @@ S2N_BN_SYMBOL(p384_montjmixadd): movq input_z, %rdi sub_p384(resy,t2,t1) -// Copy from staging area to actual outputs +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) - copy_p384(x_3,resx) - copy_p384(y_3,resy) - copy_p384(z_3,resz) + movq 
input_x, %rsi + testzero6(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. + + movq input_y, %rcx + mux6(%r8,%r9,%r10,%r11,%rbx,%rbp,resx,x_2) + mux6(%r12,%r13,%r14,%r15,%rdx,%rcx,resy,y_2) + + movq input_z, %rdi + store6(x_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + store6(y_3,%r12,%r13,%r14,%r15,%rdx,%rcx) + + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + movq $0xffffffff00000001, %rax + cmovzq %rax, %r8 + movl $0x00000000ffffffff, %eax + cmovzq %rax, %r9 + movq $1, %rax + cmovzq %rax, %r10 + movl $0, %eax + cmovzq %rax, %r11 + cmovzq %rax, %rbx + cmovzq %rax, %rbp + + store6(z_3,%r8,%r9,%r10,%r11,%rbx,%rbp) // Restore stack and registers diff --git a/x86_att/p521/p521_jmixadd.S b/x86_att/p521/p521_jmixadd.S index be81c38f84..879fce6954 100644 --- a/x86_att/p521/p521_jmixadd.S +++ b/x86_att/p521/p521_jmixadd.S @@ -670,7 +670,92 @@ andq $0x1ff, %r14 ; \ movq %r14, 0x40+P0 -#define copy_p521(P0,P1) \ +// Additional macros to help with final multiplexing + +#define testzero9(P) \ + movq P, %rax ; \ + movq 8+P, %rbx ; \ + movq 16+P, %rdx ; \ + movq 24+P, %rbp ; \ + orq 32+P, %rax ; \ + orq 40+P, %rbx ; \ + orq 48+P, %rdx ; \ + orq 56+P, %rbp ; \ + orq %rbx, %rax ; \ + orq %rbp, %rdx ; \ + orq 64+P, %rax ; \ + orq %rdx, %rax + +#define mux9(P0,PNE,PEQ) \ + movq PNE, %rax ; \ + movq PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, P0 ; \ + movq 8+PNE, %rax ; \ + movq 8+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+PNE, %rax ; \ + movq 16+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+PNE, %rax ; \ + movq 24+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+PNE, %rax ; \ + movq 32+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+PNE, %rax ; \ + movq 40+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+PNE, %rax ; \ + movq 48+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+PNE, %rax ; \ + movq 56+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+PNE, %rax ; \ + movq 64+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 64+P0 + +#define mux9c(P0,PNE) \ + movq PNE, %rax ; \ + movl $1, %ebx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, P0 ; \ + movq 8+PNE, %rax ; \ + movl $0, %ebx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 64+P0 + +#define copy9(P0,P1) \ movq P1, %rax ; \ movq %rax, P0 ; \ movq 8+P1, %rax ; \ @@ -746,11 +831,21 @@ S2N_BN_SYMBOL(p521_jmixadd): sub_p521(resy,t2,t1) -// Copy from staging area to actual outputs +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero9(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. 
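On x86 the ZF left by the orq chain in testzero9 directly drives the cmovzq chains in the mux macros used below, so no mask is ever materialized. The mux9c macro is the special case that multiplexes against the constant point-at-infinity replacement z = {1, 0, ..., 0} without keeping it in memory; a mask-based C rendering of just that case, with the same illustrative assumptions as the earlier sketches:

#include <stdint.h>

/* z3 = p1_is_zero ? {1, 0, ..., 0} : resz, selected limb by limb */
static void mux9c_model(uint64_t z3[9], uint64_t p1_is_zero_mask,
                        const uint64_t resz[9])
{
    for (int i = 0; i < 9; i++) {
        uint64_t one_limb = (i == 0);  /* 1 for the low limb, 0 above it */
        z3[i] = (resz[i] & ~p1_is_zero_mask) | (one_limb & p1_is_zero_mask);
    }
}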
+ + mux9 (resx,resx,x_2) + mux9 (resy,resy,y_2) + + copy9(x_3,resx) + copy9(y_3,resy) - copy_p521(x_3,resx) - copy_p521(y_3,resy) - copy_p521(z_3,resz) + mux9c(z_3,resz) // Restore stack and registers From a44e3e5919b6f342b7d6475e3836b3121bda4368 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Tue, 30 Apr 2024 16:10:10 -0700 Subject: [PATCH 20/24] Handle zero inputs in Weierstrass point additions This is analogous to the earlier changes for mixed addition. In a point addition operation P1 + P2, the cases where P1 = 0 or P2 = 0 are handled specially (though of course using constant-time selection) as 0 + P2 = P2 and P1 + 0 = P1. More precisely, writing P1 = (x1,y1,z1) and P2 = (x2,y2,z2), the special-case logic is triggered when precisely *one* of z1 = 0 or z2 = 0 holds; in the case that both z1 = 0 and z2 = 0 the standard computation is followed and yields the "right" result (one with its z coordinate also zero). s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/061ea51722a7fb56112f4401d043c4397b323128 --- arm/p384/p384_montjadd.S | 118 +++++++++++++++++++--- arm/p521/p521_jadd.S | 171 +++++++++++++++++++++++++++++--- x86_att/p384/p384_montjadd.S | 126 ++++++++++++++++++----- x86_att/p384/p384_montjmixadd.S | 13 +-- x86_att/p521/p521_jadd.S | 103 +++++++++++++++++-- 5 files changed, 460 insertions(+), 71 deletions(-) diff --git a/arm/p384/p384_montjadd.S b/arm/p384/p384_montjadd.S index dc5893f0f2..3686489d9a 100644 --- a/arm/p384/p384_montjadd.S +++ b/arm/p384/p384_montjadd.S @@ -868,28 +868,114 @@ S2N_BN_SYMBOL(p384_montjadd): sub_p384(resy,t2,t1) -// Copy from staging area to actual outputs - +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x20, x20, x21 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x6, x7, [z_2] + ldp x8, x9, [z_2+16] + ldp x10, x11, [z_2+32] + + orr x21, x6, x7 + orr x22, x8, x9 + orr x23, x10, x11 + orr x21, x21, x22 + orr x21, x21, x23 + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x12, x13, [resz] + csel x12, x0, x12, lo + csel x13, x1, x13, lo + csel x12, x6, x12, hi + csel x13, x7, x13, hi + ldp x14, x15, [resz+16] + csel x14, x2, x14, lo + csel x15, x3, x15, lo + csel x14, x8, x14, hi + csel x15, x9, x15, hi + ldp x16, x17, [resz+32] + csel x16, x4, x16, lo + csel x17, x5, x17, lo + csel x16, x10, x16, hi + csel x17, x11, x17, hi + + ldp x20, x21, [x_1] ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [y_1] + ldp x6, x7, [resy] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [y_2] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldp x20, x21, [y_1+16] + ldp x8, x9, [resy+16] + csel x8, x20, x8, lo + csel x9, x21, x9, lo + ldp x20, x21, [y_2+16] + csel x8, x20, x8, hi + csel x9, x21, x9, hi + 
+ ldp x20, x21, [y_1+32] + ldp x10, x11, [resy+32] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2+32] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + +// Finally store back the multiplexed values + stp x0, x1, [x_3] stp x2, x3, [x_3+16] stp x4, x5, [x_3+32] - - ldp x0, x1, [resy] - ldp x2, x3, [resy+16] - ldp x4, x5, [resy+32] - stp x0, x1, [y_3] - stp x2, x3, [y_3+16] - stp x4, x5, [y_3+32] - - ldp x0, x1, [resz] - ldp x2, x3, [resz+16] - ldp x4, x5, [resz+32] - stp x0, x1, [z_3] - stp x2, x3, [z_3+16] - stp x4, x5, [z_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] // Restore stack and registers diff --git a/arm/p521/p521_jadd.S b/arm/p521/p521_jadd.S index 0aa55a2385..93ab919aac 100644 --- a/arm/p521/p521_jadd.S +++ b/arm/p521/p521_jadd.S @@ -782,35 +782,180 @@ S2N_BN_SYMBOL(p521_jadd): sub_p521(resy,t2,t1) -// Copy from staging area to actual outputs - +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 +// Multiplex the z outputs accordingly and re-store in resz + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + ldp x6, x7, [z_1+48] + ldr x8, [z_1+64] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x23, x6, x7 + orr x20, x20, x21 + orr x22, x22, x23 + orr x20, x20, x8 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x10, x11, [z_2] + ldp x12, x13, [z_2+16] + ldp x14, x15, [z_2+32] + ldp x16, x17, [z_2+48] + ldr x19, [z_2+64] + + orr x21, x10, x11 + orr x22, x12, x13 + orr x23, x14, x15 + orr x24, x16, x17 + orr x21, x21, x22 + orr x23, x23, x24 + orr x21, x21, x19 + orr x21, x21, x23 + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + + ldp x10, x11, [resz] + ldp x12, x13, [resz+16] + ldp x14, x15, [resz+32] + ldp x16, x17, [resz+48] + ldr x19, [resz+64] + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + stp x0, x1, [resz] + stp x2, x3, [resz+16] + stp x4, x5, [resz+32] + stp x6, x7, [resz+48] + str x8, [resz+64] + +// Multiplex the x and y outputs too, keeping the results in registers + + ldp x20, x21, [x_1] ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [x_1+48] ldp x6, x7, [resx+48] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [x_2+48] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldr x20, [x_1+64] ldr x8, [resx+64] + csel x8, x20, x8, lo + ldr x21, [x_2+64] + csel x8, x21, x8, hi + + + ldp x20, x21, [y_1] + ldp x10, x11, [resy] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2] + csel x10, x20, x10, 
hi + csel x11, x21, x11, hi + + ldp x20, x21, [y_1+16] + ldp x12, x13, [resy+16] + csel x12, x20, x12, lo + csel x13, x21, x13, lo + ldp x20, x21, [y_2+16] + csel x12, x20, x12, hi + csel x13, x21, x13, hi + + ldp x20, x21, [y_1+32] + ldp x14, x15, [resy+32] + csel x14, x20, x14, lo + csel x15, x21, x15, lo + ldp x20, x21, [y_2+32] + csel x14, x20, x14, hi + csel x15, x21, x15, hi + + ldp x20, x21, [y_1+48] + ldp x16, x17, [resy+48] + csel x16, x20, x16, lo + csel x17, x21, x17, lo + ldp x20, x21, [y_2+48] + csel x16, x20, x16, hi + csel x17, x21, x17, hi + + ldr x20, [y_1+64] + ldr x19, [resy+64] + csel x19, x20, x19, lo + ldr x21, [y_2+64] + csel x19, x21, x19, hi + +// Finally store back the multiplexed values + stp x0, x1, [x_3] stp x2, x3, [x_3+16] stp x4, x5, [x_3+32] stp x6, x7, [x_3+48] str x8, [x_3+64] - ldp x0, x1, [resy] - ldp x2, x3, [resy+16] - ldp x4, x5, [resy+32] - ldp x6, x7, [resy+48] - ldr x8, [resy+64] - stp x0, x1, [y_3] - stp x2, x3, [y_3+16] - stp x4, x5, [y_3+32] - stp x6, x7, [y_3+48] - str x8, [y_3+64] - ldp x0, x1, [resz] ldp x2, x3, [resz+16] ldp x4, x5, [resz+32] ldp x6, x7, [resz+48] ldr x8, [resz+64] + + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + stp x0, x1, [z_3] stp x2, x3, [z_3+16] stp x4, x5, [z_3+32] diff --git a/x86_att/p384/p384_montjadd.S b/x86_att/p384/p384_montjadd.S index 2312ec8edd..6078082204 100644 --- a/x86_att/p384/p384_montjadd.S +++ b/x86_att/p384/p384_montjadd.S @@ -28,7 +28,8 @@ // Pointer-offset pairs for inputs and outputs // These assume %rdi = p3, %rsi = p1 and %rcx = p2, -// which needs to be set up explicitly before use +// which needs to be set up explicitly before use. +// The %rdi value never changes, however. #define x_1 0(%rsi) #define y_1 NUMSIZE(%rsi) @@ -78,9 +79,8 @@ #define input_x (NUMSIZE*7)(%rsp) #define input_y (NUMSIZE*7+8)(%rsp) -#define input_z (NUMSIZE*7+16)(%rsp) -#define NSPACE (NUMSIZE*7+24) +#define NSPACE (NUMSIZE*7+16) // Corresponds exactly to bignum_montmul_p384 @@ -846,19 +846,51 @@ sbbq $0x0, %r11 ; \ movq %r11, 0x28+P0 -#define copy_p384(P0,P1) \ - movq P1, %rax ; \ - movq %rax, P0 ; \ - movq 8+P1, %rax ; \ - movq %rax, 8+P0 ; \ - movq 16+P1, %rax ; \ - movq %rax, 16+P0 ; \ - movq 24+P1, %rax ; \ - movq %rax, 24+P0 ; \ - movq 32+P1, %rax ; \ - movq %rax, 32+P0 ; \ - movq 40+P1, %rax ; \ - movq %rax, 40+P0 +// Additional macros to help with final multiplexing + +#define load6(r0,r1,r2,r3,r4,r5,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 + +#define store6(P,r0,r1,r2,r3,r4,r5) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P ; \ + +#define czload6(r0,r1,r2,r3,r4,r5,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 ; \ + cmovzq 32+P, r4 ; \ + cmovzq 40+P, r5 + +#define muxload6(r0,r1,r2,r3,r4,r5,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 ; \ + movq 32+P0, r4 ; \ + cmovbq 32+P1, r4 ; \ + cmovnbe 32+P2, r4 ; \ + movq 40+P0, r5 ; \ + cmovbq 40+P1, r5 ; \ + cmovnbe 40+P2, r5 S2N_BN_SYMBOL(p384_montjadd): @@ -882,7 +914,6 @@ S2N_BN_SYMBOL(p384_montjadd): subq $NSPACE, %rsp - movq %rdi, input_z movq %rsi, input_x movq %rdx, input_y @@ -916,34 +947,77 
@@ S2N_BN_SYMBOL(p384_montjadd): montmul_p384(zzx1,zz,x1a) montmul_p384(zzx2,zz,x2a) - movq input_z, %rdi sub_p384(resx,ww,zzx1) sub_p384(t1,zzx2,zzx1) movq input_x, %rsi montmul_p384(xd,xd,z_1) - movq input_z, %rdi sub_p384(resx,resx,zzx2) - movq input_z, %rdi sub_p384(t2,zzx1,resx) montmul_p384(t1,t1,y1a) - movq input_z, %rdi movq input_y, %rcx montmul_p384(resz,xd,z_2) montmul_p384(t2,yd,t2) - movq input_z, %rdi sub_p384(resy,t2,t1) -// Copy from staging area to actual outputs +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) +// Multiplex the z outputs accordingly and re-store in resz + + movq input_y, %rcx + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,z_2) + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rbx, %rax + orq %rbp, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + movq input_x, %rsi + load6(%r12,%r13,%r14,%r15,%rdx,%rcx,z_1) + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + cmovzq %rdx, %rbx + cmovzq %rcx, %rbp + orq %r13, %r12 + orq %r15, %r14 + orq %rcx, %rdx + orq %r14, %r12 + orq %r12, %rdx + negq %rdx + sbbq %rdx, %rdx + + cmpq %rdx, %rax + + czload6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + store6(resz,%r8,%r9,%r10,%r11,%rbx,%rbp) + +// Multiplex the x and y outputs too, keeping the results in registers + + movq input_y, %rcx + movq input_x, %rsi + muxload6(%r8,%r9,%r10,%r11,%rbx,%rbp,resx,x_1,x_2) + muxload6(%r12,%r13,%r14,%r15,%rdx,%rax,resy,y_1,y_2) + +// Finally store back the multiplexed values - copy_p384(x_3,resx) - copy_p384(y_3,resy) - copy_p384(z_3,resz) + store6(x_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + store6(y_3,%r12,%r13,%r14,%r15,%rdx,%rax) + store6(z_3,%r8,%r9,%r10,%r11,%rbx,%rbp) // Restore stack and registers diff --git a/x86_att/p384/p384_montjmixadd.S b/x86_att/p384/p384_montjmixadd.S index 0126f747ae..539a28117a 100644 --- a/x86_att/p384/p384_montjmixadd.S +++ b/x86_att/p384/p384_montjmixadd.S @@ -30,7 +30,8 @@ // Pointer-offset pairs for inputs and outputs // These assume %rdi = p3, %rsi = p1 and %rcx = p2, -// which needs to be set up explicitly before use +// which needs to be set up explicitly before use. +// However the %rdi value never changes. 
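Functionally, the final multiplexing added in p384_montjadd above (cmovbq/cmovnbe on x86, csel under HI/LO on ARM) picks between the staging result and the two inputs according to which z coordinate vanishes. A rough Python model of that selection, illustrative only (mux_jadd is a hypothetical name, not the s2n-bignum API) and deliberately branchy for readability where the assembly is branch-free:

# Functional model of the three-way multiplex; not the s2n-bignum code.
def mux_jadd(staging, p1, p2):
    # staging, p1, p2 are (x, y, z) tuples of limb lists; z = 0 encodes the
    # point at infinity in Jacobian coordinates.
    p1_zero = all(w == 0 for w in p1[2])
    p2_zero = all(w == 0 for w in p2[2])
    if p1_zero and not p2_zero:       # "NBE" on x86, "HI" on ARM: 0 + p2 = p2
        return p2
    if p2_zero and not p1_zero:       # "B" on x86, "LO" on ARM: p1 + 0 = p1
        return p1
    return staging                    # "Z": both or neither zero; when both are zero
                                      # the staging z is itself zero, as intended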
#define x_1 0(%rsi) #define y_1 NUMSIZE(%rsi) @@ -70,9 +71,8 @@ #define input_x (NUMSIZE*6)(%rsp) #define input_y (NUMSIZE*6+8)(%rsp) -#define input_z (NUMSIZE*6+16)(%rsp) -#define NSPACE (NUMSIZE*6+24) +#define NSPACE (NUMSIZE*6+16) // Corresponds exactly to bignum_montmul_p384 @@ -907,7 +907,6 @@ S2N_BN_SYMBOL(p384_montjmixadd): subq $NSPACE, %rsp - movq %rdi, input_z movq %rsi, input_x movq %rdx, input_y @@ -937,25 +936,20 @@ S2N_BN_SYMBOL(p384_montjmixadd): montmul_p384(zzx1,zz,x_1) montmul_p384(zzx2,zz,x2a) - movq input_z, %rdi sub_p384(resx,ww,zzx1) sub_p384(t1,zzx2,zzx1) - movq input_z, %rdi movq input_x, %rsi montmul_p384(resz,xd,z_1) - movq input_z, %rdi sub_p384(resx,resx,zzx2) - movq input_z, %rdi sub_p384(t2,zzx1,resx) movq input_x, %rsi montmul_p384(t1,t1,y_1) montmul_p384(t2,yd,t2) - movq input_z, %rdi sub_p384(resy,t2,t1) // Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) @@ -972,7 +966,6 @@ S2N_BN_SYMBOL(p384_montjmixadd): mux6(%r8,%r9,%r10,%r11,%rbx,%rbp,resx,x_2) mux6(%r12,%r13,%r14,%r15,%rdx,%rcx,resy,y_2) - movq input_z, %rdi store6(x_3,%r8,%r9,%r10,%r11,%rbx,%rbp) store6(y_3,%r12,%r13,%r14,%r15,%rdx,%rcx) diff --git a/x86_att/p521/p521_jadd.S b/x86_att/p521/p521_jadd.S index 33ef178382..9f1b03c47b 100644 --- a/x86_att/p521/p521_jadd.S +++ b/x86_att/p521/p521_jadd.S @@ -673,7 +673,60 @@ andq $0x1ff, %r14 ; \ movq %r14, 0x40+P0 -#define copy_p521(P0,P1) \ +// Additional macros to help with final multiplexing + +#define load9(r0,r1,r2,r3,r4,r5,r6,r7,ra,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 ; \ + movq 48+P, r6 ; \ + movq 56+P, r7 ; \ + movq 64+P, ra + +#define store9(P,r0,r1,r2,r3,r4,r5,r6,r7,ra) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P ; \ + movq r6, 48+P ; \ + movq r7, 56+P ; \ + movq ra, 64+P + +#define muxload9(r0,r1,r2,r3,r4,r5,r6,r7,ra,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 ; \ + movq 32+P0, r4 ; \ + cmovbq 32+P1, r4 ; \ + cmovnbe 32+P2, r4 ; \ + movq 40+P0, r5 ; \ + cmovbq 40+P1, r5 ; \ + cmovnbe 40+P2, r5 ; \ + movq 48+P0, r6 ; \ + cmovbq 48+P1, r6 ; \ + cmovnbe 48+P2, r6 ; \ + movq 56+P0, r7 ; \ + cmovbq 56+P1, r7 ; \ + cmovnbe 56+P2, r7 ; \ + movq 64+P0, ra ; \ + cmovbq 64+P1, ra ; \ + cmovnbe 64+P2, ra + +#define copy9(P0,P1) \ movq P1, %rax ; \ movq %rax, P0 ; \ movq 8+P1, %rax ; \ @@ -755,11 +808,49 @@ S2N_BN_SYMBOL(p521_jadd): sub_p521(resy,t2,t1) -// Copy from staging area to actual outputs - - copy_p521(x_3,resx) - copy_p521(y_3,resy) - copy_p521(z_3,resz) +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,z_1) + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rax, %rax + + load9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,z_2) + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rdx, %rdx + 
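The zero tests above collapse each z coordinate into a single 64-bit mask: the orq chain folds all limbs together, negq then sets the carry flag exactly when that OR is nonzero, and sbbq of a register with itself turns the carry into 0 or all-ones. A small Python sketch of the idiom (nonzero_mask is a hypothetical name, illustrative only):

# Model of the orq/negq/sbbq zero-test mask; not the s2n-bignum code.
MASK64 = 2**64 - 1

def nonzero_mask(limbs):
    acc = 0
    for w in limbs:
        acc |= w                      # the chain of orq instructions
    carry = 1 if acc != 0 else 0      # negq sets CF precisely when acc != 0
    return (0 - carry) & MASK64       # sbbq %r, %r yields 0 or 0xFFFF...FFFF

assert nonzero_mask([0] * 9) == 0
assert nonzero_mask([0, 5, 0, 0, 0, 0, 0, 0, 0]) == MASK64

The subsequent cmpq of the two masks is what produces the "B", "NBE" and "Z" cases consumed by the multiplexing macros.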
+ cmpq %rax, %rdx + +// Multiplex the outputs accordingly. Re-store them in resz until there +// are no more loads, so there are no assumptions on input-output aliasing + + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resy,y_1,y_2) + store9(resy,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resz,z_1,z_2) + store9(resz,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resx,x_1,x_2) + store9(x_3,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + copy9(y_3,resy) + copy9(z_3,resz) // Restore stack and registers From 0ae7d4f6092c8d82d4e183e22a7917140c90294d Mon Sep 17 00:00:00 2001 From: John Harrison Date: Sun, 5 May 2024 14:10:29 -0700 Subject: [PATCH 21/24] Bifurcate microarchitectural variants of Weierstrass point additions As with the earlier update for doublings, the Jacobian point adidtion and mixed addition operations for the curves P-256, P-384, P-521, secp256k1 and SM2 now all have the usual two versions targeting different microarchitectures, one of them called "_alt", following the general s2n-bignum convention. The "_alt" forms for ARM now present are just renamed versions of the originals (which were based on "_alt" field operations), with the new code taking over the old non-alt name. For x86 the non-alt ones are the same as before and the "_alt" forms are new. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/acd4fd3bd05e12be3f316aea233896cddcd0a9bf --- arm/p384/Makefile | 4 +- arm/p384/p384_montjadd.S | 1285 ++++++++++---------- arm/p384/p384_montjadd_alt.S | 993 +++++++++++++++ arm/p384/p384_montjmixadd.S | 1283 ++++++++++---------- arm/p384/p384_montjmixadd_alt.S | 941 ++++++++++++++ arm/p521/Makefile | 4 +- arm/p521/p521_jadd.S | 1748 +++++++++++++++++---------- arm/p521/p521_jadd_alt.S | 979 +++++++++++++++ arm/p521/p521_jmixadd.S | 1746 ++++++++++++++++---------- arm/p521/p521_jmixadd_alt.S | 882 ++++++++++++++ x86_att/p384/p384_montjadd_alt.S | 965 +++++++++++++++ x86_att/p384/p384_montjmixadd_alt.S | 929 ++++++++++++++ x86_att/p521/p521_jadd_alt.S | 1149 ++++++++++++++++++ x86_att/p521/p521_jmixadd_alt.S | 1144 ++++++++++++++++++ 14 files changed, 11413 insertions(+), 2639 deletions(-) create mode 100644 arm/p384/p384_montjadd_alt.S create mode 100644 arm/p384/p384_montjmixadd_alt.S create mode 100644 arm/p521/p521_jadd_alt.S create mode 100644 arm/p521/p521_jmixadd_alt.S create mode 100644 x86_att/p384/p384_montjadd_alt.S create mode 100644 x86_att/p384/p384_montjmixadd_alt.S create mode 100644 x86_att/p521/p521_jadd_alt.S create mode 100644 x86_att/p521/p521_jmixadd_alt.S diff --git a/arm/p384/Makefile b/arm/p384/Makefile index bfc0870b40..f5fc2aa1a4 100644 --- a/arm/p384/Makefile +++ b/arm/p384/Makefile @@ -45,9 +45,11 @@ OBJ = bignum_add_p384.o \ bignum_tomont_p384.o \ bignum_triple_p384.o \ p384_montjadd.o \ + p384_montjadd_alt.o \ p384_montjdouble.o \ p384_montjdouble_alt.o \ - p384_montjmixadd.o + p384_montjmixadd.o \ + p384_montjmixadd_alt.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/arm/p384/p384_montjadd.S b/arm/p384/p384_montjadd.S index 3686489d9a..3b65363162 100644 --- a/arm/p384/p384_montjadd.S +++ b/arm/p384/p384_montjadd.S @@ -73,715 +73,650 @@ #define NSPACE (NUMSIZE*7) -// Corresponds exactly to bignum_montmul_p384_alt +// Corresponds to bignum_montmul_p384 except x24 -> x0 #define montmul_p384(P0,P1,P2) \ ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, 
x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; 
\ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ + ldp x5, x6, [P1+16]; \ + ldp x7, x8, [P1+32]; \ + ldp x9, x10, [P2]; \ + ldp x11, x12, [P2+16]; \ + ldp x13, x14, [P2+32]; \ + mul x15, x3, x9; \ + mul x21, x4, x10; \ + mul x22, x5, x11; \ + umulh x23, x3, x9; \ + umulh x0, x4, x10; \ + umulh x1, x5, x11; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x3, x4; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x3, x5; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x4, x5; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ + sbc x15, x15, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ + sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, 
x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ + sbc x17, x17, xzr; \ + stp x19, x20, [P0]; \ + stp x1, x15, [P0+16]; \ + stp x16, x17, [P0+32]; \ + mul x15, x6, x12; \ + mul x21, x7, x13; \ + mul x22, x8, x14; \ + umulh x23, x6, x12; \ + umulh x0, x7, x13; \ + umulh x1, x8, x14; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x6, x7; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x13, x12; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x6, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x14, x12; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x7, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x14, x13; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + subs x6, x6, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x5; \ + ngc x3, xzr; \ + cmn x3, #1; \ + eor x6, x6, x3; \ + adcs x6, x6, xzr; \ + eor x7, x7, x3; \ + adcs x7, x7, xzr; \ + eor x8, x8, x3; \ + adc x8, x8, xzr; \ + subs x9, x9, x12; \ + sbcs x10, x10, x13; \ + sbcs x11, x11, x14; \ + ngc x14, xzr; \ + cmn x14, #1; \ + eor x9, x9, x14; \ + adcs x9, x9, xzr; \ + eor x10, x10, x14; \ + adcs x10, x10, xzr; \ + eor x11, x11, x14; \ + adc x11, x11, xzr; \ + eor x14, x3, x14; \ + ldp x21, x22, [P0]; \ + adds x15, x15, x21; \ + adcs x16, x16, x22; \ + ldp x21, x22, [P0+16]; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + ldp x21, x22, [P0+32]; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adc x2, xzr, xzr; \ + stp x15, x16, [P0]; \ + stp x17, x19, [P0+16]; \ + stp x20, x1, [P0+32]; \ + mul x15, x6, x9; \ + mul x21, x7, x10; \ + mul x22, x8, x11; \ + umulh x23, x6, x9; \ + umulh x0, x7, x10; \ + umulh x1, x8, x11; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x6, x7; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x6, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + 
subs x0, x7, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + ldp x3, x4, [P0]; \ + ldp x5, x6, [P0+16]; \ + ldp x7, x8, [P0+32]; \ + cmn x14, #1; \ + eor x15, x15, x14; \ + adcs x15, x15, x3; \ + eor x16, x16, x14; \ + adcs x16, x16, x4; \ + eor x17, x17, x14; \ + adcs x17, x17, x5; \ + eor x19, x19, x14; \ + adcs x19, x19, x6; \ + eor x20, x20, x14; \ + adcs x20, x20, x7; \ + eor x1, x1, x14; \ + adcs x1, x1, x8; \ + adcs x9, x14, x2; \ + adcs x10, x14, xzr; \ + adcs x11, x14, xzr; \ + adc x12, x14, xzr; \ + adds x19, x19, x3; \ + adcs x20, x20, x4; \ + adcs x1, x1, x5; \ + adcs x9, x9, x6; \ + adcs x10, x10, x7; \ + adcs x11, x11, x8; \ + adc x12, x12, x2; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ sbcs x15, x15, xzr; \ sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ + adds x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adc x12, x12, xzr; \ + add x22, x12, #1; \ + lsl x21, x22, #32; \ + subs x0, x22, x21; \ + sbc x21, x21, xzr; \ + adds x19, x19, x0; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adcs x9, x9, xzr; \ adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly 
to bignum_montsqr_p384_alt + adcs x11, x11, xzr; \ + csetm x22, lo; \ + mov x23, #4294967295; \ + and x23, x23, x22; \ + adds x19, x19, x23; \ + eor x23, x23, x22; \ + adcs x20, x20, x23; \ + mov x23, #-2; \ + and x23, x23, x22; \ + adcs x1, x1, x23; \ + adcs x9, x9, x22; \ + adcs x10, x10, x22; \ + adc x11, x11, x22; \ + stp x19, x20, [P0]; \ + stp x1, x9, [P0+16]; \ + stp x10, x11, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384 #define montsqr_p384(P0,P1) \ ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ + mul x14, x2, x3; \ + mul x15, x2, x4; \ + mul x16, x3, x4; \ + mul x8, x2, x2; \ + mul x10, x3, x3; \ + mul x12, x4, x4; \ + umulh x17, x2, x3; \ + adds x15, x15, x17; \ + umulh x17, x2, x4; \ + adcs x16, x16, x17; \ + umulh x17, x3, x4; \ adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ + umulh x9, x2, x2; \ + umulh x11, x3, x3; \ + umulh x13, x4, x4; \ + adds x14, x14, x14; \ adcs x15, x15, x15; \ adcs x16, x16, x16; \ adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x13, x13, xzr; \ + adds x9, x9, x14; \ + adcs x10, x10, x15; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x13, xzr; \ + lsl x16, x8, #32; \ + add x8, x16, x8; \ + lsr x16, x8, #32; \ + subs x16, x16, x8; \ + sbc x15, x8, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x8; \ + adc x14, xzr, xzr; \ + subs x9, x9, x16; \ + sbcs x10, x10, x15; \ + sbcs x11, 
x11, x14; \ sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x16, x9, #32; \ + add x9, x16, x9; \ + lsr x16, x9, #32; \ + subs x16, x16, x9; \ + sbc x15, x9, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x9; \ + adc x14, xzr, xzr; \ + subs x10, x10, x16; \ + sbcs x11, x11, x15; \ + sbcs x12, x12, x14; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x16, x10, #32; \ + add x10, x16, x10; \ + lsr x16, x10, #32; \ + subs x16, x16, x10; \ + sbc x15, x10, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x10; \ + adc x14, xzr, xzr; \ + subs x11, x11, x16; \ + sbcs x12, x12, x15; \ + sbcs x13, x13, x14; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Almost-Montgomery variant which we use when an input to other muls -// with the other argument fully reduced (which is always safe). In -// fact, with the Karatsuba-based Montgomery mul here, we don't even -// *need* the restriction that the other argument is reduced. 
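The comment removed here concerns the "almost-Montgomery" squaring that this patch replaces with the fully reducing montsqr. As a rough illustration (a Python sketch under the standard p_384; montsqr and amontsqr are illustrative models, not the s2n-bignum implementations): both compute the Montgomery-domain square, but the almost-Montgomery form is only guaranteed to be below 2^384 and congruent mod p_384, not fully reduced below p_384.

# Python sketch of montsqr vs an almost-Montgomery square; not the s2n-bignum code.
p384 = 2**384 - 2**128 - 2**96 + 2**32 - 1
R = 2**384
Rinv = pow(R, -1, p384)

def montsqr(x):                       # fully reduced Montgomery square
    return (x * x * Rinv) % p384

def amontsqr(x):                      # one admissible almost-Montgomery result
    t = (x * x * Rinv) % p384
    return t + p384 if t + p384 < R else t    # congruent mod p_384 and still < 2^384

x = 0x123456789abcdef % p384
xm = (x * R) % p384                   # Montgomery encoding of x
assert montsqr(xm) == (x * x * R) % p384      # squaring stays in the Montgomery domain
assert amontsqr(xm) % p384 == montsqr(xm) and amontsqr(xm) < R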
- -#define amontsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ + stp x11, x12, [P0]; \ + stp x13, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + mul x8, x2, x5; \ + mul x14, x3, x6; \ + mul x15, x4, x7; \ + umulh x16, x2, x5; \ + umulh x17, x3, x6; \ + umulh x1, x4, x7; \ + adds x16, x16, x14; \ + adcs x17, x17, x15; \ + adc x1, x1, xzr; \ + adds x9, x16, x8; \ + adcs x10, x17, x16; \ + adcs x11, x1, x17; \ + adc x12, x1, xzr; \ adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x1, xzr; \ + subs x17, x2, x3; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x6, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x9, x9, x16; \ + adcs x10, x10, x15; \ + adcs x11, x11, x14; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x2, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x10, x10, x16; \ + adcs x11, x11, x15; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x3, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x6; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x11, x11, x16; \ + adcs x12, x12, x15; \ + adc x13, x13, x14; \ + adds x8, x8, x8; \ + adcs x9, x9, x9; \ adcs x10, x10, x10; \ adcs x11, x11, x11; \ adcs x12, x12, x12; \ adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, 
x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x17, xzr, xzr; \ + ldp x2, x3, [P0]; \ + adds x8, x8, x2; \ + adcs x9, x9, x3; \ + ldp x2, x3, [P0+16]; \ + adcs x10, x10, x2; \ + adcs x11, x11, x3; \ + ldp x2, x3, [P0+32]; \ + adcs x12, x12, x2; \ + adcs x13, x13, x3; \ + adc x17, x17, xzr; \ + lsl x4, x8, #32; \ + add x8, x4, x8; \ + lsr x4, x8, #32; \ + subs x4, x4, x8; \ + sbc x3, x8, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x8; \ + adc x2, xzr, xzr; \ + subs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, x2; \ sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x4, x9, #32; \ + add x9, x4, x9; \ + lsr x4, x9, #32; \ + subs x4, x4, x9; \ + sbc x3, x9, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x9; \ + adc x2, xzr, xzr; \ + subs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, x2; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x4, x10, #32; \ + add x10, x4, x10; \ + lsr x4, x10, #32; \ + subs x4, x4, x10; \ + sbc x3, x10, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x10; \ + adc x2, xzr, xzr; \ + subs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, x2; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - mov x14, #-4294967295; \ - mov x15, #4294967295; \ - csel x14, x14, xzr, cs; \ - csel x15, x15, xzr, cs; \ - cset x16, 
cs; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, xzr; \ - adcs x12, x12, xzr; \ - adc x13, x13, xzr; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] + adds x17, x17, x8; \ + adcs x8, x9, xzr; \ + adcs x9, x10, xzr; \ + adcs x10, xzr, xzr; \ + mul x1, x5, x5; \ + adds x11, x11, x1; \ + mul x14, x6, x6; \ + mul x15, x7, x7; \ + umulh x1, x5, x5; \ + adcs x12, x12, x1; \ + umulh x1, x6, x6; \ + adcs x13, x13, x14; \ + adcs x17, x17, x1; \ + umulh x1, x7, x7; \ + adcs x8, x8, x15; \ + adcs x9, x9, x1; \ + adc x10, x10, xzr; \ + mul x1, x5, x6; \ + mul x14, x5, x7; \ + mul x15, x6, x7; \ + umulh x16, x5, x6; \ + adds x14, x14, x16; \ + umulh x16, x5, x7; \ + adcs x15, x15, x16; \ + umulh x16, x6, x7; \ + adc x16, x16, xzr; \ + adds x1, x1, x1; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adc x5, xzr, xzr; \ + adds x12, x12, x1; \ + adcs x13, x13, x14; \ + adcs x17, x17, x15; \ + adcs x8, x8, x16; \ + adcs x9, x9, x5; \ + adc x10, x10, xzr; \ + mov x1, #-4294967295; \ + mov x14, #4294967295; \ + mov x15, #1; \ + cmn x11, x1; \ + adcs xzr, x12, x14; \ + adcs xzr, x13, x15; \ + adcs xzr, x17, xzr; \ + adcs xzr, x8, xzr; \ + adcs xzr, x9, xzr; \ + adc x10, x10, xzr; \ + neg x10, x10; \ + and x1, x1, x10; \ + adds x11, x11, x1; \ + and x14, x14, x10; \ + adcs x12, x12, x14; \ + and x15, x15, x10; \ + adcs x13, x13, x15; \ + adcs x17, x17, xzr; \ + adcs x8, x8, xzr; \ + adc x9, x9, xzr; \ + stp x11, x12, [P0]; \ + stp x13, x17, [P0+16]; \ + stp x8, x9, [P0+32] // Corresponds exactly to bignum_sub_p384 @@ -833,8 +768,8 @@ S2N_BN_SYMBOL(p384_montjadd): // Main code, just a sequence of basic field operations // 8 * multiply + 3 * square + 7 * subtract - amontsqr_p384(z1sq,z_1) - amontsqr_p384(z2sq,z_2) + montsqr_p384(z1sq,z_1) + montsqr_p384(z2sq,z_2) montmul_p384(y1a,z_2,y_1) montmul_p384(y2a,z_1,y_2) @@ -847,7 +782,7 @@ S2N_BN_SYMBOL(p384_montjadd): sub_p384(xd,x2a,x1a) sub_p384(yd,y2a,y1a) - amontsqr_p384(zz,xd) + montsqr_p384(zz,xd) montsqr_p384(ww,yd) montmul_p384(zzx1,zz,x1a) diff --git a/arm/p384/p384_montjadd_alt.S b/arm/p384/p384_montjadd_alt.S new file mode 100644 index 0000000000..b84065dea9 --- /dev/null +++ b/arm/p384/p384_montjadd_alt.S @@ -0,0 +1,993 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x12, x3, x5; \ + umulh x13, x3, x5; \ + mul x11, x3, x6; \ + umulh x14, x3, x6; \ + adds x13, x13, x11; \ + ldp x7, x8, [P2+16]; \ + mul x11, x3, x7; \ + umulh x15, x3, x7; \ + adcs x14, x14, x11; \ + mul x11, x3, x8; \ + umulh x16, x3, x8; \ + adcs x15, x15, x11; \ + ldp x9, x10, [P2+32]; \ + mul x11, x3, x9; \ + umulh x17, x3, x9; \ + adcs x16, x16, x11; \ + mul x11, x3, x10; \ + umulh x19, x3, x10; \ + adcs x17, x17, x11; \ + adc x19, x19, xzr; \ + mul x11, x4, x5; \ + adds x13, x13, x11; \ + mul x11, x4, x6; \ + adcs x14, x14, x11; \ + mul x11, x4, x7; \ + adcs x15, x15, x11; \ + mul x11, x4, x8; \ + adcs x16, x16, x11; \ + mul x11, x4, x9; \ + adcs x17, x17, x11; \ + mul x11, x4, x10; \ + adcs x19, x19, x11; \ + cset x20, cs; \ + umulh x11, x4, x5; \ + adds x14, x14, x11; \ + umulh x11, x4, x6; \ + adcs x15, x15, x11; \ + umulh x11, x4, x7; \ + adcs x16, x16, x11; \ + umulh x11, x4, x8; \ + adcs x17, x17, x11; \ + umulh x11, x4, x9; \ + adcs x19, x19, x11; \ + umulh x11, x4, x10; \ + adc x20, x20, x11; \ + ldp x3, x4, [P1+16]; \ + mul x11, x3, x5; \ + adds x14, x14, x11; \ + mul x11, x3, x6; \ + adcs x15, x15, x11; \ + mul x11, x3, x7; \ + adcs x16, x16, x11; \ + mul x11, x3, x8; \ + adcs x17, x17, x11; \ + mul x11, x3, x9; \ + adcs x19, x19, x11; \ + mul x11, x3, x10; \ + adcs x20, x20, x11; \ + cset x21, cs; \ + umulh x11, x3, x5; \ + adds x15, x15, x11; \ + umulh x11, x3, x6; \ + adcs x16, x16, x11; \ + umulh x11, x3, x7; \ + adcs x17, x17, x11; \ + umulh x11, x3, x8; \ + adcs x19, x19, x11; \ + umulh x11, x3, x9; \ + adcs x20, x20, x11; \ + umulh x11, x3, x10; \ + adc x21, x21, x11; \ + mul x11, x4, x5; \ + adds x15, x15, x11; \ + mul x11, x4, x6; \ + adcs x16, x16, x11; \ + mul x11, x4, x7; \ + adcs x17, x17, x11; \ + mul x11, x4, x8; \ + adcs x19, x19, x11; \ + mul x11, x4, x9; \ + adcs x20, x20, x11; \ + mul x11, x4, x10; \ + adcs x21, x21, x11; \ + cset x22, cs; \ + umulh x11, 
x4, x5; \ + adds x16, x16, x11; \ + umulh x11, x4, x6; \ + adcs x17, x17, x11; \ + umulh x11, x4, x7; \ + adcs x19, x19, x11; \ + umulh x11, x4, x8; \ + adcs x20, x20, x11; \ + umulh x11, x4, x9; \ + adcs x21, x21, x11; \ + umulh x11, x4, x10; \ + adc x22, x22, x11; \ + ldp x3, x4, [P1+32]; \ + mul x11, x3, x5; \ + adds x16, x16, x11; \ + mul x11, x3, x6; \ + adcs x17, x17, x11; \ + mul x11, x3, x7; \ + adcs x19, x19, x11; \ + mul x11, x3, x8; \ + adcs x20, x20, x11; \ + mul x11, x3, x9; \ + adcs x21, x21, x11; \ + mul x11, x3, x10; \ + adcs x22, x22, x11; \ + cset x2, cs; \ + umulh x11, x3, x5; \ + adds x17, x17, x11; \ + umulh x11, x3, x6; \ + adcs x19, x19, x11; \ + umulh x11, x3, x7; \ + adcs x20, x20, x11; \ + umulh x11, x3, x8; \ + adcs x21, x21, x11; \ + umulh x11, x3, x9; \ + adcs x22, x22, x11; \ + umulh x11, x3, x10; \ + adc x2, x2, x11; \ + mul x11, x4, x5; \ + adds x17, x17, x11; \ + mul x11, x4, x6; \ + adcs x19, x19, x11; \ + mul x11, x4, x7; \ + adcs x20, x20, x11; \ + mul x11, x4, x8; \ + adcs x21, x21, x11; \ + mul x11, x4, x9; \ + adcs x22, x22, x11; \ + mul x11, x4, x10; \ + adcs x2, x2, x11; \ + cset x1, cs; \ + umulh x11, x4, x5; \ + adds x19, x19, x11; \ + umulh x11, x4, x6; \ + adcs x20, x20, x11; \ + umulh x11, x4, x7; \ + adcs x21, x21, x11; \ + umulh x11, x4, x8; \ + adcs x22, x22, x11; \ + umulh x11, x4, x9; \ + adcs x2, x2, x11; \ + umulh x11, x4, x10; \ + adc x1, x1, x11; \ + lsl x7, x12, #32; \ + add x12, x7, x12; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x12; \ + mov x6, #0xffffffff; \ + mul x5, x6, x12; \ + umulh x6, x6, x12; \ + adds x7, x7, x5; \ + adcs x6, x6, x12; \ + adc x5, xzr, xzr; \ + subs x13, x13, x7; \ + sbcs x14, x14, x6; \ + sbcs x15, x15, x5; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x12, x12, xzr; \ + lsl x7, x13, #32; \ + add x13, x7, x13; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x13; \ + mov x6, #0xffffffff; \ + mul x5, x6, x13; \ + umulh x6, x6, x13; \ + adds x7, x7, x5; \ + adcs x6, x6, x13; \ + adc x5, xzr, xzr; \ + subs x14, x14, x7; \ + sbcs x15, x15, x6; \ + sbcs x16, x16, x5; \ + sbcs x17, x17, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + lsl x7, x14, #32; \ + add x14, x7, x14; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x14; \ + mov x6, #0xffffffff; \ + mul x5, x6, x14; \ + umulh x6, x6, x14; \ + adds x7, x7, x5; \ + adcs x6, x6, x14; \ + adc x5, xzr, xzr; \ + subs x15, x15, x7; \ + sbcs x16, x16, x6; \ + sbcs x17, x17, x5; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x14, x14, xzr; \ + lsl x7, x15, #32; \ + add x15, x7, x15; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x15; \ + mov x6, #0xffffffff; \ + mul x5, x6, x15; \ + umulh x6, x6, x15; \ + adds x7, x7, x5; \ + adcs x6, x6, x15; \ + adc x5, xzr, xzr; \ + subs x16, x16, x7; \ + sbcs x17, x17, x6; \ + sbcs x12, x12, x5; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + lsl x7, x16, #32; \ + add x16, x7, x16; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x16; \ + mov x6, #0xffffffff; \ + mul x5, x6, x16; \ + umulh x6, x6, x16; \ + adds x7, x7, x5; \ + adcs x6, x6, x16; \ + adc x5, xzr, xzr; \ + subs x17, x17, x7; \ + sbcs x12, x12, x6; \ + sbcs x13, x13, x5; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x7, x17, #32; \ + add x17, x7, x17; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x17; \ + mov x6, #0xffffffff; \ + mul x5, x6, x17; \ + umulh x6, x6, x17; \ + adds x7, x7, x5; \ + adcs x6, x6, x17; \ + adc x5, xzr, xzr; \ + subs x12, x12, x7; \ + 
sbcs x13, x13, x6; \ + sbcs x14, x14, x5; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbc x17, x17, xzr; \ + adds x12, x12, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x2; \ + adcs x17, x17, x1; \ + adc x10, xzr, xzr; \ + mov x11, #0xffffffff00000001; \ + adds x19, x12, x11; \ + mov x11, #0xffffffff; \ + adcs x20, x13, x11; \ + mov x11, #0x1; \ + adcs x21, x14, x11; \ + adcs x22, x15, xzr; \ + adcs x2, x16, xzr; \ + adcs x1, x17, xzr; \ + adcs x10, x10, xzr; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + csel x14, x14, x21, eq; \ + csel x15, x15, x22, eq; \ + csel x16, x16, x2, eq; \ + csel x17, x17, x1, eq; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16]; \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, 
x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + adc x6, xzr, xzr; \ + mov x8, #-4294967295; \ + adds x14, x2, x8; \ + mov x8, #4294967295; \ + adcs x15, x9, x8; \ + mov x8, #1; \ + adcs x16, x10, x8; \ + adcs x17, x11, xzr; \ + adcs x19, x12, xzr; \ + adcs x20, x13, xzr; \ + adcs x6, x6, xzr; \ + csel x2, x2, x14, eq; \ + csel x9, x9, x15, eq; \ + csel x10, x10, x16, eq; \ + csel x11, x11, x17, eq; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). In +// fact, with the Karatsuba-based Montgomery mul here, we don't even +// *need* the restriction that the other argument is reduced. 
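+// To be concrete about the difference: montsqr_p384 above finishes with a
+// conditional subtraction of p_384, so its output is the canonical residue,
+// i.e. montsqr(x) = (x^2 * 2^-384) mod p_384 with the result < p_384. The
+// almost-Montgomery variant below instead just folds a possible top carry
+// back in by adding 2^384 - p_384, so it only guarantees a result that is
+// congruent to x^2 * 2^-384 modulo p_384 and less than 2^384. That weaker
+// bound is still fine as an input to the Montgomery multiplications in this
+// file whenever the other operand is fully reduced, which is how the main
+// code below uses it.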
+ +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, 
#32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + mov x14, #-4294967295; \ + mov x15, #4294967295; \ + csel x14, x14, xzr, cs; \ + csel x15, x15, xzr, cs; \ + cset x16, cs; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, xzr; \ + adcs x12, x12, xzr; \ + adc x13, x13, xzr; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + csetm x3, lo; \ + mov x4, #4294967295; \ + and x4, x4, x3; \ + adds x5, x5, x4; \ + eor x4, x4, x3; \ + adcs x6, x6, x4; \ + mov x4, #-2; \ + and x4, x4, x3; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + adcs x9, x9, x3; \ + adc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! 
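+// (x19-x26 are callee-saved under the AArch64 procedure call standard; the
+// field-operation macros above use several of them as scratch and the stable
+// homes for the input pointers below also come from this range, hence the
+// saves here and the matching restores at the end.)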
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p384(z1sq,z_1) + amontsqr_p384(z2sq,z_2) + + montmul_p384(y1a,z_2,y_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,z1sq,x_2) + montmul_p384(x1a,z2sq,x_1) + montmul_p384(y2a,z1sq,y2a) + montmul_p384(y1a,z2sq,y1a) + + sub_p384(xd,x2a,x1a) + sub_p384(yd,y2a,y1a) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x1a) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(xd,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y1a) + montmul_p384(resz,xd,z_2) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x20, x20, x21 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x6, x7, [z_2] + ldp x8, x9, [z_2+16] + ldp x10, x11, [z_2+32] + + orr x21, x6, x7 + orr x22, x8, x9 + orr x23, x10, x11 + orr x21, x21, x22 + orr x21, x21, x23 + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x12, x13, [resz] + csel x12, x0, x12, lo + csel x13, x1, x13, lo + csel x12, x6, x12, hi + csel x13, x7, x13, hi + ldp x14, x15, [resz+16] + csel x14, x2, x14, lo + csel x15, x3, x15, lo + csel x14, x8, x14, hi + csel x15, x9, x15, hi + ldp x16, x17, [resz+32] + csel x16, x4, x16, lo + csel x17, x5, x17, lo + csel x16, x10, x16, hi + csel x17, x11, x17, hi + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [y_1] + ldp x6, x7, [resy] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [y_2] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldp x20, x21, [y_1+16] + ldp x8, x9, [resy+16] + csel x8, x20, x8, lo + csel x9, x21, x9, lo + ldp x20, x21, [y_2+16] + csel x8, x20, x8, hi + csel x9, x21, x9, hi + + ldp x20, x21, [y_1+32] + ldp x10, x11, [resy+32] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2+32] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p384/p384_montjmixadd.S b/arm/p384/p384_montjmixadd.S index 
faf9fd65a9..f340e4f5ce 100644 --- a/arm/p384/p384_montjmixadd.S +++ b/arm/p384/p384_montjmixadd.S @@ -70,715 +70,650 @@ #define NSPACE (NUMSIZE*6) -// Corresponds exactly to bignum_montmul_p384_alt +// Corresponds to bignum_montmul_p384 except x24 -> x0 #define montmul_p384(P0,P1,P2) \ ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh 
x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ + ldp x5, x6, [P1+16]; \ + ldp x7, x8, [P1+32]; \ + ldp x9, x10, [P2]; \ + ldp x11, x12, [P2+16]; \ + ldp x13, x14, [P2+32]; \ + mul x15, x3, x9; \ + mul x21, x4, x10; \ + mul x22, x5, x11; \ + umulh x23, x3, x9; \ + umulh x0, x4, x10; \ + umulh x1, x5, x11; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x3, x4; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x3, x5; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x4, x5; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ + sbc x15, x15, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ + sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - 
umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ + sbc x17, x17, xzr; \ + stp x19, x20, [P0]; \ + stp x1, x15, [P0+16]; \ + stp x16, x17, [P0+32]; \ + mul x15, x6, x12; \ + mul x21, x7, x13; \ + mul x22, x8, x14; \ + umulh x23, x6, x12; \ + umulh x0, x7, x13; \ + umulh x1, x8, x14; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x6, x7; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x13, x12; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x6, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x14, x12; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x7, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x14, x13; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + subs x6, x6, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x5; \ + ngc x3, xzr; \ + cmn x3, #1; \ + eor x6, x6, x3; \ + adcs x6, x6, xzr; \ + eor x7, x7, x3; \ + adcs x7, x7, xzr; \ + eor x8, x8, x3; \ + adc x8, x8, xzr; \ + subs x9, x9, x12; \ + sbcs x10, x10, x13; \ + sbcs x11, x11, x14; \ + ngc x14, xzr; \ + cmn x14, #1; \ + eor x9, x9, x14; \ + adcs x9, x9, xzr; \ + eor x10, x10, x14; \ + adcs x10, x10, xzr; \ + eor x11, x11, x14; \ + adc x11, x11, xzr; \ + eor x14, x3, x14; \ + ldp x21, x22, [P0]; \ + adds x15, x15, x21; \ + adcs x16, x16, x22; \ + ldp x21, x22, [P0+16]; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + ldp x21, x22, [P0+32]; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adc x2, xzr, xzr; \ + stp x15, x16, [P0]; \ + stp x17, x19, [P0+16]; \ + stp x20, x1, [P0+32]; \ + mul x15, x6, x9; \ + mul x21, x7, x10; \ + mul x22, x8, x11; \ + umulh x23, x6, x9; \ + umulh x0, x7, x10; \ + umulh x1, x8, x11; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x6, x7; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, 
x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x6, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x7, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + ldp x3, x4, [P0]; \ + ldp x5, x6, [P0+16]; \ + ldp x7, x8, [P0+32]; \ + cmn x14, #1; \ + eor x15, x15, x14; \ + adcs x15, x15, x3; \ + eor x16, x16, x14; \ + adcs x16, x16, x4; \ + eor x17, x17, x14; \ + adcs x17, x17, x5; \ + eor x19, x19, x14; \ + adcs x19, x19, x6; \ + eor x20, x20, x14; \ + adcs x20, x20, x7; \ + eor x1, x1, x14; \ + adcs x1, x1, x8; \ + adcs x9, x14, x2; \ + adcs x10, x14, xzr; \ + adcs x11, x14, xzr; \ + adc x12, x14, xzr; \ + adds x19, x19, x3; \ + adcs x20, x20, x4; \ + adcs x1, x1, x5; \ + adcs x9, x9, x6; \ + adcs x10, x10, x7; \ + adcs x11, x11, x8; \ + adc x12, x12, x2; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ sbcs x15, x15, xzr; \ sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ + adds x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adc x12, x12, xzr; \ + add x22, x12, #1; \ + lsl x21, x22, #32; \ + subs x0, x22, x21; \ + sbc x21, x21, 
xzr; \ + adds x19, x19, x0; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adcs x9, x9, xzr; \ adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt + adcs x11, x11, xzr; \ + csetm x22, lo; \ + mov x23, #4294967295; \ + and x23, x23, x22; \ + adds x19, x19, x23; \ + eor x23, x23, x22; \ + adcs x20, x20, x23; \ + mov x23, #-2; \ + and x23, x23, x22; \ + adcs x1, x1, x23; \ + adcs x9, x9, x22; \ + adcs x10, x10, x22; \ + adc x11, x11, x22; \ + stp x19, x20, [P0]; \ + stp x1, x9, [P0+16]; \ + stp x10, x11, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384 #define montsqr_p384(P0,P1) \ ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ + mul x14, x2, x3; \ + mul x15, x2, x4; \ + mul x16, x3, x4; \ + mul x8, x2, x2; \ + mul x10, x3, x3; \ + mul x12, x4, x4; \ + umulh x17, x2, x3; \ + adds x15, x15, x17; \ + umulh x17, x2, x4; \ + adcs x16, x16, x17; \ + umulh x17, x3, x4; \ adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ + umulh x9, x2, x2; \ + umulh x11, x3, x3; \ + umulh x13, x4, x4; \ + adds x14, x14, x14; \ adcs x15, x15, x15; \ adcs x16, x16, x16; \ adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x13, x13, 
xzr; \ + adds x9, x9, x14; \ + adcs x10, x10, x15; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x13, xzr; \ + lsl x16, x8, #32; \ + add x8, x16, x8; \ + lsr x16, x8, #32; \ + subs x16, x16, x8; \ + sbc x15, x8, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x8; \ + adc x14, xzr, xzr; \ + subs x9, x9, x16; \ + sbcs x10, x10, x15; \ + sbcs x11, x11, x14; \ sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x16, x9, #32; \ + add x9, x16, x9; \ + lsr x16, x9, #32; \ + subs x16, x16, x9; \ + sbc x15, x9, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x9; \ + adc x14, xzr, xzr; \ + subs x10, x10, x16; \ + sbcs x11, x11, x15; \ + sbcs x12, x12, x14; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x16, x10, #32; \ + add x10, x16, x10; \ + lsr x16, x10, #32; \ + subs x16, x16, x10; \ + sbc x15, x10, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x10; \ + adc x14, xzr, xzr; \ + subs x11, x11, x16; \ + sbcs x12, x12, x15; \ + sbcs x13, x13, x14; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Almost-Montgomery 
variant which we use when an input to other muls -// with the other argument fully reduced (which is always safe). In -// fact, with the Karatsuba-based Montgomery mul here, we don't even -// *need* the restriction that the other argument is reduced. - -#define amontsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ + stp x11, x12, [P0]; \ + stp x13, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + mul x8, x2, x5; \ + mul x14, x3, x6; \ + mul x15, x4, x7; \ + umulh x16, x2, x5; \ + umulh x17, x3, x6; \ + umulh x1, x4, x7; \ + adds x16, x16, x14; \ + adcs x17, x17, x15; \ + adc x1, x1, xzr; \ + adds x9, x16, x8; \ + adcs x10, x17, x16; \ + adcs x11, x1, x17; \ + adc x12, x1, xzr; \ adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x1, xzr; \ + subs x17, x2, x3; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x6, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x9, x9, x16; \ + adcs x10, x10, x15; \ + adcs x11, x11, x14; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x2, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x10, x10, x16; \ + adcs x11, x11, x15; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x3, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x6; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x11, x11, x16; \ + adcs x12, x12, x15; \ + adc x13, x13, x14; \ + adds x8, x8, x8; \ + adcs x9, x9, x9; \ adcs x10, x10, x10; \ adcs x11, x11, x11; \ adcs x12, x12, x12; \ adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs 
x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x17, xzr, xzr; \ + ldp x2, x3, [P0]; \ + adds x8, x8, x2; \ + adcs x9, x9, x3; \ + ldp x2, x3, [P0+16]; \ + adcs x10, x10, x2; \ + adcs x11, x11, x3; \ + ldp x2, x3, [P0+32]; \ + adcs x12, x12, x2; \ + adcs x13, x13, x3; \ + adc x17, x17, xzr; \ + lsl x4, x8, #32; \ + add x8, x4, x8; \ + lsr x4, x8, #32; \ + subs x4, x4, x8; \ + sbc x3, x8, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x8; \ + adc x2, xzr, xzr; \ + subs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, x2; \ sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x4, x9, #32; \ + add x9, x4, x9; \ + lsr x4, x9, #32; \ + subs x4, x4, x9; \ + sbc x3, x9, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x9; \ + adc x2, xzr, xzr; \ + subs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, x2; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x4, x10, #32; \ + add x10, x4, x10; \ + lsr x4, x10, #32; \ + subs x4, x4, x10; \ + sbc x3, x10, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x10; \ + adc x2, xzr, xzr; \ + subs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, x2; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, 
x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - mov x14, #-4294967295; \ - mov x15, #4294967295; \ - csel x14, x14, xzr, cs; \ - csel x15, x15, xzr, cs; \ - cset x16, cs; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, xzr; \ - adcs x12, x12, xzr; \ - adc x13, x13, xzr; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] + adds x17, x17, x8; \ + adcs x8, x9, xzr; \ + adcs x9, x10, xzr; \ + adcs x10, xzr, xzr; \ + mul x1, x5, x5; \ + adds x11, x11, x1; \ + mul x14, x6, x6; \ + mul x15, x7, x7; \ + umulh x1, x5, x5; \ + adcs x12, x12, x1; \ + umulh x1, x6, x6; \ + adcs x13, x13, x14; \ + adcs x17, x17, x1; \ + umulh x1, x7, x7; \ + adcs x8, x8, x15; \ + adcs x9, x9, x1; \ + adc x10, x10, xzr; \ + mul x1, x5, x6; \ + mul x14, x5, x7; \ + mul x15, x6, x7; \ + umulh x16, x5, x6; \ + adds x14, x14, x16; \ + umulh x16, x5, x7; \ + adcs x15, x15, x16; \ + umulh x16, x6, x7; \ + adc x16, x16, xzr; \ + adds x1, x1, x1; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adc x5, xzr, xzr; \ + adds x12, x12, x1; \ + adcs x13, x13, x14; \ + adcs x17, x17, x15; \ + adcs x8, x8, x16; \ + adcs x9, x9, x5; \ + adc x10, x10, xzr; \ + mov x1, #-4294967295; \ + mov x14, #4294967295; \ + mov x15, #1; \ + cmn x11, x1; \ + adcs xzr, x12, x14; \ + adcs xzr, x13, x15; \ + adcs xzr, x17, xzr; \ + adcs xzr, x8, xzr; \ + adcs xzr, x9, xzr; \ + adc x10, x10, xzr; \ + neg x10, x10; \ + and x1, x1, x10; \ + adds x11, x11, x1; \ + and x14, x14, x10; \ + adcs x12, x12, x14; \ + and x15, x15, x10; \ + adcs x13, x13, x15; \ + adcs x17, x17, xzr; \ + adcs x8, x8, xzr; \ + adc x9, x9, xzr; \ + stp x11, x12, [P0]; \ + stp x13, x17, [P0+16]; \ + stp x8, x9, [P0+32] // Corresponds exactly to bignum_sub_p384 @@ -830,7 +765,7 @@ S2N_BN_SYMBOL(p384_montjmixadd): // Main code, just a sequence of basic field operations // 8 * multiply + 3 * square + 7 * subtract - amontsqr_p384(zp2,z_1) + montsqr_p384(zp2,z_1) montmul_p384(y2a,z_1,y_2) montmul_p384(x2a,zp2,x_2) @@ -839,7 +774,7 @@ S2N_BN_SYMBOL(p384_montjmixadd): sub_p384(xd,x2a,x_1) sub_p384(yd,y2a,y_1) - amontsqr_p384(zz,xd) + montsqr_p384(zz,xd) montsqr_p384(ww,yd) montmul_p384(zzx1,zz,x_1) diff --git a/arm/p384/p384_montjmixadd_alt.S b/arm/p384/p384_montjmixadd_alt.S new file mode 100644 index 0000000000..f36301a11e --- /dev/null +++ b/arm/p384/p384_montjmixadd_alt.S @@ -0,0 +1,941 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. 
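+//
+// For reference, up to the exact operation scheduling and any special-case
+// handling of degenerate inputs, this computes the textbook Jacobian
+// mixed-addition formulas (all quantities in the Montgomery domain, with
+// products taken by the Montgomery multiplication/squaring macros below):
+//
+//     u_2 = x_2 * z_1^2,   s_2 = y_2 * z_1^3
+//     h   = u_2 - x_1,     r   = s_2 - y_1
+//     x_3 = r^2 - h^3 - 2 * x_1 * h^2
+//     y_3 = r * (x_1 * h^2 - x_3) - y_1 * h^3
+//     z_3 = z_1 * h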
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x12, x3, x5; \ + umulh x13, x3, x5; \ + mul x11, x3, x6; \ + umulh x14, x3, x6; \ + adds x13, x13, x11; \ + ldp x7, x8, [P2+16]; \ + mul x11, x3, x7; \ + umulh x15, x3, x7; \ + adcs x14, x14, x11; \ + mul x11, x3, x8; \ + umulh x16, x3, x8; \ + adcs x15, x15, x11; \ + ldp x9, x10, [P2+32]; \ + mul x11, x3, x9; \ + umulh x17, x3, x9; \ + adcs x16, x16, x11; \ + mul x11, x3, x10; \ + umulh x19, x3, x10; \ + adcs x17, x17, x11; \ + adc x19, x19, xzr; \ + mul x11, x4, x5; \ + adds x13, x13, x11; \ + mul x11, x4, x6; \ + adcs x14, x14, x11; \ + mul x11, x4, x7; \ + adcs x15, x15, x11; \ + mul x11, x4, x8; \ + adcs x16, x16, x11; \ + mul x11, x4, x9; \ + adcs x17, x17, x11; \ + mul x11, x4, x10; \ + adcs x19, x19, x11; \ + cset x20, cs; \ + umulh x11, x4, x5; \ + adds x14, x14, x11; \ + umulh x11, x4, x6; \ + adcs x15, x15, x11; \ + umulh x11, x4, x7; \ + adcs x16, x16, x11; \ + umulh x11, x4, x8; \ + adcs x17, x17, x11; \ + umulh x11, x4, x9; \ + adcs x19, x19, x11; \ + umulh x11, x4, x10; \ + adc x20, x20, x11; \ + ldp x3, x4, [P1+16]; \ + mul x11, x3, x5; \ + adds x14, x14, x11; \ + mul x11, x3, x6; \ + adcs x15, x15, x11; \ + mul x11, x3, x7; \ + adcs x16, x16, x11; \ + mul x11, x3, x8; \ + adcs x17, x17, x11; \ + mul x11, x3, x9; \ + adcs x19, x19, x11; \ + mul x11, x3, x10; \ + adcs x20, x20, x11; \ + cset x21, cs; \ + umulh x11, x3, x5; \ + adds x15, x15, x11; \ + umulh x11, x3, x6; \ + adcs x16, x16, x11; \ + umulh x11, x3, x7; \ + adcs x17, x17, x11; \ + umulh x11, x3, x8; \ + adcs x19, x19, x11; \ + umulh x11, x3, x9; \ + adcs x20, x20, x11; \ + umulh x11, x3, x10; \ + adc x21, x21, x11; \ + mul x11, x4, x5; \ + adds x15, x15, x11; \ + mul x11, x4, x6; \ + adcs x16, x16, x11; \ + mul x11, x4, x7; \ + adcs x17, x17, x11; \ + mul x11, x4, x8; \ + adcs x19, x19, x11; \ + mul x11, x4, x9; \ + adcs x20, x20, x11; \ + mul x11, x4, x10; \ + adcs x21, x21, x11; \ + cset x22, cs; \ + umulh x11, x4, x5; \ + adds x16, x16, x11; \ + umulh x11, x4, x6; \ + adcs x17, x17, x11; \ + umulh x11, x4, x7; \ + adcs x19, x19, 
x11; \ + umulh x11, x4, x8; \ + adcs x20, x20, x11; \ + umulh x11, x4, x9; \ + adcs x21, x21, x11; \ + umulh x11, x4, x10; \ + adc x22, x22, x11; \ + ldp x3, x4, [P1+32]; \ + mul x11, x3, x5; \ + adds x16, x16, x11; \ + mul x11, x3, x6; \ + adcs x17, x17, x11; \ + mul x11, x3, x7; \ + adcs x19, x19, x11; \ + mul x11, x3, x8; \ + adcs x20, x20, x11; \ + mul x11, x3, x9; \ + adcs x21, x21, x11; \ + mul x11, x3, x10; \ + adcs x22, x22, x11; \ + cset x2, cs; \ + umulh x11, x3, x5; \ + adds x17, x17, x11; \ + umulh x11, x3, x6; \ + adcs x19, x19, x11; \ + umulh x11, x3, x7; \ + adcs x20, x20, x11; \ + umulh x11, x3, x8; \ + adcs x21, x21, x11; \ + umulh x11, x3, x9; \ + adcs x22, x22, x11; \ + umulh x11, x3, x10; \ + adc x2, x2, x11; \ + mul x11, x4, x5; \ + adds x17, x17, x11; \ + mul x11, x4, x6; \ + adcs x19, x19, x11; \ + mul x11, x4, x7; \ + adcs x20, x20, x11; \ + mul x11, x4, x8; \ + adcs x21, x21, x11; \ + mul x11, x4, x9; \ + adcs x22, x22, x11; \ + mul x11, x4, x10; \ + adcs x2, x2, x11; \ + cset x1, cs; \ + umulh x11, x4, x5; \ + adds x19, x19, x11; \ + umulh x11, x4, x6; \ + adcs x20, x20, x11; \ + umulh x11, x4, x7; \ + adcs x21, x21, x11; \ + umulh x11, x4, x8; \ + adcs x22, x22, x11; \ + umulh x11, x4, x9; \ + adcs x2, x2, x11; \ + umulh x11, x4, x10; \ + adc x1, x1, x11; \ + lsl x7, x12, #32; \ + add x12, x7, x12; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x12; \ + mov x6, #0xffffffff; \ + mul x5, x6, x12; \ + umulh x6, x6, x12; \ + adds x7, x7, x5; \ + adcs x6, x6, x12; \ + adc x5, xzr, xzr; \ + subs x13, x13, x7; \ + sbcs x14, x14, x6; \ + sbcs x15, x15, x5; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x12, x12, xzr; \ + lsl x7, x13, #32; \ + add x13, x7, x13; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x13; \ + mov x6, #0xffffffff; \ + mul x5, x6, x13; \ + umulh x6, x6, x13; \ + adds x7, x7, x5; \ + adcs x6, x6, x13; \ + adc x5, xzr, xzr; \ + subs x14, x14, x7; \ + sbcs x15, x15, x6; \ + sbcs x16, x16, x5; \ + sbcs x17, x17, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + lsl x7, x14, #32; \ + add x14, x7, x14; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x14; \ + mov x6, #0xffffffff; \ + mul x5, x6, x14; \ + umulh x6, x6, x14; \ + adds x7, x7, x5; \ + adcs x6, x6, x14; \ + adc x5, xzr, xzr; \ + subs x15, x15, x7; \ + sbcs x16, x16, x6; \ + sbcs x17, x17, x5; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x14, x14, xzr; \ + lsl x7, x15, #32; \ + add x15, x7, x15; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x15; \ + mov x6, #0xffffffff; \ + mul x5, x6, x15; \ + umulh x6, x6, x15; \ + adds x7, x7, x5; \ + adcs x6, x6, x15; \ + adc x5, xzr, xzr; \ + subs x16, x16, x7; \ + sbcs x17, x17, x6; \ + sbcs x12, x12, x5; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + lsl x7, x16, #32; \ + add x16, x7, x16; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x16; \ + mov x6, #0xffffffff; \ + mul x5, x6, x16; \ + umulh x6, x6, x16; \ + adds x7, x7, x5; \ + adcs x6, x6, x16; \ + adc x5, xzr, xzr; \ + subs x17, x17, x7; \ + sbcs x12, x12, x6; \ + sbcs x13, x13, x5; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x7, x17, #32; \ + add x17, x7, x17; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x17; \ + mov x6, #0xffffffff; \ + mul x5, x6, x17; \ + umulh x6, x6, x17; \ + adds x7, x7, x5; \ + adcs x6, x6, x17; \ + adc x5, xzr, xzr; \ + subs x12, x12, x7; \ + sbcs x13, x13, x6; \ + sbcs x14, x14, x5; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbc x17, x17, xzr; \ + 
adds x12, x12, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x2; \ + adcs x17, x17, x1; \ + adc x10, xzr, xzr; \ + mov x11, #0xffffffff00000001; \ + adds x19, x12, x11; \ + mov x11, #0xffffffff; \ + adcs x20, x13, x11; \ + mov x11, #0x1; \ + adcs x21, x14, x11; \ + adcs x22, x15, xzr; \ + adcs x2, x16, xzr; \ + adcs x1, x17, xzr; \ + adcs x10, x10, xzr; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + csel x14, x14, x21, eq; \ + csel x15, x15, x22, eq; \ + csel x16, x16, x2, eq; \ + csel x17, x17, x1, eq; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16]; \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, 
x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + adc x6, xzr, xzr; \ + mov x8, #-4294967295; \ + adds x14, x2, x8; \ + mov x8, #4294967295; \ + adcs x15, x9, x8; \ + mov x8, #1; \ + adcs x16, x10, x8; \ + adcs x17, x11, xzr; \ + adcs x19, x12, xzr; \ + adcs x20, x13, xzr; \ + adcs x6, x6, xzr; \ + csel x2, x2, x14, eq; \ + csel x9, x9, x15, eq; \ + csel x10, x10, x16, eq; \ + csel x11, x11, x17, eq; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). In +// fact, with the Karatsuba-based Montgomery mul here, we don't even +// *need* the restriction that the other argument is reduced. 
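
To make the comment above concrete: the "almost" variant only promises a result that fits in six 64-bit limbs (i.e. below 2^384), not one fully reduced below p_384. When the accumulator overflows bit 384, the overflow is folded back in by adding 2^384 - p_384 = 2^128 + 2^96 - 2^32 + 1. Below is a minimal C sketch of just that correction step (a hypothetical helper, not part of this patch or the s2n-bignum API; little-endian 64-bit limbs assumed), mirroring the trailing csel/cset/adds run of the amontsqr_p384 macro defined next, whose selected constants are exactly the three nonzero limbs of 2^384 - p_384.

#include <stdint.h>

/* Hypothetical sketch: fold a carry out of bit 384 back into a 6-limb
   accumulator by adding carry * (2^384 - p_384). */
static void p384_fold_carry(uint64_t z[6], uint64_t carry /* 0 or 1 */)
{
    /* 2^384 - p_384 = 2^128 + 2^96 - 2^32 + 1, little-endian limbs */
    static const uint64_t rr[6] = {
        0xffffffff00000001ULL, 0x00000000ffffffffULL, 1ULL, 0, 0, 0
    };
    uint64_t m = 0 - (carry & 1);        /* all-ones mask, like csel on CS */
    unsigned __int128 acc = 0;
    for (int i = 0; i < 6; i++) {
        acc += (unsigned __int128)z[i] + (rr[i] & m);
        z[i] = (uint64_t)acc;
        acc >>= 64;
    }
    /* Like the final adc in the assembly, no carry is propagated out of the
       top limb; the s2n-bignum proofs bound the intermediate value so none
       occurs.  The result is guaranteed to fit in 384 bits, but may still
       be >= p_384. */
}

By contrast, the fully reduced montsqr_p384 above performs a trial addition of 2^384 - p_384 and selects between the original and the corrected value with csel, so its output is strictly below p_384.
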
+ +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, 
#32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + mov x14, #-4294967295; \ + mov x15, #4294967295; \ + csel x14, x14, xzr, cs; \ + csel x15, x15, xzr, cs; \ + cset x16, cs; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, xzr; \ + adcs x12, x12, xzr; \ + adc x13, x13, xzr; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + csetm x3, lo; \ + mov x4, #4294967295; \ + and x4, x4, x3; \ + adds x5, x5, x4; \ + eor x4, x4, x3; \ + adcs x6, x6, x4; \ + mov x4, #-2; \ + and x4, x4, x3; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + adcs x9, x9, x3; \ + adc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjmixadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p384(zp2,z_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,zp2,x_2) + montmul_p384(y2a,zp2,y2a) + + sub_p384(xd,x2a,x_1) + sub_p384(yd,y2a,y_1) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(resz,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y_1) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + orr x6, x0, x1 + orr x7, x2, x3 + orr x8, x4, x5 + orr x6, x6, x7 + orr x6, x6, x8 + cmp x6, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. 
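
As a concrete illustration of this multiplexing step, here is a C sketch (hypothetical helper names, not part of the patch) of the same selection done with masks: if z_1 = 0 the staged result is discarded in favour of (x_2, y_2, 1), where 1 in Montgomery form is 2^384 mod p_384 = 2^384 - p_384, matching the constants materialised for the z_3 limbs in the code below.

#include <stdint.h>

/* 1 in Montgomery form for p_384, i.e. 2^384 mod p_384 = 2^384 - p_384 */
static const uint64_t p384_montgomery_one[6] = {
    0xffffffff00000001ULL, 0x00000000ffffffffULL, 1ULL, 0, 0, 0
};

/* Branch-free 6-limb multiplexer: a when mask = 0, b when mask = ~0.
   This plays the role of the csel instructions in the assembly. */
static void mux6(uint64_t out[6], const uint64_t a[6],
                 const uint64_t b[6], uint64_t mask)
{
    for (int i = 0; i < 6; i++)
        out[i] = (a[i] & ~mask) | (b[i] & mask);
}

/* Hypothetical helper mirroring the tail of p384_montjmixadd_alt: if
   z_1 = 0 (p1 is the point at infinity), output (x_2, y_2, 1) instead of
   the computed Jacobian result, without any data-dependent branch. */
static void p384_mixadd_mux(uint64_t x3[6], uint64_t y3[6], uint64_t z3[6],
                            const uint64_t resx[6], const uint64_t resy[6],
                            const uint64_t resz[6],
                            const uint64_t x2[6], const uint64_t y2[6],
                            const uint64_t z1[6])
{
    uint64_t acc = 0;
    for (int i = 0; i < 6; i++)
        acc |= z1[i];
    /* all-ones when z_1 is zero, all-zeros otherwise */
    uint64_t z1_is_zero = ((acc | (0 - acc)) >> 63) - 1;

    mux6(x3, resx, x2, z1_is_zero);
    mux6(y3, resy, y2, z1_is_zero);
    mux6(z3, resz, p384_montgomery_one, z1_is_zero);
}

In the assembly, the single `cmp x6, xzr` sets the flags once and every following `csel ..., ne` reuses them (the interleaved ldp/mov instructions do not touch the flags), so the whole copy-out remains one straight-line, branch-free sequence.
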
+ + ldp x0, x1, [resx] + ldp x19, x20, [x_2] + csel x0, x0, x19, ne + csel x1, x1, x20, ne + ldp x2, x3, [resx+16] + ldp x19, x20, [x_2+16] + csel x2, x2, x19, ne + csel x3, x3, x20, ne + ldp x4, x5, [resx+32] + ldp x19, x20, [x_2+32] + csel x4, x4, x19, ne + csel x5, x5, x20, ne + + ldp x6, x7, [resy] + ldp x19, x20, [y_2] + csel x6, x6, x19, ne + csel x7, x7, x20, ne + ldp x8, x9, [resy+16] + ldp x19, x20, [y_2+16] + csel x8, x8, x19, ne + csel x9, x9, x20, ne + ldp x10, x11, [resy+32] + ldp x19, x20, [y_2+32] + csel x10, x10, x19, ne + csel x11, x11, x20, ne + + ldp x12, x13, [resz] + mov x19, #0xffffffff00000001 + mov x20, #0x00000000ffffffff + csel x12, x12, x19, ne + csel x13, x13, x20, ne + ldp x14, x15, [resz+16] + mov x19, #1 + csel x14, x14, x19, ne + csel x15, x15, xzr, ne + ldp x16, x17, [resz+32] + csel x16, x16, xzr, ne + csel x17, x17, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p521/Makefile b/arm/p521/Makefile index 7231f2ad9f..7980bdd9d4 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -45,9 +45,11 @@ OBJ = bignum_add_p521.o \ bignum_tomont_p521.o \ bignum_triple_p521.o \ p521_jadd.o \ + p521_jadd_alt.o \ p521_jdouble.o \ p521_jdouble_alt.o \ - p521_jmixadd.o + p521_jmixadd.o \ + p521_jmixadd_alt.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/arm/p521/p521_jadd.S b/arm/p521/p521_jadd.S index 93ab919aac..340766e6a2 100644 --- a/arm/p521/p521_jadd.S +++ b/arm/p521/p521_jadd.S @@ -73,660 +73,32 @@ #define z2sq sp, #(NUMSIZE*5) #define resz sp, #(NUMSIZE*5) -#define y1a sp, #(NUMSIZE*6) +#define tmp sp, #(NUMSIZE*6) -// NUMSIZE*7 is not 16-aligned so we round it up +#define y1a sp, #(NUMSIZE*7) -#define NSPACE (NUMSIZE*7+8) +#define NSPACE (NUMSIZE*8) -// Corresponds exactly to bignum_mul_p521_alt +// For the three field operations, we use subroutines not inlining. 
+// Call local code very close to bignum_mul_p521 and bignum_sqr_p521 +// and bignum_sub_p521 #define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, 
x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; 
\ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_mul_p521 #define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc 
x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds 
exactly to bignum_sub_p521 + add x0, P0; \ + add x1, P1; \ + bl local_sqr_p521 #define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_sub_p521 S2N_BN_SYMBOL(p521_jadd): @@ -737,6 +109,7 @@ S2N_BN_SYMBOL(p521_jadd): stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! sub sp, sp, NSPACE // Move the input arguments to stable places @@ -966,12 +339,1097 @@ S2N_BN_SYMBOL(p521_jadd): add sp, sp, NSPACE + ldp x29, x30, [sp], 16 ldp x27, x28, [sp], 16 ldp x25, x26, [sp], 16 ldp x23, x24, [sp], 16 ldp x21, x22, [sp], 16 ldp x19, x20, [sp], 16 + ret + +// Local versions of the three field operations, almost identical to +// bignum_mul_p521, bignum_sqr_p521 and bignum_sub_p521 except for +// avoiding all intial register save-restore, and in the case of +// local_mul_p521, using the tmp buffer as temporary storage and +// avoiding x26. + +local_mul_p521: + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, 
x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + lsl x21, x11, #9 + extr x11, x12, x11, #55 + extr x12, x13, x12, #55 + extr x13, x14, x13, #55 + lsr x14, x14, #55 + ldp x3, x4, [x1, #32] + ldp x5, x6, [x1, #48] + ldp x7, x8, [x2, #32] + ldp x9, x10, [x2, #48] + stp x15, x16, [tmp] + stp x17, x19, [tmp+16] + stp x21, x11, [tmp+32] + stp x12, x13, [tmp+48] + str x14, [tmp+64] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x23, x22, [tmp] + adds x11, x11, x23 + adcs x12, x12, x22 + stp x11, x12, [tmp] + ldp x23, x22, [tmp+16] + adcs x13, x13, x23 + adcs x14, x14, x22 + stp x13, x14, [tmp+16] + ldp x23, x22, [tmp+32] + adcs x15, x15, x23 + adcs x16, x16, x22 + stp 
x15, x16, [tmp+32] + ldp x23, x22, [tmp+48] + adcs x17, x17, x23 + adcs x19, x19, x22 + stp x17, x19, [tmp+48] + ldr x21, [tmp+64] + adc x21, x21, xzr + str x21, [tmp+64] + ldp x23, x22, [x1] + subs x3, x3, x23 + sbcs x4, x4, x22 + ldp x23, x22, [x1, #16] + sbcs x5, x5, x23 + sbcs x6, x6, x22 + csetm x24, lo + ldp x23, x22, [x2] + subs x7, x23, x7 + sbcs x8, x22, x8 + ldp x23, x22, [x2, #16] + sbcs x9, x23, x9 + sbcs x10, x22, x10 + csetm x25, lo + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + eor x7, x7, x25 + subs x7, x7, x25 + eor x8, x8, x25 + sbcs x8, x8, x25 + eor x9, x9, x25 + sbcs x9, x9, x25 + eor x10, x10, x25 + sbc x10, x10, x25 + eor x25, x25, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x3, x4, [tmp] + ldp x5, x6, [tmp+16] + eor x11, x11, x25 + adds x11, x11, x3 + eor x12, x12, x25 + adcs x12, x12, x4 + eor x13, x13, x25 + adcs x13, x13, x5 + eor x14, x14, x25 + adcs x14, x14, x6 + eor x15, x15, x25 + ldp x7, x8, [tmp+32] + ldp x9, x10, [tmp+48] + ldr x20, [tmp+64] + adcs x15, x15, x7 + eor x16, x16, x25 + adcs x16, x16, x8 + eor x17, x17, x25 + adcs x17, x17, x9 + eor x19, x19, x25 + adcs x19, x19, x10 + 
adc x21, x20, xzr + adds x15, x15, x3 + adcs x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + and x25, x25, #0x1ff + lsl x24, x11, #9 + orr x24, x24, x25 + adcs x7, x7, x24 + extr x24, x12, x11, #55 + adcs x8, x8, x24 + extr x24, x13, x12, #55 + adcs x9, x9, x24 + extr x24, x14, x13, #55 + adcs x10, x10, x24 + lsr x24, x14, #55 + adc x20, x24, x20 + ldr x6, [x2, #64] + ldp x3, x4, [x1] + and x23, x3, #0xfffffffffffff + mul x23, x6, x23 + ldr x14, [x1, #64] + ldp x11, x12, [x2] + and x24, x11, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + extr x24, x4, x3, #52 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x12, x11, #52 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #12 + adds x15, x15, x24 + ldp x5, x3, [x1, #16] + ldp x13, x11, [x2, #16] + extr x24, x5, x4, #40 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #40 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #24 + adcs x16, x16, x24 + extr x24, x3, x5, #28 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #28 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #36 + adcs x17, x17, x24 + and x25, x16, x17 + ldp x4, x5, [x1, #32] + ldp x12, x13, [x2, #32] + extr x24, x4, x3, #16 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #16 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsl x21, x21, #48 + add x23, x23, x21 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #48 + adcs x19, x19, x24 + and x25, x25, x19 + lsr x24, x4, #4 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + lsr x24, x12, #4 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x21, x22, x23, #60 + extr x24, x5, x4, #56 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #56 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x21, x21, #8 + extr x24, x23, x21, #8 + adcs x7, x7, x24 + and x25, x25, x7 + ldp x3, x4, [x1, #48] + ldp x11, x12, [x2, #48] + extr x24, x3, x5, #44 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #44 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #20 + adcs x8, x8, x24 + and x25, x25, x8 + extr x24, x4, x3, #32 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #32 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #32 + adcs x9, x9, x24 + and x25, x25, x9 + lsr x24, x4, #20 + mul x22, x6, x24 + lsr x24, x12, #20 + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #44 + adcs x10, x10, x24 + and x25, x25, x10 + mul x24, x6, x14 + lsr x22, x22, #44 + add x24, x24, x22 + adc x20, x20, x24 + lsr x22, x20, #9 + orr x20, x20, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x15, x22 + adcs xzr, x25, xzr + adcs xzr, x20, xzr + adcs x15, x15, x22 + adcs x16, x16, xzr + adcs x17, 
x17, xzr + adcs x19, x19, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x20, x20, xzr + and x22, x15, #0x1ff + extr x15, x16, x15, #9 + extr x16, x17, x16, #9 + stp x15, x16, [x0] + extr x17, x19, x17, #9 + extr x19, x7, x19, #9 + stp x17, x19, [x0, #16] + extr x7, x8, x7, #9 + extr x8, x9, x8, #9 + stp x7, x8, [x0, #32] + extr x9, x10, x9, #9 + extr x10, x20, x10, #9 + stp x9, x10, [x0, #48] + str x22, [x0, #64] + ret + +local_sqr_p521: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] + ldp x8, x9, [x1, #48] + mul x12, x6, x8 + mul x17, x7, x9 + umulh x22, x6, x8 + subs x23, x6, x7 + cneg x23, x23, cc + csetm x11, cc + subs x10, x9, x8 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x7, x9 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x6, x6 + mul x16, x7, x7 + mul x21, x6, x7 + umulh x11, x6, x6 + umulh x17, x7, x7 + umulh x20, x6, x7 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x8, x8 + mul x16, x9, x9 + mul x21, x8, x9 + umulh x15, x8, x8 + umulh x17, x9, x9 + umulh x20, x8, x9 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldr x19, [x1, #64] + add x23, x19, x19 + mul x19, x19, x19 + and x21, x2, #0xfffffffffffff + mul x21, x23, x21 + extr x20, x3, x2, #52 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #12 + adds x10, x10, x22 + extr x21, x4, x3, #40 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #24 + adcs x11, x11, x22 + extr x20, x5, x4, #28 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #36 + adcs x12, x12, x22 + extr x21, x6, x5, #16 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #48 + adcs x13, x13, x22 + lsr x20, x6, #4 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x24, x20, x21, #60 + extr x21, x7, x6, #56 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x24, x24, #8 + extr x22, x21, x24, #8 + adcs x14, x14, x22 + extr x20, x8, x7, #44 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #20 + adcs x15, x15, x22 + extr x21, x9, x8, #32 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #32 + adcs x16, x16, x22 + lsr x20, x9, #20 + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #44 + adcs x17, x17, x22 + lsr x20, x20, 
#44 + adc x19, x19, x20 + extr x21, x11, x10, #9 + extr x20, x12, x11, #9 + stp x21, x20, [x0] + extr x21, x13, x12, #9 + extr x20, x14, x13, #9 + stp x21, x20, [x0, #16] + extr x21, x15, x14, #9 + extr x20, x16, x15, #9 + stp x21, x20, [x0, #32] + extr x21, x17, x16, #9 + extr x20, x19, x17, #9 + stp x21, x20, [x0, #48] + and x22, x10, #0x1ff + lsr x19, x19, #9 + add x22, x22, x19 + str x22, [x0, #64] + mul x12, x2, x4 + mul x17, x3, x5 + umulh x22, x2, x4 + subs x23, x2, x3 + cneg x23, x23, cc + csetm x11, cc + subs x10, x5, x4 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x3, x5 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x2, x2 + mul x16, x3, x3 + mul x21, x2, x3 + umulh x11, x2, x2 + umulh x17, x3, x3 + umulh x20, x2, x3 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x4, x4 + mul x16, x5, x5 + mul x21, x4, x5 + umulh x15, x4, x4 + umulh x17, x5, x5 + umulh x20, x4, x5 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldp x21, x20, [x0] + adds x21, x21, x10 + adcs x20, x20, x11 + stp x21, x20, [x0] + ldp x21, x20, [x0, #16] + adcs x21, x21, x12 + adcs x20, x20, x13 + stp x21, x20, [x0, #16] + ldp x21, x20, [x0, #32] + adcs x21, x21, x14 + adcs x20, x20, x15 + stp x21, x20, [x0, #32] + ldp x21, x20, [x0, #48] + adcs x21, x21, x16 + adcs x20, x20, x17 + stp x21, x20, [x0, #48] + ldr x22, [x0, #64] + adc x22, x22, xzr + str x22, [x0, #64] + mul x10, x2, x6 + mul x14, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + umulh x17, x4, x8 + adcs x16, x16, x17 + umulh x17, x5, x9 + adc x17, x17, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x8 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc + csetm x19, cc + subs x20, x7, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, 
x8, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + ldp x21, x20, [x0] + extr x2, x15, x14, #8 + adds x2, x2, x21 + extr x3, x16, x15, #8 + adcs x3, x3, x20 + ldp x21, x20, [x0, #16] + extr x4, x17, x16, #8 + adcs x4, x4, x21 + and x22, x3, x4 + lsr x5, x17, #8 + adcs x5, x5, x20 + and x22, x22, x5 + ldp x21, x20, [x0, #32] + lsl x6, x10, #1 + adcs x6, x6, x21 + and x22, x22, x6 + extr x7, x11, x10, #63 + adcs x7, x7, x20 + and x22, x22, x7 + ldp x21, x20, [x0, #48] + extr x8, x12, x11, #63 + adcs x8, x8, x21 + and x22, x22, x8 + extr x9, x13, x12, #63 + adcs x9, x9, x20 + and x22, x22, x9 + ldr x21, [x0, #64] + extr x10, x14, x13, #63 + and x10, x10, #0x1ff + adc x10, x21, x10 + lsr x20, x10, #9 + orr x10, x10, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x2, x20 + adcs xzr, x22, xzr + adcs xzr, x10, xzr + adcs x2, x2, x20 + adcs x3, x3, xzr + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + and x10, x10, #0x1ff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + str x10, [x0, #64] + ret +local_sub_p521: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [x1, #48] + ldp x4, x3, [x2, #48] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [x1, #64] + ldr x4, [x2, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + str x13, [x0, #64] ret #if defined(__linux__) && defined(__ELF__) diff --git a/arm/p521/p521_jadd_alt.S b/arm/p521/p521_jadd_alt.S new file mode 100644 index 0000000000..72c9239be2 --- /dev/null +++ b/arm/p521/p521_jadd_alt.S @@ -0,0 +1,979 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
+// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_521, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, 
x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, 
x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + cmp xzr, xzr; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adcs x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + orr x13, x24, #0xfffffffffffffe00; \ + lsr x14, x21, #9; \ + adcs x13, x13, x14; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x11, x2, x3; \ + umulh x12, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x10, x2, 
x4; \ + umulh x13, x2, x4; \ + adds x12, x12, x10; \ + ldp x6, x7, [P1+32]; \ + mul x10, x2, x5; \ + umulh x14, x2, x5; \ + adcs x13, x13, x10; \ + ldp x8, x9, [P1+48]; \ + mul x10, x2, x6; \ + umulh x15, x2, x6; \ + adcs x14, x14, x10; \ + mul x10, x2, x7; \ + umulh x16, x2, x7; \ + adcs x15, x15, x10; \ + mul x10, x2, x8; \ + umulh x17, x2, x8; \ + adcs x16, x16, x10; \ + mul x10, x2, x9; \ + umulh x19, x2, x9; \ + adcs x17, x17, x10; \ + adc x19, x19, xzr; \ + mul x10, x3, x4; \ + adds x13, x13, x10; \ + mul x10, x3, x5; \ + adcs x14, x14, x10; \ + mul x10, x3, x6; \ + adcs x15, x15, x10; \ + mul x10, x3, x7; \ + adcs x16, x16, x10; \ + mul x10, x3, x8; \ + adcs x17, x17, x10; \ + mul x10, x3, x9; \ + adcs x19, x19, x10; \ + cset x20, hs; \ + umulh x10, x3, x4; \ + adds x14, x14, x10; \ + umulh x10, x3, x5; \ + adcs x15, x15, x10; \ + umulh x10, x3, x6; \ + adcs x16, x16, x10; \ + umulh x10, x3, x7; \ + adcs x17, x17, x10; \ + umulh x10, x3, x8; \ + adcs x19, x19, x10; \ + umulh x10, x3, x9; \ + adc x20, x20, x10; \ + mul x10, x6, x7; \ + umulh x21, x6, x7; \ + adds x20, x20, x10; \ + adc x21, x21, xzr; \ + mul x10, x4, x5; \ + adds x15, x15, x10; \ + mul x10, x4, x6; \ + adcs x16, x16, x10; \ + mul x10, x4, x7; \ + adcs x17, x17, x10; \ + mul x10, x4, x8; \ + adcs x19, x19, x10; \ + mul x10, x4, x9; \ + adcs x20, x20, x10; \ + mul x10, x6, x8; \ + adcs x21, x21, x10; \ + cset x22, hs; \ + umulh x10, x4, x5; \ + adds x16, x16, x10; \ + umulh x10, x4, x6; \ + adcs x17, x17, x10; \ + umulh x10, x4, x7; \ + adcs x19, x19, x10; \ + umulh x10, x4, x8; \ + adcs x20, x20, x10; \ + umulh x10, x4, x9; \ + adcs x21, x21, x10; \ + umulh x10, x6, x8; \ + adc x22, x22, x10; \ + mul x10, x7, x8; \ + umulh x23, x7, x8; \ + adds x22, x22, x10; \ + adc x23, x23, xzr; \ + mul x10, x5, x6; \ + adds x17, x17, x10; \ + mul x10, x5, x7; \ + adcs x19, x19, x10; \ + mul x10, x5, x8; \ + adcs x20, x20, x10; \ + mul x10, x5, x9; \ + adcs x21, x21, x10; \ + mul x10, x6, x9; \ + adcs x22, x22, x10; \ + mul x10, x7, x9; \ + adcs x23, x23, x10; \ + cset x24, hs; \ + umulh x10, x5, x6; \ + adds x19, x19, x10; \ + umulh x10, x5, x7; \ + adcs x20, x20, x10; \ + umulh x10, x5, x8; \ + adcs x21, x21, x10; \ + umulh x10, x5, x9; \ + adcs x22, x22, x10; \ + umulh x10, x6, x9; \ + adcs x23, x23, x10; \ + umulh x10, x7, x9; \ + adc x24, x24, x10; \ + mul x10, x8, x9; \ + umulh x25, x8, x9; \ + adds x24, x24, x10; \ + adc x25, x25, xzr; \ + adds x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + adcs x20, x20, x20; \ + adcs x21, x21, x21; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adcs x24, x24, x24; \ + adcs x25, x25, x25; \ + cset x0, hs; \ + umulh x10, x2, x2; \ + adds x11, x11, x10; \ + mul x10, x3, x3; \ + adcs x12, x12, x10; \ + umulh x10, x3, x3; \ + adcs x13, x13, x10; \ + mul x10, x4, x4; \ + adcs x14, x14, x10; \ + umulh x10, x4, x4; \ + adcs x15, x15, x10; \ + mul x10, x5, x5; \ + adcs x16, x16, x10; \ + umulh x10, x5, x5; \ + adcs x17, x17, x10; \ + mul x10, x6, x6; \ + adcs x19, x19, x10; \ + umulh x10, x6, x6; \ + adcs x20, x20, x10; \ + mul x10, x7, x7; \ + adcs x21, x21, x10; \ + umulh x10, x7, x7; \ + adcs x22, x22, x10; \ + mul x10, x8, x8; \ + adcs x23, x23, x10; \ + umulh x10, x8, x8; \ + adcs x24, x24, x10; \ + mul x10, x9, x9; \ + adcs x25, x25, x10; \ + umulh x10, x9, x9; \ + adc x0, x0, x10; \ + ldr x1, [P1+64]; \ + add x1, x1, x1; \ + mul x10, x1, x2; \ + 
adds x19, x19, x10; \ + umulh x10, x1, x2; \ + adcs x20, x20, x10; \ + mul x10, x1, x4; \ + adcs x21, x21, x10; \ + umulh x10, x1, x4; \ + adcs x22, x22, x10; \ + mul x10, x1, x6; \ + adcs x23, x23, x10; \ + umulh x10, x1, x6; \ + adcs x24, x24, x10; \ + mul x10, x1, x8; \ + adcs x25, x25, x10; \ + umulh x10, x1, x8; \ + adcs x0, x0, x10; \ + lsr x4, x1, #1; \ + mul x4, x4, x4; \ + adc x4, x4, xzr; \ + mul x10, x1, x3; \ + adds x20, x20, x10; \ + umulh x10, x1, x3; \ + adcs x21, x21, x10; \ + mul x10, x1, x5; \ + adcs x22, x22, x10; \ + umulh x10, x1, x5; \ + adcs x23, x23, x10; \ + mul x10, x1, x7; \ + adcs x24, x24, x10; \ + umulh x10, x1, x7; \ + adcs x25, x25, x10; \ + mul x10, x1, x9; \ + adcs x0, x0, x10; \ + umulh x10, x1, x9; \ + adc x4, x4, x10; \ + mul x2, x2, x2; \ + cmp xzr, xzr; \ + extr x10, x20, x19, #9; \ + adcs x2, x2, x10; \ + extr x10, x21, x20, #9; \ + adcs x11, x11, x10; \ + extr x10, x22, x21, #9; \ + adcs x12, x12, x10; \ + extr x10, x23, x22, #9; \ + adcs x13, x13, x10; \ + extr x10, x24, x23, #9; \ + adcs x14, x14, x10; \ + extr x10, x25, x24, #9; \ + adcs x15, x15, x10; \ + extr x10, x0, x25, #9; \ + adcs x16, x16, x10; \ + extr x10, x4, x0, #9; \ + adcs x17, x17, x10; \ + orr x19, x19, #0xfffffffffffffe00; \ + lsr x10, x4, #9; \ + adcs x19, x19, x10; \ + sbcs x2, x2, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x19, x19, xzr; \ + and x19, x19, #0x1ff; \ + stp x2, x11, [P0]; \ + stp x12, x13, [P0+16]; \ + stp x14, x15, [P0+32]; \ + stp x16, x17, [P0+48]; \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + sbcs x13, x13, x4; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! 
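+
+// With NUMSIZE = 72, the subtraction below reserves NSPACE = NUMSIZE*7+8
+// = 512 bytes, a multiple of 16, so sp stays 16-byte aligned on top of
+// the five stp pushes above.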
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p521(z1sq,z_1) + sqr_p521(z2sq,z_2) + + mul_p521(y1a,z_2,y_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,z1sq,x_2) + mul_p521(x1a,z2sq,x_1) + mul_p521(y2a,z1sq,y2a) + mul_p521(y1a,z2sq,y1a) + + sub_p521(xd,x2a,x1a) + sub_p521(yd,y2a,y1a) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x1a) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(xd,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y1a) + mul_p521(resz,xd,z_2) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 +// Multiplex the z outputs accordingly and re-store in resz + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + ldp x6, x7, [z_1+48] + ldr x8, [z_1+64] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x23, x6, x7 + orr x20, x20, x21 + orr x22, x22, x23 + orr x20, x20, x8 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x10, x11, [z_2] + ldp x12, x13, [z_2+16] + ldp x14, x15, [z_2+32] + ldp x16, x17, [z_2+48] + ldr x19, [z_2+64] + + orr x21, x10, x11 + orr x22, x12, x13 + orr x23, x14, x15 + orr x24, x16, x17 + orr x21, x21, x22 + orr x23, x23, x24 + orr x21, x21, x19 + orr x21, x21, x23 + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + + ldp x10, x11, [resz] + ldp x12, x13, [resz+16] + ldp x14, x15, [resz+32] + ldp x16, x17, [resz+48] + ldr x19, [resz+64] + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + stp x0, x1, [resz] + stp x2, x3, [resz+16] + stp x4, x5, [resz+32] + stp x6, x7, [resz+48] + str x8, [resz+64] + +// Multiplex the x and y outputs too, keeping the results in registers + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [x_1+48] + ldp x6, x7, [resx+48] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [x_2+48] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldr x20, [x_1+64] + ldr x8, [resx+64] + csel x8, x20, x8, lo + ldr x21, [x_2+64] + csel x8, x21, x8, hi + + + ldp x20, x21, [y_1] + ldp x10, x11, [resy] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + + ldp x20, x21, [y_1+16] + ldp x12, x13, [resy+16] + csel x12, x20, x12, lo + csel x13, x21, x13, lo + ldp x20, x21, [y_2+16] + csel x12, x20, x12, hi + csel x13, x21, x13, hi + + ldp x20, x21, [y_1+32] + ldp 
x14, x15, [resy+32] + csel x14, x20, x14, lo + csel x15, x21, x15, lo + ldp x20, x21, [y_2+32] + csel x14, x20, x14, hi + csel x15, x21, x15, hi + + ldp x20, x21, [y_1+48] + ldp x16, x17, [resy+48] + csel x16, x20, x16, lo + csel x17, x21, x17, lo + ldp x20, x21, [y_2+48] + csel x16, x20, x16, hi + csel x17, x21, x17, hi + + ldr x20, [y_1+64] + ldr x19, [resy+64] + csel x19, x20, x19, lo + ldr x21, [y_2+64] + csel x19, x21, x19, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + + ldp x0, x1, [resz] + ldp x2, x3, [resz+16] + ldp x4, x5, [resz+32] + ldp x6, x7, [resz+48] + ldr x8, [resz+64] + + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p521/p521_jmixadd.S b/arm/p521/p521_jmixadd.S index 082f77f809..b04e39327f 100644 --- a/arm/p521/p521_jmixadd.S +++ b/arm/p521/p521_jmixadd.S @@ -72,656 +72,30 @@ #define xd sp, #(NUMSIZE*5) #define resz sp, #(NUMSIZE*5) -#define NSPACE (NUMSIZE*6) +#define tmp sp, #(NUMSIZE*6) -// Corresponds exactly to bignum_mul_p521_alt +#define NSPACE (NUMSIZE*7+8) + +// For the three field operations, we use subroutines not inlining. +// Call local code very close to bignum_mul_p521 and bignum_sqr_p521 +// and bignum_sub_p521 #define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, 
x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; 
\ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_mul_p521 #define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - 
umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, 
x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds exactly to bignum_sub_p521 + add x0, P0; \ + add x1, P1; \ + bl local_sqr_p521 #define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_sub_p521 S2N_BN_SYMBOL(p521_jmixadd): @@ -732,6 +106,7 @@ S2N_BN_SYMBOL(p521_jmixadd): stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! sub sp, sp, NSPACE // Move the input arguments to stable places @@ -869,12 +244,1097 @@ S2N_BN_SYMBOL(p521_jmixadd): add sp, sp, NSPACE + ldp x29, x30, [sp], 16 ldp x27, x28, [sp], 16 ldp x25, x26, [sp], 16 ldp x23, x24, [sp], 16 ldp x21, x22, [sp], 16 ldp x19, x20, [sp], 16 + ret + +// Local versions of the three field operations, almost identical to +// bignum_mul_p521, bignum_sqr_p521 and bignum_sub_p521 except for +// avoiding all intial register save-restore, and in the case of +// local_mul_p521, using the tmp buffer as temporary storage and +// avoiding x26. 
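+//
+// As a sketch of what the redefined wrapper macros above now expand to,
+// mul_p521(P0,P1,P2), where each of P0/P1/P2 is one of the pointer-offset
+// pairs defined earlier (e.g. xd, which is sp, #(NUMSIZE*5)), becomes
+//
+//      add x0, P0              // x0 = address of the destination
+//      add x1, P1              // x1 = address of the first source
+//      add x2, P2              // x2 = address of the second source
+//      bl  local_mul_p521
+//
+// so the operands are passed in x0-x2 in the same way the standalone
+// bignum_mul_p521 takes its arguments, and the bl is why x29/x30 are now
+// saved in the prologue and restored before the final ret.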
+ +local_mul_p521: + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + lsl x21, x11, #9 + extr x11, x12, x11, #55 + extr x12, x13, x12, #55 + extr x13, x14, x13, #55 + lsr x14, x14, #55 + ldp x3, x4, [x1, #32] + ldp x5, x6, [x1, #48] + ldp x7, x8, [x2, #32] + ldp x9, x10, [x2, #48] + stp x15, x16, [tmp] + stp x17, x19, [tmp+16] + stp x21, x11, [tmp+32] + stp x12, x13, [tmp+48] + str x14, [tmp+64] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg 
x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x23, x22, [tmp] + adds x11, x11, x23 + adcs x12, x12, x22 + stp x11, x12, [tmp] + ldp x23, x22, [tmp+16] + adcs x13, x13, x23 + adcs x14, x14, x22 + stp x13, x14, [tmp+16] + ldp x23, x22, [tmp+32] + adcs x15, x15, x23 + adcs x16, x16, x22 + stp x15, x16, [tmp+32] + ldp x23, x22, [tmp+48] + adcs x17, x17, x23 + adcs x19, x19, x22 + stp x17, x19, [tmp+48] + ldr x21, [tmp+64] + adc x21, x21, xzr + str x21, [tmp+64] + ldp x23, x22, [x1] + subs x3, x3, x23 + sbcs x4, x4, x22 + ldp x23, x22, [x1, #16] + sbcs x5, x5, x23 + sbcs x6, x6, x22 + csetm x24, lo + ldp x23, x22, [x2] + subs x7, x23, x7 + sbcs x8, x22, x8 + ldp x23, x22, [x2, #16] + sbcs x9, x23, x9 + sbcs x10, x22, x10 + csetm x25, lo + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + eor x7, x7, x25 + subs x7, x7, x25 + eor x8, x8, x25 + sbcs x8, x8, x25 + eor x9, x9, x25 + sbcs x9, x9, x25 + eor x10, x10, x25 + sbc x10, x10, x25 + eor x25, x25, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, 
x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x3, x4, [tmp] + ldp x5, x6, [tmp+16] + eor x11, x11, x25 + adds x11, x11, x3 + eor x12, x12, x25 + adcs x12, x12, x4 + eor x13, x13, x25 + adcs x13, x13, x5 + eor x14, x14, x25 + adcs x14, x14, x6 + eor x15, x15, x25 + ldp x7, x8, [tmp+32] + ldp x9, x10, [tmp+48] + ldr x20, [tmp+64] + adcs x15, x15, x7 + eor x16, x16, x25 + adcs x16, x16, x8 + eor x17, x17, x25 + adcs x17, x17, x9 + eor x19, x19, x25 + adcs x19, x19, x10 + adc x21, x20, xzr + adds x15, x15, x3 + adcs x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + and x25, x25, #0x1ff + lsl x24, x11, #9 + orr x24, x24, x25 + adcs x7, x7, x24 + extr x24, x12, x11, #55 + adcs x8, x8, x24 + extr x24, x13, x12, #55 + adcs x9, x9, x24 + extr x24, x14, x13, #55 + adcs x10, x10, x24 + lsr x24, x14, #55 + adc x20, x24, x20 + ldr x6, [x2, #64] + ldp x3, x4, [x1] + and x23, x3, #0xfffffffffffff + mul x23, x6, x23 + ldr x14, [x1, #64] + ldp x11, x12, [x2] + and x24, x11, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + extr x24, x4, x3, #52 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x12, x11, #52 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #12 + adds x15, x15, x24 + ldp x5, x3, [x1, #16] + ldp x13, x11, [x2, #16] + extr x24, x5, x4, #40 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #40 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #24 + adcs x16, x16, x24 + extr x24, x3, x5, #28 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #28 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #36 + adcs x17, x17, x24 + and x25, x16, x17 + ldp x4, x5, [x1, #32] + ldp x12, x13, [x2, #32] + extr x24, x4, x3, #16 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #16 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsl x21, 
x21, #48 + add x23, x23, x21 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #48 + adcs x19, x19, x24 + and x25, x25, x19 + lsr x24, x4, #4 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + lsr x24, x12, #4 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x21, x22, x23, #60 + extr x24, x5, x4, #56 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #56 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x21, x21, #8 + extr x24, x23, x21, #8 + adcs x7, x7, x24 + and x25, x25, x7 + ldp x3, x4, [x1, #48] + ldp x11, x12, [x2, #48] + extr x24, x3, x5, #44 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #44 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #20 + adcs x8, x8, x24 + and x25, x25, x8 + extr x24, x4, x3, #32 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #32 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #32 + adcs x9, x9, x24 + and x25, x25, x9 + lsr x24, x4, #20 + mul x22, x6, x24 + lsr x24, x12, #20 + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #44 + adcs x10, x10, x24 + and x25, x25, x10 + mul x24, x6, x14 + lsr x22, x22, #44 + add x24, x24, x22 + adc x20, x20, x24 + lsr x22, x20, #9 + orr x20, x20, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x15, x22 + adcs xzr, x25, xzr + adcs xzr, x20, xzr + adcs x15, x15, x22 + adcs x16, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x20, x20, xzr + and x22, x15, #0x1ff + extr x15, x16, x15, #9 + extr x16, x17, x16, #9 + stp x15, x16, [x0] + extr x17, x19, x17, #9 + extr x19, x7, x19, #9 + stp x17, x19, [x0, #16] + extr x7, x8, x7, #9 + extr x8, x9, x8, #9 + stp x7, x8, [x0, #32] + extr x9, x10, x9, #9 + extr x10, x20, x10, #9 + stp x9, x10, [x0, #48] + str x22, [x0, #64] + ret + +local_sqr_p521: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] + ldp x8, x9, [x1, #48] + mul x12, x6, x8 + mul x17, x7, x9 + umulh x22, x6, x8 + subs x23, x6, x7 + cneg x23, x23, cc + csetm x11, cc + subs x10, x9, x8 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x7, x9 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x6, x6 + mul x16, x7, x7 + mul x21, x6, x7 + umulh x11, x6, x6 + umulh x17, x7, x7 + umulh x20, x6, x7 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x8, x8 + mul x16, x9, x9 + mul x21, x8, x9 + umulh x15, x8, x8 + umulh x17, x9, x9 + umulh x20, x8, x9 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + 
adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldr x19, [x1, #64] + add x23, x19, x19 + mul x19, x19, x19 + and x21, x2, #0xfffffffffffff + mul x21, x23, x21 + extr x20, x3, x2, #52 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #12 + adds x10, x10, x22 + extr x21, x4, x3, #40 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #24 + adcs x11, x11, x22 + extr x20, x5, x4, #28 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #36 + adcs x12, x12, x22 + extr x21, x6, x5, #16 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #48 + adcs x13, x13, x22 + lsr x20, x6, #4 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x24, x20, x21, #60 + extr x21, x7, x6, #56 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x24, x24, #8 + extr x22, x21, x24, #8 + adcs x14, x14, x22 + extr x20, x8, x7, #44 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #20 + adcs x15, x15, x22 + extr x21, x9, x8, #32 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #32 + adcs x16, x16, x22 + lsr x20, x9, #20 + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #44 + adcs x17, x17, x22 + lsr x20, x20, #44 + adc x19, x19, x20 + extr x21, x11, x10, #9 + extr x20, x12, x11, #9 + stp x21, x20, [x0] + extr x21, x13, x12, #9 + extr x20, x14, x13, #9 + stp x21, x20, [x0, #16] + extr x21, x15, x14, #9 + extr x20, x16, x15, #9 + stp x21, x20, [x0, #32] + extr x21, x17, x16, #9 + extr x20, x19, x17, #9 + stp x21, x20, [x0, #48] + and x22, x10, #0x1ff + lsr x19, x19, #9 + add x22, x22, x19 + str x22, [x0, #64] + mul x12, x2, x4 + mul x17, x3, x5 + umulh x22, x2, x4 + subs x23, x2, x3 + cneg x23, x23, cc + csetm x11, cc + subs x10, x5, x4 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x3, x5 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x2, x2 + mul x16, x3, x3 + mul x21, x2, x3 + umulh x11, x2, x2 + umulh x17, x3, x3 + umulh x20, x2, x3 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x4, x4 + mul x16, x5, x5 + mul x21, x4, x5 + umulh x15, x4, x4 + umulh x17, x5, x5 + umulh x20, x4, x5 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldp x21, x20, [x0] + adds x21, x21, x10 + 
adcs x20, x20, x11 + stp x21, x20, [x0] + ldp x21, x20, [x0, #16] + adcs x21, x21, x12 + adcs x20, x20, x13 + stp x21, x20, [x0, #16] + ldp x21, x20, [x0, #32] + adcs x21, x21, x14 + adcs x20, x20, x15 + stp x21, x20, [x0, #32] + ldp x21, x20, [x0, #48] + adcs x21, x21, x16 + adcs x20, x20, x17 + stp x21, x20, [x0, #48] + ldr x22, [x0, #64] + adc x22, x22, xzr + str x22, [x0, #64] + mul x10, x2, x6 + mul x14, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + umulh x17, x4, x8 + adcs x16, x16, x17 + umulh x17, x5, x9 + adc x17, x17, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x8 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc + csetm x19, cc + subs x20, x7, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + ldp x21, x20, [x0] + extr x2, x15, x14, #8 + adds x2, x2, x21 + extr x3, x16, x15, #8 + adcs x3, x3, x20 + ldp x21, x20, [x0, #16] + extr x4, x17, x16, #8 + adcs x4, x4, x21 + and x22, x3, x4 + lsr x5, x17, #8 + adcs x5, x5, x20 + and x22, x22, x5 + ldp x21, x20, [x0, #32] + lsl x6, x10, #1 + adcs x6, x6, x21 + and x22, x22, x6 + extr x7, x11, x10, #63 + adcs x7, x7, x20 + and x22, x22, x7 + ldp x21, x20, [x0, #48] + extr x8, x12, x11, #63 + adcs x8, x8, x21 + and x22, x22, x8 + extr x9, x13, x12, #63 + adcs x9, x9, x20 + and x22, x22, x9 + ldr x21, [x0, #64] + extr x10, x14, x13, #63 + and x10, x10, #0x1ff + adc x10, x21, x10 + lsr x20, x10, #9 + orr x10, x10, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x2, x20 + adcs xzr, x22, xzr + adcs xzr, x10, xzr + adcs x2, x2, x20 + adcs x3, x3, xzr + 
adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + and x10, x10, #0x1ff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + str x10, [x0, #64] + ret +local_sub_p521: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [x1, #48] + ldp x4, x3, [x2, #48] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [x1, #64] + ldr x4, [x2, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + str x13, [x0, #64] ret #if defined(__linux__) && defined(__ELF__) diff --git a/arm/p521/p521_jmixadd_alt.S b/arm/p521/p521_jmixadd_alt.S new file mode 100644 index 0000000000..783ca28cf8 --- /dev/null +++ b/arm/p521/p521_jmixadd_alt.S @@ -0,0 +1,882 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jmixadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_521, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". 
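+//
+// As a layout sketch: each coordinate occupies NUMSIZE = 72 bytes
+// (nine 64-bit limbs, least significant first), so p1 = {x_1, y_1, z_1}
+// is 27 limbs and p2 = {x_2, y_2} is 18 limbs; no z coordinate is stored
+// for p2 at all.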
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds 
x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + 
umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + cmp xzr, xzr; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adcs x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + orr x13, x24, #0xfffffffffffffe00; \ + lsr x14, x21, #9; \ + adcs x13, x13, x14; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x11, x2, x3; \ + umulh x12, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x10, x2, x4; \ + umulh x13, x2, x4; \ + adds x12, x12, x10; \ + ldp x6, x7, [P1+32]; \ + mul x10, x2, x5; \ + umulh x14, x2, x5; \ + adcs x13, x13, x10; \ + ldp x8, x9, [P1+48]; \ + mul x10, x2, x6; \ + umulh x15, x2, x6; \ + adcs x14, x14, x10; \ + mul x10, x2, x7; \ + umulh x16, x2, x7; \ + adcs x15, x15, x10; \ + mul x10, x2, x8; \ + umulh x17, x2, x8; \ + adcs x16, x16, x10; \ + mul x10, x2, x9; \ + umulh x19, x2, x9; \ 
+ adcs x17, x17, x10; \ + adc x19, x19, xzr; \ + mul x10, x3, x4; \ + adds x13, x13, x10; \ + mul x10, x3, x5; \ + adcs x14, x14, x10; \ + mul x10, x3, x6; \ + adcs x15, x15, x10; \ + mul x10, x3, x7; \ + adcs x16, x16, x10; \ + mul x10, x3, x8; \ + adcs x17, x17, x10; \ + mul x10, x3, x9; \ + adcs x19, x19, x10; \ + cset x20, hs; \ + umulh x10, x3, x4; \ + adds x14, x14, x10; \ + umulh x10, x3, x5; \ + adcs x15, x15, x10; \ + umulh x10, x3, x6; \ + adcs x16, x16, x10; \ + umulh x10, x3, x7; \ + adcs x17, x17, x10; \ + umulh x10, x3, x8; \ + adcs x19, x19, x10; \ + umulh x10, x3, x9; \ + adc x20, x20, x10; \ + mul x10, x6, x7; \ + umulh x21, x6, x7; \ + adds x20, x20, x10; \ + adc x21, x21, xzr; \ + mul x10, x4, x5; \ + adds x15, x15, x10; \ + mul x10, x4, x6; \ + adcs x16, x16, x10; \ + mul x10, x4, x7; \ + adcs x17, x17, x10; \ + mul x10, x4, x8; \ + adcs x19, x19, x10; \ + mul x10, x4, x9; \ + adcs x20, x20, x10; \ + mul x10, x6, x8; \ + adcs x21, x21, x10; \ + cset x22, hs; \ + umulh x10, x4, x5; \ + adds x16, x16, x10; \ + umulh x10, x4, x6; \ + adcs x17, x17, x10; \ + umulh x10, x4, x7; \ + adcs x19, x19, x10; \ + umulh x10, x4, x8; \ + adcs x20, x20, x10; \ + umulh x10, x4, x9; \ + adcs x21, x21, x10; \ + umulh x10, x6, x8; \ + adc x22, x22, x10; \ + mul x10, x7, x8; \ + umulh x23, x7, x8; \ + adds x22, x22, x10; \ + adc x23, x23, xzr; \ + mul x10, x5, x6; \ + adds x17, x17, x10; \ + mul x10, x5, x7; \ + adcs x19, x19, x10; \ + mul x10, x5, x8; \ + adcs x20, x20, x10; \ + mul x10, x5, x9; \ + adcs x21, x21, x10; \ + mul x10, x6, x9; \ + adcs x22, x22, x10; \ + mul x10, x7, x9; \ + adcs x23, x23, x10; \ + cset x24, hs; \ + umulh x10, x5, x6; \ + adds x19, x19, x10; \ + umulh x10, x5, x7; \ + adcs x20, x20, x10; \ + umulh x10, x5, x8; \ + adcs x21, x21, x10; \ + umulh x10, x5, x9; \ + adcs x22, x22, x10; \ + umulh x10, x6, x9; \ + adcs x23, x23, x10; \ + umulh x10, x7, x9; \ + adc x24, x24, x10; \ + mul x10, x8, x9; \ + umulh x25, x8, x9; \ + adds x24, x24, x10; \ + adc x25, x25, xzr; \ + adds x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + adcs x20, x20, x20; \ + adcs x21, x21, x21; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adcs x24, x24, x24; \ + adcs x25, x25, x25; \ + cset x0, hs; \ + umulh x10, x2, x2; \ + adds x11, x11, x10; \ + mul x10, x3, x3; \ + adcs x12, x12, x10; \ + umulh x10, x3, x3; \ + adcs x13, x13, x10; \ + mul x10, x4, x4; \ + adcs x14, x14, x10; \ + umulh x10, x4, x4; \ + adcs x15, x15, x10; \ + mul x10, x5, x5; \ + adcs x16, x16, x10; \ + umulh x10, x5, x5; \ + adcs x17, x17, x10; \ + mul x10, x6, x6; \ + adcs x19, x19, x10; \ + umulh x10, x6, x6; \ + adcs x20, x20, x10; \ + mul x10, x7, x7; \ + adcs x21, x21, x10; \ + umulh x10, x7, x7; \ + adcs x22, x22, x10; \ + mul x10, x8, x8; \ + adcs x23, x23, x10; \ + umulh x10, x8, x8; \ + adcs x24, x24, x10; \ + mul x10, x9, x9; \ + adcs x25, x25, x10; \ + umulh x10, x9, x9; \ + adc x0, x0, x10; \ + ldr x1, [P1+64]; \ + add x1, x1, x1; \ + mul x10, x1, x2; \ + adds x19, x19, x10; \ + umulh x10, x1, x2; \ + adcs x20, x20, x10; \ + mul x10, x1, x4; \ + adcs x21, x21, x10; \ + umulh x10, x1, x4; \ + adcs x22, x22, x10; \ + mul x10, x1, x6; \ + adcs x23, x23, x10; \ + umulh x10, x1, x6; \ + adcs x24, x24, x10; \ + mul x10, x1, x8; \ + adcs x25, x25, x10; \ + umulh x10, x1, x8; \ + adcs x0, x0, x10; \ + lsr x4, x1, #1; \ + mul x4, x4, x4; \ + adc x4, x4, xzr; \ + mul x10, x1, x3; 
\ + adds x20, x20, x10; \ + umulh x10, x1, x3; \ + adcs x21, x21, x10; \ + mul x10, x1, x5; \ + adcs x22, x22, x10; \ + umulh x10, x1, x5; \ + adcs x23, x23, x10; \ + mul x10, x1, x7; \ + adcs x24, x24, x10; \ + umulh x10, x1, x7; \ + adcs x25, x25, x10; \ + mul x10, x1, x9; \ + adcs x0, x0, x10; \ + umulh x10, x1, x9; \ + adc x4, x4, x10; \ + mul x2, x2, x2; \ + cmp xzr, xzr; \ + extr x10, x20, x19, #9; \ + adcs x2, x2, x10; \ + extr x10, x21, x20, #9; \ + adcs x11, x11, x10; \ + extr x10, x22, x21, #9; \ + adcs x12, x12, x10; \ + extr x10, x23, x22, #9; \ + adcs x13, x13, x10; \ + extr x10, x24, x23, #9; \ + adcs x14, x14, x10; \ + extr x10, x25, x24, #9; \ + adcs x15, x15, x10; \ + extr x10, x0, x25, #9; \ + adcs x16, x16, x10; \ + extr x10, x4, x0, #9; \ + adcs x17, x17, x10; \ + orr x19, x19, #0xfffffffffffffe00; \ + lsr x10, x4, #9; \ + adcs x19, x19, x10; \ + sbcs x2, x2, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x19, x19, xzr; \ + and x19, x19, #0x1ff; \ + stp x2, x11, [P0]; \ + stp x12, x13, [P0+16]; \ + stp x14, x15, [P0+32]; \ + stp x16, x17, [P0+48]; \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + sbcs x13, x13, x4; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jmixadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p521(zp2,z_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,zp2,x_2) + mul_p521(y2a,zp2,y2a) + + sub_p521(xd,x2a,x_1) + sub_p521(yd,y2a,y_1) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x_1) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(resz,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y_1) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + orr x0, x0, x1 + ldp x2, x3, [z_1+16] + orr x2, x2, x3 + ldp x4, x5, [z_1+32] + orr x4, x4, x5 + ldp x6, x7, [z_1+48] + orr x6, x6, x7 + ldr x8, [z_1+64] + orr x0, x0, x2 + orr x4, x4, x6 + orr x0, x0, x4 + orr x0, x0, x8 + cmp x0, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. 
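+//
+// (Illustrative note on the selection idiom used below, assuming the flags
+// from the "cmp x0, xzr" above are still live: each pair of coordinate
+// words is handled by a pattern of the shape
+//
+//     ldp  xA, xB, [resx]      // words of the computed result
+//     ldp  x20, x21, [x_2]     // corresponding words of p2
+//     csel xA, xA, x20, ne     // keep the result iff z_1 was nonzero
+//     csel xB, xB, x21, ne
+//
+// so the choice is made by conditional selects rather than by any
+// data-dependent branch.)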
+ + ldp x0, x1, [resx] + ldp x20, x21, [x_2] + csel x0, x0, x20, ne + csel x1, x1, x21, ne + ldp x2, x3, [resx+16] + ldp x20, x21, [x_2+16] + csel x2, x2, x20, ne + csel x3, x3, x21, ne + ldp x4, x5, [resx+32] + ldp x20, x21, [x_2+32] + csel x4, x4, x20, ne + csel x5, x5, x21, ne + ldp x6, x7, [resx+48] + ldp x20, x21, [x_2+48] + csel x6, x6, x20, ne + csel x7, x7, x21, ne + ldr x8, [resx+64] + ldr x20, [x_2+64] + csel x8, x8, x20, ne + + ldp x10, x11, [resy] + ldp x20, x21, [y_2] + csel x10, x10, x20, ne + csel x11, x11, x21, ne + ldp x12, x13, [resy+16] + ldp x20, x21, [y_2+16] + csel x12, x12, x20, ne + csel x13, x13, x21, ne + ldp x14, x15, [resy+32] + ldp x20, x21, [y_2+32] + csel x14, x14, x20, ne + csel x15, x15, x21, ne + ldp x16, x17, [resy+48] + ldp x20, x21, [y_2+48] + csel x16, x16, x20, ne + csel x17, x17, x21, ne + ldr x19, [resy+64] + ldr x20, [y_2+64] + csel x19, x19, x20, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + ldp x0, x1, [resz] + mov x20, #1 + csel x0, x0, x20, ne + csel x1, x1, xzr, ne + ldp x2, x3, [resz+16] + csel x2, x2, xzr, ne + csel x3, x3, xzr, ne + ldp x4, x5, [resz+32] + csel x4, x4, xzr, ne + csel x5, x5, xzr, ne + ldp x6, x7, [resz+48] + csel x6, x6, xzr, ne + csel x7, x7, xzr, ne + ldr x8, [resz+64] + csel x8, x8, xzr, ne + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p384/p384_montjadd_alt.S b/x86_att/p384/p384_montjadd_alt.S new file mode 100644 index 0000000000..e36a60f331 --- /dev/null +++ b/x86_att/p384/p384_montjadd_alt.S @@ -0,0 +1,965 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rcx = p2, +// which needs to be set up explicitly before use. +// The %rdi value never changes, however. 
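+//
+// (Concrete example of the addressing scheme defined below: with
+// NUMSIZE = 48, y_1 expands to 48(%rsi) and z_1 to 96(%rsi), i.e. the
+// second and third 384-bit coordinates of the Jacobian triple p1.)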
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rcx) +#define y_2 NUMSIZE(%rcx) +#define z_2 (2*NUMSIZE)(%rcx) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// In one place it's convenient to use another register +// since the squaring function overwrites %rcx + +#define z_2_alt (2*NUMSIZE)(%rsi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +// Temporaries for the actual input pointers + +#define input_x (NUMSIZE*7)(%rsp) +#define input_y (NUMSIZE*7+8)(%rsp) + +#define NSPACE (NUMSIZE*7+16) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + xorl %r13d, %r13d ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + xorl %r14d, %r14d ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + xorl %r15d, %r15d ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r8, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x8+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r8, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r8, %r8 ; \ + negq %r8; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r9, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbp, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %rbx 
; \ + addq %rbx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r9, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r9, %r9 ; \ + negq %r9; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r10, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rbp, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r10, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r10, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r10, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r10, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r10, %r10 ; \ + negq %r10; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r11, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rbp, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r11, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r11, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r11, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r11, %r11 ; \ + negq %r11; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r12, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r13 ; \ + 
sbbq %rdx, %r14 ; \ + sbbq %rbp, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r12, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r12, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r12, %r12 ; \ + negq %r12; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r13, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r14 ; \ + sbbq %rdx, %r15 ; \ + sbbq %rbp, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorl %edx, %edx ; \ + xorl %ebp, %ebp ; \ + xorl %r13d, %r13d ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %ebx ; \ + adcq %r15, %rbx ; \ + movl $0x1, %ecx ; \ + adcq %r8, %rcx ; \ + adcq %r9, %rdx ; \ + adcq %r10, %rbp ; \ + adcq %r11, %r13 ; \ + adcq $0x0, %r12 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %rbx, %r15 ; \ + cmovneq %rcx, %r8 ; \ + cmovneq %rdx, %r9 ; \ + cmovneq %rbp, %r10 ; \ + cmovneq %r13, %r11 ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %r8, 0x10+P0 ; \ + movq %r9, 0x18+P0 ; \ + movq %r10, 0x20+P0 ; \ + movq %r11, 0x28+P0 + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + movq P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + movq %rax, %r15 ; \ + movq %rdx, %rcx ; \ + movq 0x10+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %rcx ; \ + movq 0x20+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, 
%r15 ; \ + adcq %rdx, %rcx ; \ + sbbq %rbp, %rbp ; \ + xorl %ebx, %ebx ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + subq %rbp, %rdx ; \ + xorl %ebp, %ebp ; \ + addq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + xorl %r8d, %r8d ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %r15, %r15 ; \ + adcq %rcx, %rcx ; \ + adcq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rax; \ + movq %r8, P0 ; \ + movq %rax, %r8 ; \ + movq 0x8+P1, %rax ; \ + movq %rbp, 0x8+P0 ; \ + addq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq 0x8+P0, %rax ; \ + adcq P0, %rdx ; \ + movq %rax, %rbp ; \ + movq %rdx, %rsi ; \ + movq %rbx, P0 ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r8, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rbx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r9, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rax, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rbx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r10, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rax, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rbx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r11, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rax, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rbx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r12, %r13 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rax, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rbx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + movq 
$0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r13, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rax, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rbx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rsi ; \ + movl $0x0, %r8d ; \ + adcq %r8, %r8 ; \ + xorq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + xorq %r13, %r13 ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %r9d ; \ + adcq %r15, %r9 ; \ + movl $0x1, %r10d ; \ + adcq %rcx, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq %rsi, %r13 ; \ + adcq $0x0, %r8 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %r9, %r15 ; \ + cmovneq %r10, %rcx ; \ + cmovneq %r11, %rbx ; \ + cmovneq %r12, %rbp ; \ + cmovneq %r13, %rsi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rsi, 0x28+P0 + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %esi ; \ + andq %rsi, %rcx ; \ + xorq %rsi, %rsi ; \ + subq %rcx, %rsi ; \ + subq %rsi, %rax ; \ + movq %rax, P0 ; \ + sbbq %rcx, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq %rax, %rax ; \ + andq %rsi, %rcx ; \ + negq %rax; \ + sbbq %rcx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// Additional macros to help with final multiplexing + +#define load6(r0,r1,r2,r3,r4,r5,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 + +#define store6(P,r0,r1,r2,r3,r4,r5) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P ; \ + +#define czload6(r0,r1,r2,r3,r4,r5,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 ; \ + cmovzq 32+P, r4 ; \ + cmovzq 40+P, r5 + +#define muxload6(r0,r1,r2,r3,r4,r5,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 ; \ + movq 32+P0, r4 ; \ + cmovbq 32+P1, r4 ; \ + cmovnbe 32+P2, r4 ; \ + movq 40+P0, r5 ; \ + cmovbq 40+P1, r5 ; \ + cmovnbe 40+P2, r5 + +S2N_BN_SYMBOL(p384_montjadd_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input arguments in non-volatile places on the stack + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rsi, input_x + movq %rdx, input_y + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p384(z1sq,z_1) + movq input_y, %rsi + montsqr_p384(z2sq,z_2_alt) + + movq input_x, %rsi + movq 
input_y, %rcx + montmul_p384(y1a,z_2,y_1) + movq input_x, %rsi + movq input_y, %rcx + montmul_p384(y2a,z_1,y_2) + + movq input_y, %rcx + montmul_p384(x2a,z1sq,x_2) + movq input_x, %rsi + montmul_p384(x1a,z2sq,x_1) + montmul_p384(y2a,z1sq,y2a) + montmul_p384(y1a,z2sq,y1a) + + sub_p384(xd,x2a,x1a) + sub_p384(yd,y2a,y1a) + + montsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x1a) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + movq input_x, %rsi + montmul_p384(xd,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y1a) + + movq input_y, %rcx + montmul_p384(resz,xd,z_2) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) +// Multiplex the z outputs accordingly and re-store in resz + + movq input_y, %rcx + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,z_2) + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rbx, %rax + orq %rbp, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + movq input_x, %rsi + load6(%r12,%r13,%r14,%r15,%rdx,%rcx,z_1) + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + cmovzq %rdx, %rbx + cmovzq %rcx, %rbp + orq %r13, %r12 + orq %r15, %r14 + orq %rcx, %rdx + orq %r14, %r12 + orq %r12, %rdx + negq %rdx + sbbq %rdx, %rdx + + cmpq %rdx, %rax + + czload6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + store6(resz,%r8,%r9,%r10,%r11,%rbx,%rbp) + +// Multiplex the x and y outputs too, keeping the results in registers + + movq input_y, %rcx + movq input_x, %rsi + muxload6(%r8,%r9,%r10,%r11,%rbx,%rbp,resx,x_1,x_2) + muxload6(%r12,%r13,%r14,%r15,%rdx,%rax,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store6(x_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + store6(y_3,%r12,%r13,%r14,%r15,%rdx,%rax) + store6(z_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p384/p384_montjmixadd_alt.S b/x86_att/p384/p384_montjmixadd_alt.S new file mode 100644 index 0000000000..da610ee88e --- /dev/null +++ b/x86_att/p384/p384_montjmixadd_alt.S @@ -0,0 +1,929 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. 
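+//
+// (For concreteness: since coordinates are kept in the Montgomery domain,
+// the implicit z = 1 of p2 corresponds to the residue 2^384 - p_384, i.e.
+// 2^384 mod p_384; this is the constant that the final multiplexing step
+// of this function stores when p1 is the point at infinity.)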
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rcx = p2, +// which needs to be set up explicitly before use. +// However the %rdi value never changes. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rcx) +#define y_2 NUMSIZE(%rcx) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +// Temporaries for the actual input pointers + +#define input_x (NUMSIZE*6)(%rsp) +#define input_y (NUMSIZE*6+8)(%rsp) + +#define NSPACE (NUMSIZE*6+16) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + xorl %r13d, %r13d ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + xorl %r14d, %r14d ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + xorl %r15d, %r15d ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r8, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x8+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r8, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r8, %r8 ; \ + negq %r8; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq 
%r9, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r9, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbp, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r9, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r9, %r9 ; \ + negq %r9; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r10, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rbp, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r10, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r10, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r10, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r10, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r10, %r10 ; \ + negq %r10; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r11, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rbp, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r11, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r11, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r11, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, 
%r10 ; \ + sbbq %r11, %r11 ; \ + negq %r11; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r12, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r13 ; \ + sbbq %rdx, %r14 ; \ + sbbq %rbp, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r12, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r12, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r12, %r12 ; \ + negq %r12; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r13, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r14 ; \ + sbbq %rdx, %r15 ; \ + sbbq %rbp, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorl %edx, %edx ; \ + xorl %ebp, %ebp ; \ + xorl %r13d, %r13d ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %ebx ; \ + adcq %r15, %rbx ; \ + movl $0x1, %ecx ; \ + adcq %r8, %rcx ; \ + adcq %r9, %rdx ; \ + adcq %r10, %rbp ; \ + adcq %r11, %r13 ; \ + adcq $0x0, %r12 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %rbx, %r15 ; \ + cmovneq %rcx, %r8 ; \ + cmovneq %rdx, %r9 ; \ + cmovneq %rbp, %r10 ; \ + cmovneq %r13, %r11 ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %r8, 0x10+P0 ; \ + movq %r9, 0x18+P0 ; \ + movq %r10, 0x20+P0 ; \ + movq %r11, 0x28+P0 + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + movq P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + movq %rax, %r15 ; \ + movq %rdx, %rcx ; \ + movq 0x10+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %rcx ; \ + movq 0x20+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 
0x10+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rcx ; \ + sbbq %rbp, %rbp ; \ + xorl %ebx, %ebx ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + subq %rbp, %rdx ; \ + xorl %ebp, %ebp ; \ + addq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + xorl %r8d, %r8d ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %r15, %r15 ; \ + adcq %rcx, %rcx ; \ + adcq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rax; \ + movq %r8, P0 ; \ + movq %rax, %r8 ; \ + movq 0x8+P1, %rax ; \ + movq %rbp, 0x8+P0 ; \ + addq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq 0x8+P0, %rax ; \ + adcq P0, %rdx ; \ + movq %rax, %rbp ; \ + movq %rdx, %rsi ; \ + movq %rbx, P0 ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r8, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rbx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r9, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rax, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rbx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r10, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rax, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rbx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r11, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rax, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rbx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq 
$0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r12, %r13 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rax, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rbx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r13, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rax, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rbx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rsi ; \ + movl $0x0, %r8d ; \ + adcq %r8, %r8 ; \ + xorq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + xorq %r13, %r13 ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %r9d ; \ + adcq %r15, %r9 ; \ + movl $0x1, %r10d ; \ + adcq %rcx, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq %rsi, %r13 ; \ + adcq $0x0, %r8 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %r9, %r15 ; \ + cmovneq %r10, %rcx ; \ + cmovneq %r11, %rbx ; \ + cmovneq %r12, %rbp ; \ + cmovneq %r13, %rsi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rsi, 0x28+P0 + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %esi ; \ + andq %rsi, %rcx ; \ + xorq %rsi, %rsi ; \ + subq %rcx, %rsi ; \ + subq %rsi, %rax ; \ + movq %rax, P0 ; \ + sbbq %rcx, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq %rax, %rax ; \ + andq %rsi, %rcx ; \ + negq %rax; \ + sbbq %rcx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// Additional macros to help with final multiplexing + +#define testzero6(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq 32+P, %rax ; \ + orq 40+P, %rdx ; \ + orq %rdx, %rax + +#define mux6(r0,r1,r2,r3,r4,r5,PNE,PEQ) \ + movq PEQ, %rax ; \ + movq PNE, r0 ; \ + cmovzq %rax, r0 ; \ + movq 8+PEQ, %rax ; \ + movq 8+PNE, r1 ; \ + cmovzq %rax, r1 ; \ + movq 16+PEQ, %rax ; \ + movq 16+PNE, r2 ; \ + cmovzq %rax, r2 ; \ + movq 24+PEQ, %rax ; \ + movq 24+PNE, r3 ; \ + cmovzq %rax, r3 ; \ + movq 32+PEQ, %rax ; \ + movq 32+PNE, r4 ; \ + cmovzq %rax, r4 ; \ + movq 40+PEQ, %rax ; \ + movq 40+PNE, r5 ; \ + cmovzq %rax, r5 + +#define load6(r0,r1,r2,r3,r4,r5,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 + +#define store6(P,r0,r1,r2,r3,r4,r5) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P + +S2N_BN_SYMBOL(p384_montjmixadd_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input arguments in non-volatile places on the stack + + pushq 
%rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rsi, input_x + movq %rdx, input_y + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p384(zp2,z_1) + + movq input_x, %rsi + movq input_y, %rcx + montmul_p384(y2a,z_1,y_2) + + movq input_y, %rcx + montmul_p384(x2a,zp2,x_2) + + montmul_p384(y2a,zp2,y2a) + + movq input_x, %rsi + sub_p384(xd,x2a,x_1) + movq input_x, %rsi + sub_p384(yd,y2a,y_1) + + montsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + movq input_x, %rsi + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + movq input_x, %rsi + montmul_p384(resz,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + movq input_x, %rsi + montmul_p384(t1,t1,y_1) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + movq input_x, %rsi + testzero6(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. + + movq input_y, %rcx + mux6(%r8,%r9,%r10,%r11,%rbx,%rbp,resx,x_2) + mux6(%r12,%r13,%r14,%r15,%rdx,%rcx,resy,y_2) + + store6(x_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + store6(y_3,%r12,%r13,%r14,%r15,%rdx,%rcx) + + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + movq $0xffffffff00000001, %rax + cmovzq %rax, %r8 + movl $0x00000000ffffffff, %eax + cmovzq %rax, %r9 + movq $1, %rax + cmovzq %rax, %r10 + movl $0, %eax + cmovzq %rax, %r11 + cmovzq %rax, %rbx + cmovzq %rax, %rbp + + store6(z_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p521/p521_jadd_alt.S b/x86_att/p521/p521_jadd_alt.S new file mode 100644 index 0000000000..5b51a4f6a6 --- /dev/null +++ b/x86_att/p521/p521_jadd_alt.S @@ -0,0 +1,1149 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_521, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". 
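+//
+// (For reference, "=~=" is projective equivalence of Jacobian triples: two
+// triples with nonzero z represent the same affine point exactly when they
+// are related by (x,y,z) ~ (t^2 * x, t^3 * y, t * z) for some nonzero t
+// modulo p_521.)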
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence +// These are where they arrive except for input_y, initially in %rdx + +#define input_z %rdi +#define input_x %rsi +#define input_y %rcx + +// Pointer-offset pairs for inputs and outputs + +#define x_1 0(input_x) +#define y_1 NUMSIZE(input_x) +#define z_1 (2*NUMSIZE)(input_x) + +#define x_2 0(input_y) +#define y_2 NUMSIZE(input_y) +#define z_2 (2*NUMSIZE)(input_y) + +#define x_3 0(input_z) +#define y_3 NUMSIZE(input_z) +#define z_3 (2*NUMSIZE)(input_z) + +// Pointer-offset pairs for temporaries, with some aliasing +// The tmp field is internal storage for field mul and sqr. +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define tmp (NUMSIZE*7)(%rsp) + +#define NSPACE (NUMSIZE*8) + +// Corresponds exactly to bignum_mul_p521_alt except temp storage + +#define mul_p521(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r9, 512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + xorq %r14, %r14 ; \ + movq P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) 
; \ + xorq %r15, %r15 ; \ + movq P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + xorq %r8, %r8 ; \ + movq P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; \ + xorq %r9, %r9 ; \ + movq P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x38+P1, %rax ; \ + mulq P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + xorq %r10, %r10 ; \ + movq P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x40+P1, %rax ; \ + mulq P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + xorq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P2; \ + addq 
%rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax 
; \ + mulq 0x40+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq 0x40+P2, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 + +// Corresponds to bignum_sqr_p521_alt except %rbp is used +// in place of %rcx and tmp is the temp storage location + +#define sqr_p521(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq %r9, 512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x20+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rbp, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x28+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rbp, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x30+P1; \ + xorq %r8, %r8 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; 
\ + adcq %rdx, %rbp ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r8 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r8, %r8 ; \ + addq %rbx, %r14 ; \ + adcq %rbp, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x38+P1; \ + xorq %r9, %r9 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r9, %r9 ; \ + addq %rbx, %r15 ; \ + adcq %rbp, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r10, %r10 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r10, %r10 ; \ + addq %rbx, %r8 ; \ + adcq %rbp, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r11, %r11 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r11, %r11 ; \ + addq %rbx, %r9 ; \ + adcq %rbp, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r12, %r12 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r12 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r12, %r12 ; \ + addq %rbx, %r10 ; \ + adcq %rbp, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r14, %r14 
; \ + addq %rbx, %r12 ; \ + adcq %rbp, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rbp, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r8 ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq %rax, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + sbbq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + sbbq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + sbbq 0x40+P2, %r14 ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq $0x0, %r14 ; \ + andq $0x1ff, %r14 ; \ + movq %r14, 0x40+P0 + +// Additional macros to help with final multiplexing + +#define load9(r0,r1,r2,r3,r4,r5,r6,r7,ra,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 ; \ + movq 48+P, r6 ; \ + movq 56+P, r7 ; \ + movq 64+P, ra + +#define store9(P,r0,r1,r2,r3,r4,r5,r6,r7,ra) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P ; \ + movq r6, 48+P ; \ + movq r7, 56+P ; \ + movq ra, 64+P + +#define 
muxload9(r0,r1,r2,r3,r4,r5,r6,r7,ra,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 ; \ + movq 32+P0, r4 ; \ + cmovbq 32+P1, r4 ; \ + cmovnbe 32+P2, r4 ; \ + movq 40+P0, r5 ; \ + cmovbq 40+P1, r5 ; \ + cmovnbe 40+P2, r5 ; \ + movq 48+P0, r6 ; \ + cmovbq 48+P1, r6 ; \ + cmovnbe 48+P2, r6 ; \ + movq 56+P0, r7 ; \ + cmovbq 56+P1, r7 ; \ + cmovnbe 56+P2, r7 ; \ + movq 64+P0, ra ; \ + cmovbq 64+P1, ra ; \ + cmovnbe 64+P2, ra + +#define copy9(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+P1, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+P1, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+P1, %rax ; \ + movq %rax, 64+P0 + +S2N_BN_SYMBOL(p521_jadd_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Move the input arguments to stable places (two are already there) + + movq %rdx, input_y + +// Main code, just a sequence of basic field operations + + sqr_p521(z1sq,z_1) + sqr_p521(z2sq,z_2) + + mul_p521(y1a,z_2,y_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,z1sq,x_2) + mul_p521(x1a,z2sq,x_1) + mul_p521(y2a,z1sq,y2a) + mul_p521(y1a,z2sq,y1a) + + sub_p521(xd,x2a,x1a) + sub_p521(yd,y2a,y1a) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x1a) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(xd,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y1a) + mul_p521(resz,xd,z_2) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,z_1) + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rax, %rax + + load9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,z_2) + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rdx, %rdx + + cmpq %rax, %rdx + +// Multiplex the outputs accordingly. 
Re-store them in resz until there +// are no more loads, so there are no assumptions on input-output aliasing + + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resy,y_1,y_2) + store9(resy,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resz,z_1,z_2) + store9(resz,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resx,x_1,x_2) + store9(x_3,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + copy9(y_3,resy) + copy9(z_3,resz) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p521/p521_jmixadd_alt.S b/x86_att/p521/p521_jmixadd_alt.S new file mode 100644 index 0000000000..d9279fe305 --- /dev/null +++ b/x86_att/p521/p521_jmixadd_alt.S @@ -0,0 +1,1144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jmixadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_521, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence +// These are where they arrive except for input_y, initially in %rdx + +#define input_z %rdi +#define input_x %rsi +#define input_y %rcx + +// Pointer-offset pairs for inputs and outputs + +#define x_1 0(input_x) +#define y_1 NUMSIZE(input_x) +#define z_1 (2*NUMSIZE)(input_x) + +#define x_2 0(input_y) +#define y_2 NUMSIZE(input_y) + +#define x_3 0(input_z) +#define y_3 NUMSIZE(input_z) +#define z_3 (2*NUMSIZE)(input_z) + +// Pointer-offset pairs for temporaries, with some aliasing +// The tmp field is internal storage for field mul and sqr. 
+// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define tmp (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_mul_p521_alt except temp storage + +#define mul_p521(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, 432(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r9, 440(%rsp) ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 448(%rsp) ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 456(%rsp) ; \ + xorq %r14, %r14 ; \ + movq P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 464(%rsp) ; \ + xorq %r15, %r15 ; \ + movq P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 472(%rsp) ; \ + xorq %r8, %r8 ; \ + movq P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; 
\ + movq 0x20+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 480(%rsp) ; \ + xorq %r9, %r9 ; \ + movq P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x38+P1, %rax ; \ + mulq P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 488(%rsp) ; \ + xorq %r10, %r10 ; \ + movq P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x40+P1, %rax ; \ + mulq P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 496(%rsp) ; \ + xorq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; 
\ + adcq $0x0, %r12 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq 0x40+P2, %rax ; \ + addq %r8, %rax ; \ + movq 496(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 432(%rsp), %r8 ; \ + adcq 440(%rsp), %r9 ; \ + adcq 448(%rsp), %r10 ; \ + adcq 456(%rsp), %r11 ; \ + adcq 464(%rsp), %r12 ; \ + adcq 472(%rsp), %r13 ; \ + adcq 480(%rsp), %r14 ; \ + adcq 488(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq 
%r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 + +// Corresponds to bignum_sqr_p521_alt except %rbp is used +// in place of %rcx and the output as temp storage location + +#define sqr_p521(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, 432(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq %r9, 440(%rsp) ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 448(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 456(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x20+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rbp, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 464(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x28+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rbp, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 472(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x30+P1; \ + xorq %r8, %r8 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r8 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r8, %r8 ; \ + addq %rbx, %r14 ; \ + adcq %rbp, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 480(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x38+P1; \ + xorq %r9, %r9 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r9, %r9 ; \ + addq %rbx, %r15 ; \ + adcq %rbp, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 488(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r10, %r10 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 
0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r10, %r10 ; \ + addq %rbx, %r8 ; \ + adcq %rbp, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 496(%rsp) ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r11, %r11 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r11, %r11 ; \ + addq %rbx, %r9 ; \ + adcq %rbp, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r12, %r12 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r12 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r12, %r12 ; \ + addq %rbx, %r10 ; \ + adcq %rbp, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rbp, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rbp, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r8 ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq %rax, %rax ; \ + addq %r8, %rax ; \ + movq 496(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq 
$0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 432(%rsp), %r8 ; \ + adcq 440(%rsp), %r9 ; \ + adcq 448(%rsp), %r10 ; \ + adcq 456(%rsp), %r11 ; \ + adcq 464(%rsp), %r12 ; \ + adcq 472(%rsp), %r13 ; \ + adcq 480(%rsp), %r14 ; \ + adcq 488(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 ; \ + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + sbbq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + sbbq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + sbbq 0x40+P2, %r14 ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq $0x0, %r14 ; \ + andq $0x1ff, %r14 ; \ + movq %r14, 0x40+P0 + +// Additional macros to help with final multiplexing + +#define testzero9(P) \ + movq P, %rax ; \ + movq 8+P, %rbx ; \ + movq 16+P, %rdx ; \ + movq 24+P, %rbp ; \ + orq 32+P, %rax ; \ + orq 40+P, %rbx ; \ + orq 48+P, %rdx ; \ + orq 56+P, %rbp ; \ + orq %rbx, %rax ; \ + orq %rbp, %rdx ; \ + orq 64+P, %rax ; \ + orq %rdx, %rax + +#define mux9(P0,PNE,PEQ) \ + movq PNE, %rax ; \ + movq PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, P0 ; \ + movq 8+PNE, %rax ; \ + movq 8+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+PNE, %rax ; \ + movq 16+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+PNE, %rax ; \ + movq 24+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+PNE, %rax ; \ + movq 32+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+PNE, %rax ; \ + movq 40+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+PNE, %rax ; \ + movq 48+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+PNE, %rax ; \ + movq 56+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+PNE, %rax ; \ + movq 64+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 64+P0 + +#define mux9c(P0,PNE) \ + movq PNE, %rax ; \ + movl $1, %ebx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, P0 ; \ + movq 8+PNE, %rax ; \ + movl $0, %ebx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + 
movq %rax, 40+P0 ; \ + movq 48+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 64+P0 + +#define copy9(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+P1, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+P1, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+P1, %rax ; \ + movq %rax, 64+P0 + +S2N_BN_SYMBOL(p521_jmixadd_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Move the input arguments to stable places (two are already there) + + movq %rdx, input_y + +// Main code, just a sequence of basic field operations + + sqr_p521(zp2,z_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,zp2,x_2) + mul_p521(y2a,zp2,y2a) + + sub_p521(xd,x2a,x_1) + sub_p521(yd,y2a,y_1) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x_1) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(resz,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y_1) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero9(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + mux9 (resx,resx,x_2) + mux9 (resy,resy,y_2) + + copy9(x_3,resx) + copy9(y_3,resy) + + mux9c(z_3,resz) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif From a6e023a53d358a9ca72ca53ba14f55d1d66d8e8f Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Fri, 26 Apr 2024 16:25:57 -0500 Subject: [PATCH 22/24] Add `bignum_mont{mul,sqr}_p384_neon`, speed improvements/refactoring in tactics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds `bignum_mont{mul,sqr}_p384_neon` which are slightly faster than `bignum_mont{mul,sqr}_p384`. They use SIMD instructions and better scheduling found with SLOTHY. Their correctness is verified using equivalence check w.r.t. specifications of their scalar ops. The new SUBROUTINE lemmas are added to the specification list using ``` ./tools/collect-specs.sh arm >arm/proofs/specifications.txt ``` Benchmark results on Graviton2: ``` bignum_montsqr_p384 : 58.6 ns each (var 0.3%, corr 0.06) = 17053295 ops/sec bignum_montsqr_p384_neon : 52.6 ns each (var 0.4%, corr -0.04) = 19017192 ops/sec bignum_montmul_p384 : 72.9 ns each (var 0.2%, corr -0.02) = 13726633 ops/sec bignum_montmul_p384_neon : 68.1 ns each (var 0.3%, corr 0.02) = 14680905 ops/sec ``` Test and benchmark were updated to include these & fix incorrect naming bugs in my previous p256_neon patch. Also, some speedups in tactics are made: 1. 
`ARM_STEPS'_AND_ABBREV_TAC` and `ARM_STEPS'_AND_REWRITE_TAC`.

They are tactics for symbolic execution when showing equivalence of two
programs after reordering instructions. `ARM_STEPS'_AND_ABBREV_TAC` does
symbolic execution of the 'left' program and abbreviates every RHS of new
`read comp s = RHS`s, meaning that after the tactic is done there are a
bunch of equality assumptions whose number increases linearly with the
number of instructions. `ARM_STEPS'_AND_REWRITE_TAC` then does symbolic
execution of the 'right' program and rewrites the results using the
assumptions. This means the overall complexity of
`ARM_STEPS'_AND_REWRITE_TAC` was quadratic in the number of instructions
(# assum * # insts = (# insts)^2). This is fixed to be (close to) linear
by separately maintaining the abbreviations as a list of theorems
internally rather than as assumptions. This doesn’t mean that the
theoretical time complexity is now linear, but many tactics inside
`ARM_STEPS'_AND_REWRITE_TAC` that inspect assumptions now run in linear
time.

2. `FIND_HOLE_TAC`

The `FIND_HOLE_TAC` tactic finds the 'hole' in the memory space that can
place the machine code that is used in program equivalence. This is done
by inspecting `nonoverlapping` assumptions, properly segmenting the memory
into fixed-width ranges and doing case analysis. Previously the number of
split cases was something like 2^((# segments)^2), but now it is reduced
to (# segments)^(# segments). Comparing these two numbers is easier after
taking logarithms: log2 of the old count is (# segments)^2, whereas log2
of the new count is only (# segments) * log2(# segments).

Finally, some lemmas in existing `_neon.ml` proofs are updated so that
they do not mix usage of '*_mc' and '*_core_mc'. '*_core_mc' is a machine
code that is a sub-list of '*_mc' obtained by stripping the callee-save
register stores/loads as well as the ret instruction. Where possible, a
lemma is updated to only use '*_core_mc' because this makes modular usage
of the lemma possible in bigger theorems.

s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/d3a7b195595ec7c265e30731ad75065442cd0ab0
---
 arm/p384/Makefile                   |   2 +
 arm/p384/bignum_montmul_p384_neon.S | 885 ++++++++++++++++++++++++++++
 arm/p384/bignum_montsqr_p384_neon.S | 665 +++++++++++++++++++++
 3 files changed, 1552 insertions(+)
 create mode 100644 arm/p384/bignum_montmul_p384_neon.S
 create mode 100644 arm/p384/bignum_montsqr_p384_neon.S

diff --git a/arm/p384/Makefile b/arm/p384/Makefile
index f5fc2aa1a4..60687fb7c1 100644
--- a/arm/p384/Makefile
+++ b/arm/p384/Makefile
@@ -35,8 +35,10 @@ OBJ = bignum_add_p384.o \
         bignum_mod_p384_6.o \
         bignum_montmul_p384.o \
         bignum_montmul_p384_alt.o \
+        bignum_montmul_p384_neon.o \
         bignum_montsqr_p384.o \
         bignum_montsqr_p384_alt.o \
+        bignum_montsqr_p384_neon.o \
         bignum_mux_6.o \
         bignum_neg_p384.o \
         bignum_nonzero_6.o \
diff --git a/arm/p384/bignum_montmul_p384_neon.S b/arm/p384/bignum_montmul_p384_neon.S
new file mode 100644
index 0000000000..08c296bc0d
--- /dev/null
+++ b/arm/p384/bignum_montmul_p384_neon.S
@@ -0,0 +1,885 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^384) mod p_384 +// Inputs x[6], y[6]; output z[6] +// +// extern void bignum_montmul_p384_neon +// (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); +// +// Does z := (2^{-384} * x * y) mod p_384, assuming that the inputs x and y +// satisfy x * y <= 2^384 * p_384 (in particular this is true if we are in +// the "usual" case x < p_384 and y < p_384). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- + +// bignum_montmul_p384_neon is functionally equivalent to bignum_montmul_p384. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// ldp x3, x21, [x1] +// ldr q30, [x1] +// ldp x8, x24, [x1, #16] +// ldp x5, x10, [x1, #32] +// ldp x13, x23, [x2] +// ldr q19, [x2] +// ldp x6, x14, [x2, #16] +// ldp x15, x17, [x2, #32] +// ldr q1, [x1, #32] +// ldr q28, [x2, #32] +// uzp1 v5.4S, v19.4S, v30.4S +// rev64 v19.4S, v19.4S +// uzp1 v0.4S, v30.4S, v30.4S +// mul v21.4S, v19.4S, v30.4S +// uaddlp v19.2D, v21.4S +// shl v19.2D, v19.2D, #32 +// umlal v19.2D, v0.2S, v5.2S +// mov x12, v19.d[0] +// mov x16, v19.d[1] +// mul x20, x8, x6 +// umulh x4, x3, x13 +// umulh x1, x21, x23 +// umulh x2, x8, x6 +// adds x4, x4, x16 +// adcs x19, x1, x20 +// adc x20, x2, xzr +// adds x11, x4, x12 +// adcs x16, x19, x4 +// adcs x1, x20, x19 +// adc x2, x20, xzr +// adds x7, x16, x12 +// adcs x4, x1, x4 +// adcs x9, x2, x19 +// adc x19, x20, xzr +// subs x2, x3, x21 +// cneg x20, x2, cc +// csetm x16, cc +// subs x2, x23, x13 +// cneg x2, x2, cc +// mul x1, x20, x2 +// umulh x2, x20, x2 +// cinv x16, x16, cc +// eor x1, x1, x16 +// eor x2, x2, x16 +// cmn x16, #0x1 +// adcs x11, x11, x1 +// adcs x7, x7, x2 +// adcs x4, x4, x16 +// adcs x9, x9, x16 +// adc x19, x19, x16 +// subs x2, x3, x8 +// cneg x20, x2, cc +// csetm x16, cc +// subs x2, x6, x13 +// cneg x2, x2, cc +// mul x1, x20, x2 +// umulh x2, x20, x2 +// cinv x16, x16, cc +// eor x1, x1, x16 +// eor x2, x2, x16 +// cmn x16, #0x1 +// adcs x7, x7, x1 +// adcs x4, x4, x2 +// adcs x9, x9, x16 +// adc x19, x19, x16 +// subs x2, x21, x8 +// cneg x20, x2, cc +// csetm x16, cc +// subs x2, x6, x23 +// cneg x2, x2, cc +// mul x1, x20, x2 +// umulh x2, x20, x2 +// cinv x16, x16, cc +// eor x1, x1, x16 +// eor x2, x2, x16 +// cmn x16, #0x1 +// adcs x4, x4, x1 +// adcs x20, x9, x2 +// adc x16, x19, x16 +// lsl x2, x12, #32 +// add x19, x2, x12 +// lsr x2, x19, #32 +// subs x1, x2, x19 +// sbc x2, x19, xzr +// extr x1, x2, x1, #32 +// lsr x2, x2, #32 +// adds x12, x2, x19 +// adc x2, xzr, xzr +// subs x1, x11, x1 +// sbcs x7, x7, x12 +// sbcs x4, x4, x2 +// sbcs x20, x20, xzr +// sbcs x16, x16, xzr +// sbc x9, x19, xzr +// lsl x2, x1, #32 +// add x19, x2, x1 +// lsr x2, x19, #32 +// subs x1, x2, x19 +// sbc x2, x19, xzr +// extr x1, x2, x1, #32 +// lsr x2, x2, #32 +// adds x12, x2, x19 +// adc x2, xzr, xzr +// subs x1, x7, x1 +// sbcs x4, x4, x12 +// sbcs x20, x20, x2 +// sbcs x16, x16, xzr +// sbcs x7, x9, xzr +// sbc x9, x19, xzr +// lsl x2, x1, 
#32 +// add x19, x2, x1 +// lsr x2, x19, #32 +// subs x1, x2, x19 +// sbc x2, x19, xzr +// extr x12, x2, x1, #32 +// lsr x2, x2, #32 +// adds x1, x2, x19 +// adc x2, xzr, xzr +// subs x4, x4, x12 +// sbcs x20, x20, x1 +// sbcs x16, x16, x2 +// sbcs x12, x7, xzr +// sbcs x1, x9, xzr +// sbc x2, x19, xzr +// stp x4, x20, [x0] // @slothy:writes=buffer0 +// stp x16, x12, [x0, #16] // @slothy:writes=buffer16 +// stp x1, x2, [x0, #32] // @slothy:writes=buffer32 +// mul x22, x24, x14 +// movi v31.2D, #0x00000000ffffffff +// uzp2 v16.4S, v28.4S, v28.4S +// xtn v6.2S, v1.2D +// xtn v30.2S, v28.2D +// rev64 v28.4S, v28.4S +// umull v5.2D, v6.2S, v30.2S +// umull v0.2D, v6.2S, v16.2S +// uzp2 v19.4S, v1.4S, v1.4S +// mul v20.4S, v28.4S, v1.4S +// usra v0.2D, v5.2D, #32 +// umull v1.2D, v19.2S, v16.2S +// uaddlp v24.2D, v20.4S +// and v5.16B, v0.16B, v31.16B +// umlal v5.2D, v19.2S, v30.2S +// shl v19.2D, v24.2D, #32 +// usra v1.2D, v0.2D, #32 +// umlal v19.2D, v6.2S, v30.2S +// usra v1.2D, v5.2D, #32 +// mov x20, v19.d[0] +// mov x16, v19.d[1] +// umulh x12, x24, x14 +// mov x1, v1.d[0] +// mov x2, v1.d[1] +// adds x4, x12, x20 +// adcs x20, x1, x16 +// adc x16, x2, xzr +// adds x7, x4, x22 +// adcs x12, x20, x4 +// adcs x1, x16, x20 +// adc x2, x16, xzr +// adds x9, x12, x22 +// adcs x19, x1, x4 +// adcs x4, x2, x20 +// adc x20, x16, xzr +// subs x2, x24, x5 +// cneg x16, x2, cc +// csetm x12, cc +// subs x2, x15, x14 +// cneg x2, x2, cc +// mul x1, x16, x2 +// umulh x2, x16, x2 +// cinv x12, x12, cc +// eor x1, x1, x12 +// eor x2, x2, x12 +// cmn x12, #0x1 +// adcs x11, x7, x1 +// adcs x9, x9, x2 +// adcs x19, x19, x12 +// adcs x4, x4, x12 +// adc x20, x20, x12 +// subs x2, x24, x10 +// cneg x16, x2, cc +// csetm x12, cc +// subs x2, x17, x14 +// cneg x2, x2, cc +// mul x1, x16, x2 +// umulh x2, x16, x2 +// cinv x12, x12, cc +// eor x1, x1, x12 +// eor x2, x2, x12 +// cmn x12, #0x1 +// adcs x7, x9, x1 +// adcs x19, x19, x2 +// adcs x4, x4, x12 +// adc x20, x20, x12 +// subs x2, x5, x10 +// cneg x16, x2, cc +// csetm x12, cc +// subs x2, x17, x15 +// cneg x2, x2, cc +// mul x1, x16, x2 +// umulh x2, x16, x2 +// cinv x16, x12, cc +// eor x1, x1, x16 +// eor x2, x2, x16 +// cmn x16, #0x1 +// adcs x19, x19, x1 +// adcs x12, x4, x2 +// adc x1, x20, x16 +// subs x2, x24, x3 +// sbcs x24, x5, x21 +// sbcs x21, x10, x8 +// ngc x5, xzr +// cmn x5, #0x1 +// eor x2, x2, x5 +// adcs x4, x2, xzr +// eor x2, x24, x5 +// adcs x20, x2, xzr +// eor x2, x21, x5 +// adc x16, x2, xzr +// subs x2, x13, x14 +// sbcs x24, x23, x15 +// sbcs x8, x6, x17 +// ngc x21, xzr +// cmn x21, #0x1 +// eor x2, x2, x21 +// adcs x15, x2, xzr +// eor x2, x24, x21 +// adcs x14, x2, xzr +// eor x2, x8, x21 +// adc x6, x2, xzr +// eor x9, x5, x21 +// ldp x21, x2, [x0] // @slothy:reads=buffer0 +// adds x10, x22, x21 +// adcs x5, x11, x2 +// ldp x21, x2, [x0, #16] // @slothy:reads=buffer16 +// adcs x24, x7, x21 +// adcs x8, x19, x2 +// ldp x21, x2, [x0, #32] // @slothy:reads=buffer32 +// adcs x21, x12, x21 +// adcs x2, x1, x2 +// adc x19, xzr, xzr +// stp x10, x5, [x0] // @slothy:writes=buffer0 +// stp x24, x8, [x0, #16] // @slothy:writes=buffer16 +// stp x21, x2, [x0, #32] // @slothy:writes=buffer32 +// mul x12, x4, x15 +// mul x5, x20, x14 +// mul x24, x16, x6 +// umulh x8, x4, x15 +// umulh x21, x20, x14 +// umulh x2, x16, x6 +// adds x10, x8, x5 +// adcs x5, x21, x24 +// adc x24, x2, xzr +// adds x23, x10, x12 +// adcs x8, x5, x10 +// adcs x21, x24, x5 +// adc x2, x24, xzr +// adds x13, x8, x12 +// adcs x1, x21, x10 +// adcs x10, x2, x5 
+// adc x5, x24, xzr +// subs x2, x4, x20 +// cneg x24, x2, cc +// csetm x8, cc +// subs x2, x14, x15 +// cneg x2, x2, cc +// mul x21, x24, x2 +// umulh x2, x24, x2 +// cinv x8, x8, cc +// eor x21, x21, x8 +// eor x2, x2, x8 +// cmn x8, #0x1 +// adcs x23, x23, x21 +// adcs x13, x13, x2 +// adcs x1, x1, x8 +// adcs x10, x10, x8 +// adc x5, x5, x8 +// subs x2, x4, x16 +// cneg x24, x2, cc +// csetm x8, cc +// subs x2, x6, x15 +// cneg x2, x2, cc +// mul x21, x24, x2 +// umulh x2, x24, x2 +// cinv x8, x8, cc +// eor x21, x21, x8 +// eor x2, x2, x8 +// cmn x8, #0x1 +// adcs x4, x13, x21 +// adcs x13, x1, x2 +// adcs x1, x10, x8 +// adc x10, x5, x8 +// subs x2, x20, x16 +// cneg x24, x2, cc +// csetm x8, cc +// subs x2, x6, x14 +// cneg x2, x2, cc +// mul x21, x24, x2 +// umulh x2, x24, x2 +// cinv x5, x8, cc +// eor x21, x21, x5 +// eor x2, x2, x5 +// cmn x5, #0x1 +// adcs x24, x13, x21 +// adcs x8, x1, x2 +// adc x21, x10, x5 +// ldp x20, x16, [x0] // @slothy:reads=buffer0 +// ldp x17, x15, [x0, #16] // @slothy:reads=buffer16 +// ldp x14, x6, [x0, #32] // @slothy:reads=buffer32 +// cmn x9, #0x1 +// eor x2, x12, x9 +// adcs x12, x2, x20 +// eor x2, x23, x9 +// adcs x23, x2, x16 +// eor x2, x4, x9 +// adcs x13, x2, x17 +// eor x2, x24, x9 +// adcs x10, x2, x15 +// eor x2, x8, x9 +// adcs x5, x2, x14 +// eor x2, x21, x9 +// adcs x24, x2, x6 +// adcs x1, x9, x19 +// adcs x8, x9, xzr +// adcs x21, x9, xzr +// adc x2, x9, xzr +// adds x10, x10, x20 +// adcs x5, x5, x16 +// adcs x24, x24, x17 +// adcs x17, x1, x15 +// adcs x15, x8, x14 +// adcs x14, x21, x6 +// adc x6, x2, x19 +// lsl x2, x12, #32 +// add x1, x2, x12 +// lsr x2, x1, #32 +// subs x21, x2, x1 +// sbc x2, x1, xzr +// extr x21, x2, x21, #32 +// lsr x2, x2, #32 +// adds x8, x2, x1 +// adc x2, xzr, xzr +// subs x21, x23, x21 +// sbcs x23, x13, x8 +// sbcs x10, x10, x2 +// sbcs x5, x5, xzr +// sbcs x24, x24, xzr +// sbc x13, x1, xzr +// lsl x2, x21, #32 +// add x1, x2, x21 +// lsr x2, x1, #32 +// subs x21, x2, x1 +// sbc x2, x1, xzr +// extr x21, x2, x21, #32 +// lsr x2, x2, #32 +// adds x8, x2, x1 +// adc x2, xzr, xzr +// subs x21, x23, x21 +// sbcs x10, x10, x8 +// sbcs x5, x5, x2 +// sbcs x24, x24, xzr +// sbcs x23, x13, xzr +// sbc x13, x1, xzr +// lsl x2, x21, #32 +// add x1, x2, x21 +// lsr x2, x1, #32 +// subs x21, x2, x1 +// sbc x2, x1, xzr +// extr x8, x2, x21, #32 +// lsr x2, x2, #32 +// adds x21, x2, x1 +// adc x2, xzr, xzr +// subs x10, x10, x8 +// sbcs x5, x5, x21 +// sbcs x24, x24, x2 +// sbcs x8, x23, xzr +// sbcs x21, x13, xzr +// sbc x2, x1, xzr +// adds x23, x17, x8 +// adcs x13, x15, x21 +// adcs x1, x14, x2 +// adc x2, x6, xzr +// add x8, x2, #0x1 +// lsl x2, x8, #32 +// subs x21, x8, x2 +// sbc x2, x2, xzr +// adds x10, x10, x21 +// adcs x5, x5, x2 +// adcs x24, x24, x8 +// adcs x8, x23, xzr +// adcs x21, x13, xzr +// adcs x13, x1, xzr +// csetm x1, cc +// mov x2, #0xffffffff +// and x2, x2, x1 +// adds x10, x10, x2 +// eor x2, x2, x1 +// adcs x5, x5, x2 +// mov x2, #0xfffffffffffffffe +// and x2, x2, x1 +// adcs x24, x24, x2 +// adcs x8, x8, x1 +// adcs x21, x21, x1 +// adc x2, x13, x1 +// stp x10, x5, [x0] // @slothy:writes=buffer0 +// stp x24, x8, [x0, #16] // @slothy:writes=buffer16 +// stp x21, x2, [x0, #32] // @slothy:writes=buffer32 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret' and +// # callee-register store/loads as, say, 'input.S'. 
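+// # (Note: OUTPUTS appears to name the symbolic output buffers that the
+// #  '@slothy:writes=bufferN' annotations above resolve to, and RESERVED_REGS
+// #  the registers the superoptimizer must leave untouched while renaming and
+// #  rescheduling; both are picked up by the slothy.sh invocation below.)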
+// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32]" +// export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montmul_p384_neon): + +// Save some registers + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + + ldr q3, [x1] + ldr q25, [x2] + ldp x13, x23, [x2] + ldp x3, x21, [x1] + rev64 v23.4S, v25.4S + uzp1 v17.4S, v25.4S, v3.4S + umulh x15, x3, x13 + mul v6.4S, v23.4S, v3.4S + uzp1 v3.4S, v3.4S, v3.4S + ldr q27, [x2, #32] + ldp x8, x24, [x1, #16] + subs x6, x3, x21 + ldr q0, [x1, #32] + movi v23.2D, #0x00000000ffffffff + csetm x10, cc + umulh x19, x21, x23 + rev64 v4.4S, v27.4S + uzp2 v25.4S, v27.4S, v27.4S + cneg x4, x6, cc + subs x7, x23, x13 + xtn v22.2S, v0.2D + xtn v24.2S, v27.2D + cneg x20, x7, cc + ldp x6, x14, [x2, #16] + mul v27.4S, v4.4S, v0.4S + uaddlp v20.2D, v6.4S + cinv x5, x10, cc + mul x16, x4, x20 + uzp2 v6.4S, v0.4S, v0.4S + umull v21.2D, v22.2S, v25.2S + shl v0.2D, v20.2D, #32 + umlal v0.2D, v3.2S, v17.2S + mul x22, x8, x6 + umull v1.2D, v6.2S, v25.2S + subs x12, x3, x8 + umull v20.2D, v22.2S, v24.2S + cneg x17, x12, cc + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc + usra v21.2D, v20.2D, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2D, v21.2D, #32 + adds x22, x15, x7 + and v26.16B, v21.16B, v23.16B + adcs x16, x12, x15 + uaddlp v25.2D, v27.4S + adcs x9, x19, x12 + umlal v26.2D, v6.2S, v24.2S + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2D, v25.2D, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc + cinv x10, x10, cc + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc + eor x19, x19, x10 + csetm x4, cc + subs x16, x6, x23 + cneg x16, x16, cc + umlal v27.2D, v22.2S, v24.2S + mul x15, x20, x16 + cinv x4, x4, cc + cmn x10, #0x1 + usra v1.2D, v26.2D, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x2, #32] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x1, #32] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, 
x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [x0] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [x0, #16] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc + csetm x2, cc + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc + cneg x19, x19, cc + stp x9, x20, [x0, #32] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc + csetm x12, cc + subs x9, x17, x14 + cinv x12, x12, cc + cneg x9, x9, cc + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc + cneg x24, x10, cc + subs x10, x17, x15 + cinv x7, x7, cc + cneg x10, x10, cc + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [x0] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [x0, #16] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [x0, #32] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [x0] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [x0, #16] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [x0, #32] + cneg x3, x21, cc + csetm x24, cc + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc + csetm x16, cc + subs x21, x6, x15 + cneg x22, x21, cc + cinv x21, x24, cc + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc + csetm x24, cc + subs x20, x14, x15 + cinv x24, x24, cc + mul x22, x3, x22 + cneg x3, x20, cc + subs x13, x6, x14 + cneg x20, x13, cc + cinv x15, x16, cc + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [x0] + ldp x21, x12, [x0, #16] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [x0, #32] + eor x1, x6, 
x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [x0] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [x0, #16] + adc x12, x15, x23 + stp x21, x12, [x0, #32] + +// Restore registers and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/p384/bignum_montsqr_p384_neon.S b/arm/p384/bignum_montsqr_p384_neon.S new file mode 100644 index 0000000000..9be6380eb4 --- /dev/null +++ b/arm/p384/bignum_montsqr_p384_neon.S @@ -0,0 +1,665 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^384) mod p_384 +// Input x[6]; output z[6] +// +// extern void bignum_montsqr_p384_neon +// (uint64_t z[static 6], uint64_t x[static 6]); +// +// Does z := (x^2 / 2^384) mod p_384, assuming x^2 <= 2^384 * p_384, which is +// guaranteed in particular if x < p_384 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- + +// bignum_montsqr_p384_neon is functionally equivalent to bignum_montsqr_p384. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montsqr_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. 
is as follows: +// +// ldp x9, x2, [x1] +// ldr q18, [x1] +// ldr q19, [x1] +// ldp x4, x6, [x1, #16] +// ldp x5, x10, [x1, #32] +// ldr q21, [x1, #32] +// ldr q28, [x1, #32] +// mul x12, x9, x2 +// mul x1, x9, x4 +// mul x13, x2, x4 +// movi v0.2D, #0x00000000ffffffff +// uzp2 v5.4S, v19.4S, v19.4S +// xtn v25.2S, v18.2D +// xtn v4.2S, v19.2D +// rev64 v23.4S, v19.4S +// umull v20.2D, v25.2S, v4.2S +// umull v30.2D, v25.2S, v5.2S +// uzp2 v19.4S, v18.4S, v18.4S +// mul v22.4S, v23.4S, v18.4S +// usra v30.2D, v20.2D, #32 +// umull v18.2D, v19.2S, v5.2S +// uaddlp v22.2D, v22.4S +// and v20.16B, v30.16B, v0.16B +// umlal v20.2D, v19.2S, v4.2S +// shl v19.2D, v22.2D, #32 +// usra v18.2D, v30.2D, #32 +// umlal v19.2D, v25.2S, v4.2S +// usra v18.2D, v20.2D, #32 +// mov x7, v19.d[0] +// mov x17, v19.d[1] +// mul x16, x4, x4 +// umulh x3, x9, x2 +// adds x15, x1, x3 +// umulh x1, x9, x4 +// adcs x13, x13, x1 +// umulh x1, x2, x4 +// adcs x8, x1, xzr +// mov x11, v18.d[0] +// mov x14, v18.d[1] +// umulh x1, x4, x4 +// adds x3, x12, x12 +// adcs x15, x15, x15 +// adcs x13, x13, x13 +// adcs x12, x8, x8 +// adc x1, x1, xzr +// adds x11, x11, x3 +// adcs x3, x17, x15 +// adcs x17, x14, x13 +// adcs x15, x16, x12 +// adc x13, x1, xzr +// lsl x1, x7, #32 +// add x16, x1, x7 +// lsr x1, x16, #32 +// subs x12, x1, x16 +// sbc x1, x16, xzr +// extr x12, x1, x12, #32 +// lsr x1, x1, #32 +// adds x7, x1, x16 +// adc x1, xzr, xzr +// subs x12, x11, x12 +// sbcs x11, x3, x7 +// sbcs x17, x17, x1 +// sbcs x15, x15, xzr +// sbcs x13, x13, xzr +// sbc x3, x16, xzr +// lsl x1, x12, #32 +// add x16, x1, x12 +// lsr x1, x16, #32 +// subs x12, x1, x16 +// sbc x1, x16, xzr +// extr x12, x1, x12, #32 +// lsr x1, x1, #32 +// adds x7, x1, x16 +// adc x1, xzr, xzr +// subs x12, x11, x12 +// sbcs x17, x17, x7 +// sbcs x15, x15, x1 +// sbcs x13, x13, xzr +// sbcs x11, x3, xzr +// sbc x3, x16, xzr +// lsl x1, x12, #32 +// add x16, x1, x12 +// lsr x1, x16, #32 +// subs x12, x1, x16 +// sbc x1, x16, xzr +// extr x7, x1, x12, #32 +// lsr x1, x1, #32 +// adds x12, x1, x16 +// adc x1, xzr, xzr +// subs x17, x17, x7 +// sbcs x15, x15, x12 +// sbcs x13, x13, x1 +// sbcs x7, x11, xzr +// sbcs x12, x3, xzr +// sbc x1, x16, xzr +// stp x17, x15, [x0] // @slothy:writes=buffer0 +// stp x13, x7, [x0, #16] // @slothy:writes=buffer16 +// stp x12, x1, [x0, #32] // @slothy:writes=buffer32 +// mul x14, x9, x6 +// mul x15, x2, x5 +// mul x13, x4, x10 +// umulh x7, x9, x6 +// umulh x12, x2, x5 +// umulh x1, x4, x10 +// adds x15, x7, x15 +// adcs x16, x12, x13 +// adc x13, x1, xzr +// adds x11, x15, x14 +// adcs x7, x16, x15 +// adcs x12, x13, x16 +// adc x1, x13, xzr +// adds x17, x7, x14 +// adcs x15, x12, x15 +// adcs x3, x1, x16 +// adc x16, x13, xzr +// subs x1, x9, x2 +// cneg x13, x1, cc +// csetm x7, cc +// subs x1, x5, x6 +// cneg x1, x1, cc +// mul x12, x13, x1 +// umulh x1, x13, x1 +// cinv x7, x7, cc +// eor x12, x12, x7 +// eor x1, x1, x7 +// cmn x7, #0x1 +// adcs x11, x11, x12 +// adcs x17, x17, x1 +// adcs x15, x15, x7 +// adcs x3, x3, x7 +// adc x16, x16, x7 +// subs x9, x9, x4 +// cneg x13, x9, cc +// csetm x7, cc +// subs x1, x10, x6 +// cneg x1, x1, cc +// mul x12, x13, x1 +// umulh x1, x13, x1 +// cinv x7, x7, cc +// eor x12, x12, x7 +// eor x1, x1, x7 +// cmn x7, #0x1 +// adcs x17, x17, x12 +// adcs x15, x15, x1 +// adcs x13, x3, x7 +// adc x7, x16, x7 +// subs x2, x2, x4 +// cneg x12, x2, cc +// csetm x1, cc +// subs x2, x10, x5 +// cneg x2, x2, cc +// mul x4, x12, x2 +// umulh x2, x12, x2 +// cinv x1, x1, cc +// eor x4, 
x4, x1 +// eor x2, x2, x1 +// cmn x1, #0x1 +// adcs x12, x15, x4 +// adcs x4, x13, x2 +// adc x2, x7, x1 +// adds x1, x14, x14 +// adcs x16, x11, x11 +// adcs x17, x17, x17 +// adcs x15, x12, x12 +// adcs x13, x4, x4 +// adcs x7, x2, x2 +// adc x12, xzr, xzr +// ldp x4, x2, [x0] // @slothy:reads=buffer0 +// adds x1, x1, x4 +// adcs x16, x16, x2 +// ldp x4, x2, [x0, #16] // @slothy:reads=buffer16 +// adcs x17, x17, x4 +// adcs x15, x15, x2 +// ldp x4, x2, [x0, #32] // @slothy:reads=buffer32 +// adcs x13, x13, x4 +// adcs x7, x7, x2 +// adc x11, x12, xzr +// lsl x2, x1, #32 +// add x12, x2, x1 +// lsr x2, x12, #32 +// subs x4, x2, x12 +// sbc x2, x12, xzr +// extr x4, x2, x4, #32 +// lsr x2, x2, #32 +// adds x1, x2, x12 +// adc x2, xzr, xzr +// subs x4, x16, x4 +// sbcs x16, x17, x1 +// sbcs x17, x15, x2 +// sbcs x15, x13, xzr +// sbcs x13, x7, xzr +// sbc x7, x12, xzr +// lsl x2, x4, #32 +// add x12, x2, x4 +// lsr x2, x12, #32 +// subs x4, x2, x12 +// sbc x2, x12, xzr +// extr x4, x2, x4, #32 +// lsr x2, x2, #32 +// adds x1, x2, x12 +// adc x2, xzr, xzr +// subs x4, x16, x4 +// sbcs x16, x17, x1 +// sbcs x17, x15, x2 +// sbcs x15, x13, xzr +// sbcs x13, x7, xzr +// sbc x7, x12, xzr +// lsl x2, x4, #32 +// add x12, x2, x4 +// lsr x2, x12, #32 +// subs x4, x2, x12 +// sbc x2, x12, xzr +// extr x1, x2, x4, #32 +// lsr x2, x2, #32 +// adds x4, x2, x12 +// adc x2, xzr, xzr +// subs x3, x16, x1 +// sbcs x17, x17, x4 +// sbcs x15, x15, x2 +// sbcs x1, x13, xzr +// sbcs x4, x7, xzr +// sbc x2, x12, xzr +// adds x13, x11, x1 +// adcs x7, x4, xzr +// adcs x12, x2, xzr +// adcs x16, xzr, xzr +// mul x2, x6, x6 +// adds x3, x3, x2 +// xtn v30.2S, v28.2D +// shrn v26.2S, v28.2D, #32 +// umull v26.2D, v30.2S, v26.2S +// shl v19.2D, v26.2D, #33 +// umlal v19.2D, v30.2S, v30.2S +// mov x1, v19.d[0] +// mov x4, v19.d[1] +// umulh x2, x6, x6 +// adcs x17, x17, x2 +// umulh x2, x5, x5 +// adcs x15, x15, x1 +// adcs x13, x13, x2 +// umulh x2, x10, x10 +// adcs x7, x7, x4 +// adcs x12, x12, x2 +// adc x16, x16, xzr +// dup v28.2D, x6 +// movi v0.2D, #0x00000000ffffffff +// uzp2 v5.4S, v21.4S, v21.4S +// xtn v25.2S, v28.2D +// xtn v4.2S, v21.2D +// rev64 v19.4S, v21.4S +// umull v30.2D, v25.2S, v4.2S +// umull v23.2D, v25.2S, v5.2S +// uzp2 v20.4S, v28.4S, v28.4S +// mul v19.4S, v19.4S, v28.4S +// usra v23.2D, v30.2D, #32 +// umull v18.2D, v20.2S, v5.2S +// uaddlp v19.2D, v19.4S +// and v30.16B, v23.16B, v0.16B +// umlal v30.2D, v20.2S, v4.2S +// shl v19.2D, v19.2D, #32 +// usra v18.2D, v23.2D, #32 +// umlal v19.2D, v25.2S, v4.2S +// usra v18.2D, v30.2D, #32 +// mov x6, v19.d[0] +// mov x1, v19.d[1] +// mul x4, x5, x10 +// mov x2, v18.d[0] +// adds x1, x1, x2 +// mov x2, v18.d[1] +// adcs x4, x4, x2 +// umulh x5, x5, x10 +// adc x2, x5, xzr +// adds x5, x6, x6 +// adcs x6, x1, x1 +// adcs x1, x4, x4 +// adcs x4, x2, x2 +// adc x2, xzr, xzr +// adds x17, x17, x5 +// adcs x15, x15, x6 +// adcs x13, x13, x1 +// adcs x7, x7, x4 +// adcs x12, x12, x2 +// adc x2, x16, xzr +// mov x5, #0xffffffff00000001 +// mov x6, #0xffffffff +// mov x1, #0x1 +// cmn x3, x5 +// adcs xzr, x17, x6 +// adcs xzr, x15, x1 +// adcs xzr, x13, xzr +// adcs xzr, x7, xzr +// adcs xzr, x12, xzr +// adc x2, x2, xzr +// neg x4, x2 +// and x2, x5, x4 +// adds x10, x3, x2 +// and x2, x6, x4 +// adcs x5, x17, x2 +// and x2, x1, x4 +// adcs x6, x15, x2 +// adcs x1, x13, xzr +// adcs x4, x7, xzr +// adc x2, x12, xzr +// stp x10, x5, [x0] // @slothy:writes=buffer0 +// stp x6, x1, [x0, #16] // @slothy:writes=buffer16 +// stp x4, x2, [x0, #32] // 
@slothy:writes=buffer32 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret' as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32]" +// export RESERVED_REGS="[x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montsqr_p384_neon): + + ldr q1, [x1] + ldp x9, x2, [x1] + ldr q0, [x1] + ldp x4, x6, [x1, #16] + rev64 v21.4S, v1.4S + uzp2 v28.4S, v1.4S, v1.4S + umulh x7, x9, x2 + xtn v17.2S, v1.2D + mul v27.4S, v21.4S, v0.4S + ldr q20, [x1, #32] + xtn v30.2S, v0.2D + ldr q1, [x1, #32] + uzp2 v31.4S, v0.4S, v0.4S + ldp x5, x10, [x1, #32] + umulh x8, x9, x4 + uaddlp v3.2D, v27.4S + umull v16.2D, v30.2S, v17.2S + mul x16, x9, x4 + umull v27.2D, v30.2S, v28.2S + shrn v0.2S, v20.2D, #32 + xtn v7.2S, v20.2D + shl v20.2D, v3.2D, #32 + umull v3.2D, v31.2S, v28.2S + mul x3, x2, x4 + umlal v20.2D, v30.2S, v17.2S + umull v22.2D, v7.2S, v0.2S + usra v27.2D, v16.2D, #32 + umulh x11, x2, x4 + movi v21.2D, #0x00000000ffffffff + uzp2 v28.4S, v1.4S, v1.4S + adds x15, x16, x7 + and v5.16B, v27.16B, v21.16B + adcs x3, x3, x8 + usra v3.2D, v27.2D, #32 + dup v29.2D, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2D, v31.2S, v17.2S + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2D, v22.2D, #33 + xtn v25.2S, v29.2D + rev64 v31.4S, v1.4S + lsl x13, x14, #32 + uzp2 v6.4S, v29.4S, v29.4S + umlal v19.2D, v7.2S, v7.2S + usra v3.2D, v5.2D, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4S, v31.4S, v29.4S + xtn v4.2S, v1.2D + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2D, v25.2S, v28.2S + adcs x11, x16, x16 + umull v21.2D, v25.2S, v4.2S + mov x17, v3.d[0] + umull v18.2D, v6.2S, v28.2S + adc x16, x8, xzr + uaddlp v16.2D, v17.4S + movi v1.2D, #0x00000000ffffffff + subs x13, x13, x12 + usra v31.2D, v21.2D, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2D, v16.2D, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16B, v31.16B, v1.16B + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2D, v6.2S, v4.2S + usra v18.2D, v31.2D, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2D, v25.2S, v4.2S + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2D, v3.2D, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, 
x1, [x0] // @slothy:writes=buffer0 + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [x0, #16] // @slothy:writes=buffer16 + csetm x15, cc + cneg x1, x1, cc + stp x11, x14, [x0, #32] // @slothy:writes=buffer32 + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc + cinv x16, x15, cc + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc + cneg x9, x9, cc + subs x4, x2, x4 + cneg x4, x4, cc + csetm x7, cc + subs x2, x10, x6 + cinv x8, x8, cc + cneg x2, x2, cc + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc + cneg x1, x1, cc + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [x0, #16] // @slothy:reads=buffer16 + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [x0] // @slothy:reads=buffer0 + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [x0, #32] // @slothy:reads=buffer32 + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 + adcs x14, x14, x2 + mov x2, #0x1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, 
x8, x1 + stp x11, x5, [x0] // @slothy:writes=buffer0 + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [x0, #16] // @slothy:writes=buffer16 + adc x17, x14, xzr + stp x2, x17, [x0, #32] // depth 72 // @slothy:writes=buffer32 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif From 0462e615cbeb35669d0c545b52bb785628d7c50a Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Thu, 6 Jun 2024 12:02:20 -0500 Subject: [PATCH 23/24] Add `bignum_mont{sqr,mul}_p521_neon` This patch adds `bignum_mont{sqr,mul}_p521_neon`. ``` bignum_montsqr_p521 : 114.7 ns each (var 0.2%, corr 0.06) = 8720010 ops/sec bignum_montsqr_p521_neon : 83.8 ns each (var 0.4%, corr -0.04) = 11926387 ops/sec bignum_montmul_p521 : 130.8 ns each (var 0.2%, corr -0.00) = 7644702 ops/sec bignum_montmul_p521_neon : 111.4 ns each (var 0.2%, corr 0.04) = 8978421 ops/sec ``` The new subroutine specs are added to specification.txt, and test as well as benchmark are updated. Modular squaring/multiplication functions are not included in this patch. This patch also contains the following updates: - A tactic for showing equivalence of loops is added (the tactic is not used yet). - Definitions for input state equivalence are canonicalized as `.. /\ (?a. read c1 s = a /\ read c1 s' = a /\ (?b. read c2 s = b /\ read c2 s' = b /\ ( ... )))` - Minor buggy behaviors in equiv tactics are fixed and performance improvements done s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/65f046ecbfa1842806720f352d3181b7caf99ce3 --- arm/p521/Makefile | 2 + arm/p521/bignum_montmul_p521_neon.S | 1415 +++++++++++++++++++++++++++ arm/p521/bignum_montsqr_p521_neon.S | 1124 +++++++++++++++++++++ 3 files changed, 2541 insertions(+) create mode 100644 arm/p521/bignum_montmul_p521_neon.S create mode 100644 arm/p521/bignum_montsqr_p521_neon.S diff --git a/arm/p521/Makefile b/arm/p521/Makefile index 7980bdd9d4..64db072532 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -32,8 +32,10 @@ OBJ = bignum_add_p521.o \ bignum_mod_p521_9.o \ bignum_montmul_p521.o \ bignum_montmul_p521_alt.o \ + bignum_montmul_p521_neon.o \ bignum_montsqr_p521.o \ bignum_montsqr_p521_alt.o \ + bignum_montsqr_p521_neon.o \ bignum_mul_p521.o \ bignum_mul_p521_alt.o \ bignum_neg_p521.o \ diff --git a/arm/p521/bignum_montmul_p521_neon.S b/arm/p521/bignum_montmul_p521_neon.S new file mode 100644 index 0000000000..9586339f95 --- /dev/null +++ b/arm/p521/bignum_montmul_p521_neon.S @@ -0,0 +1,1415 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^576) mod p_521 +// Inputs x[9], y[9]; output z[9] +// +// extern void bignum_montmul_p521_neon +// (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); +// +// Does z := (x * y / 2^576) mod p_521, assuming x < p_521, y < p_521. This +// means the Montgomery base is the "native size" 2^{9*64} = 2^576; since +// p_521 is a Mersenne prime the basic modular multiplication bignum_mul_p521 +// can be considered a Montgomery operation to base 2^521. +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_montmul_p521_neon is functionally equivalent to bignum_montmul_p521. +// It is written in a way that +// 1. 
A subset of scalar multiplications in bignum_montmul_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// stp x25, x26, [sp, #-16]! +// sub sp, sp, #80 +// ldp x14, x7, [x1] +// ldp x3, x25, [x1, #16] +// ldp x10, x24, [x2] +// ldr q0, [x1] +// ldr q25, [x2] +// ldp x12, x6, [x2, #16] +// movi v18.2D, #0x00000000ffffffff +// uzp2 v3.4S, v25.4S, v25.4S +// xtn v26.2S, v0.2D +// xtn v22.2S, v25.2D +// rev64 v24.4S, v25.4S +// umull v19.2D, v26.2S, v22.2S +// umull v25.2D, v26.2S, v3.2S +// uzp2 v20.4S, v0.4S, v0.4S +// mul v0.4S, v24.4S, v0.4S +// usra v25.2D, v19.2D, #32 +// umull v6.2D, v20.2S, v3.2S +// uaddlp v0.2D, v0.4S +// and v18.16B, v25.16B, v18.16B +// umlal v18.2D, v20.2S, v22.2S +// shl v0.2D, v0.2D, #32 +// usra v6.2D, v25.2D, #32 +// umlal v0.2D, v26.2S, v22.2S +// usra v6.2D, v18.2D, #32 +// mov x23, v0.d[0] +// mov x16, v0.d[1] +// mul x5, x3, x12 +// mul x21, x25, x6 +// mov x19, v6.d[0] +// adds x16, x16, x19 +// mov x19, v6.d[1] +// adcs x5, x5, x19 +// umulh x19, x3, x12 +// adcs x21, x21, x19 +// umulh x19, x25, x6 +// adc x19, x19, xzr +// adds x8, x16, x23 +// adcs x16, x5, x16 +// adcs x5, x21, x5 +// adcs x21, x19, x21 +// adc x19, xzr, x19 +// adds x11, x16, x23 +// adcs x15, x5, x8 +// adcs x16, x21, x16 +// adcs x5, x19, x5 +// adcs x21, xzr, x21 +// adc x19, xzr, x19 +// subs x20, x3, x25 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x6, x12 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x5, x5, x13 +// eor x20, x20, x9 +// adcs x21, x21, x20 +// adc x19, x19, x9 +// subs x20, x14, x7 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x24, x10 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x8, x8, x13 +// eor x20, x20, x9 +// adcs x11, x11, x20 +// adcs x15, x15, x9 +// adcs x16, x16, x9 +// adcs x5, x5, x9 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x20, x7, x25 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x6, x24 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x16, x16, x13 +// eor x20, x20, x9 +// adcs x5, x5, x20 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x20, x14, x3 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x12, x10 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x11, x11, x13 +// eor x20, x20, x9 +// adcs x15, x15, x20 +// adcs x16, x16, x9 +// adcs x5, x5, x9 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x25, x14, x25 +// cneg x25, x25, cc +// csetm x20, cc +// subs x10, x6, x10 +// cneg x10, x10, cc +// mul x6, x25, x10 +// umulh x25, x25, x10 +// cinv x10, x20, cc +// cmn x10, #0x1 +// eor x6, x6, x10 +// adcs x6, x15, x6 +// eor x25, x25, x10 +// adcs x25, x16, x25 +// adcs x16, x5, x10 +// adcs x5, x21, x10 +// adc x10, x19, x10 +// subs x7, x7, x3 +// cneg x7, x7, cc +// csetm x3, cc +// subs x24, x12, x24 +// cneg x24, x24, cc +// mul x12, x7, x24 +// umulh x7, x7, x24 +// cinv x3, x3, cc +// cmn x3, #0x1 +// eor x24, x12, x3 +// adcs x24, x6, x24 +// eor x7, x7, x3 +// adcs x7, x25, x7 +// 
adcs x25, x16, x3 +// adcs x12, x5, x3 +// adc x3, x10, x3 +// lsl x10, x23, #9 +// extr x6, x8, x23, #55 +// extr x23, x11, x8, #55 +// extr x16, x24, x11, #55 +// lsr x24, x24, #55 +// stp x7, x25, [sp] // @slothy:writes=stack0 +// stp x12, x3, [sp, #16] // @slothy:writes=stack16 +// stp x10, x6, [sp, #32] // @slothy:writes=stack32 +// stp x23, x16, [sp, #48] // @slothy:writes=stack48 +// str x24, [sp, #64] // @slothy:writes=stack64 +// ldp x7, x3, [x1, #32] +// ldr q0, [x1, #32] +// ldp x25, x10, [x1, #48] +// ldp x24, x12, [x2, #32] +// ldr q25, [x2, #32] +// ldp x6, x23, [x2, #48] +// ldr q18, [x1, #48] +// ldr q3, [x2, #48] +// uzp1 v26.4S, v25.4S, v0.4S +// rev64 v25.4S, v25.4S +// uzp1 v22.4S, v0.4S, v0.4S +// mul v0.4S, v25.4S, v0.4S +// uaddlp v0.2D, v0.4S +// shl v0.2D, v0.2D, #32 +// umlal v0.2D, v22.2S, v26.2S +// mov x16, v0.d[0] +// mov x5, v0.d[1] +// movi v0.2D, #0x00000000ffffffff +// uzp2 v25.4S, v3.4S, v3.4S +// xtn v26.2S, v18.2D +// xtn v22.2S, v3.2D +// rev64 v24.4S, v3.4S +// umull v19.2D, v26.2S, v22.2S +// umull v3.2D, v26.2S, v25.2S +// uzp2 v20.4S, v18.4S, v18.4S +// mul v18.4S, v24.4S, v18.4S +// usra v3.2D, v19.2D, #32 +// umull v6.2D, v20.2S, v25.2S +// uaddlp v25.2D, v18.4S +// and v0.16B, v3.16B, v0.16B +// umlal v0.2D, v20.2S, v22.2S +// shl v25.2D, v25.2D, #32 +// usra v6.2D, v3.2D, #32 +// umlal v25.2D, v26.2S, v22.2S +// usra v6.2D, v0.2D, #32 +// mov x21, v25.d[0] +// mov x19, v25.d[1] +// umulh x8, x7, x24 +// adds x5, x5, x8 +// umulh x8, x3, x12 +// adcs x21, x21, x8 +// mov x8, v6.d[0] +// adcs x19, x19, x8 +// mov x8, v6.d[1] +// adc x8, x8, xzr +// adds x11, x5, x16 +// adcs x5, x21, x5 +// adcs x21, x19, x21 +// adcs x19, x8, x19 +// adc x8, xzr, x8 +// adds x15, x5, x16 +// adcs x20, x21, x11 +// adcs x5, x19, x5 +// adcs x21, x8, x21 +// adcs x19, xzr, x19 +// adc x8, xzr, x8 +// subs x9, x25, x10 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x23, x6 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x21, x21, x26 +// eor x9, x9, x13 +// adcs x19, x19, x9 +// adc x8, x8, x13 +// subs x9, x7, x3 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x12, x24 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x11, x11, x26 +// eor x9, x9, x13 +// adcs x15, x15, x9 +// adcs x20, x20, x13 +// adcs x5, x5, x13 +// adcs x21, x21, x13 +// adcs x19, x19, x13 +// adc x8, x8, x13 +// subs x9, x3, x10 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x23, x12 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x5, x5, x26 +// eor x9, x9, x13 +// adcs x14, x21, x9 +// adcs x21, x19, x13 +// adc x19, x8, x13 +// subs x9, x7, x25 +// cneg x8, x9, cc +// csetm x9, cc +// subs x13, x6, x24 +// cneg x13, x13, cc +// mul x26, x8, x13 +// umulh x8, x8, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x15, x15, x13 +// eor x8, x8, x9 +// adcs x8, x20, x8 +// adcs x5, x5, x9 +// adcs x20, x14, x9 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x9, x7, x10 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x23, x24 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x8, x8, x26 +// eor x9, x9, x13 +// adcs x5, x5, x9 +// adcs x20, x20, x13 +// adcs x21, x21, x13 +// adc x19, x19, x13 +// subs x9, x3, 
x25 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x6, x12 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x8, x8, x26 +// eor x9, x9, x13 +// adcs x5, x5, x9 +// adcs x20, x20, x13 +// adcs x21, x21, x13 +// adc x19, x19, x13 +// ldp x9, x13, [sp] // @slothy:reads=stack0 +// adds x16, x16, x9 +// adcs x11, x11, x13 +// stp x16, x11, [sp] // @slothy:writes=stack0 +// ldp x16, x11, [sp, #16] // @slothy:reads=stack16 +// adcs x16, x15, x16 +// adcs x8, x8, x11 +// stp x16, x8, [sp, #16] // @slothy:writes=stack16 +// ldp x16, x8, [sp, #32] // @slothy:reads=stack32 +// adcs x16, x5, x16 +// adcs x5, x20, x8 +// stp x16, x5, [sp, #32] // @slothy:writes=stack32 +// ldp x16, x5, [sp, #48] // @slothy:reads=stack48 +// adcs x16, x21, x16 +// adcs x5, x19, x5 +// stp x16, x5, [sp, #48] // @slothy:writes=stack48 +// ldr x16, [sp, #64] // @slothy:reads=stack64 +// adc x16, x16, xzr +// str x16, [sp, #64] // @slothy:writes=stack64 +// ldp x16, x5, [x1] +// subs x7, x7, x16 +// sbcs x3, x3, x5 +// ldp x16, x5, [x1, #16] +// sbcs x25, x25, x16 +// sbcs x10, x10, x5 +// csetm x16, cc +// ldp x5, x21, [x2] +// subs x24, x5, x24 +// sbcs x12, x21, x12 +// ldp x5, x19, [x2, #16] +// sbcs x6, x5, x6 +// sbcs x23, x19, x23 +// csetm x5, cc +// eor x7, x7, x16 +// subs x7, x7, x16 +// eor x3, x3, x16 +// sbcs x3, x3, x16 +// eor x25, x25, x16 +// sbcs x25, x25, x16 +// eor x10, x10, x16 +// sbc x10, x10, x16 +// eor x24, x24, x5 +// subs x24, x24, x5 +// eor x12, x12, x5 +// sbcs x12, x12, x5 +// eor x6, x6, x5 +// sbcs x6, x6, x5 +// eor x23, x23, x5 +// sbc x23, x23, x5 +// eor x16, x5, x16 +// mul x21, x7, x24 +// mul x5, x3, x12 +// mul x19, x25, x6 +// mul x8, x10, x23 +// umulh x11, x7, x24 +// adds x5, x5, x11 +// umulh x11, x3, x12 +// adcs x19, x19, x11 +// umulh x11, x25, x6 +// adcs x8, x8, x11 +// umulh x11, x10, x23 +// adc x11, x11, xzr +// adds x15, x5, x21 +// adcs x5, x19, x5 +// adcs x19, x8, x19 +// adcs x8, x11, x8 +// adc x11, xzr, x11 +// adds x20, x5, x21 +// adcs x9, x19, x15 +// adcs x5, x8, x5 +// adcs x19, x11, x19 +// adcs x8, xzr, x8 +// adc x11, xzr, x11 +// subs x13, x25, x10 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x23, x6 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x19, x19, x22 +// eor x13, x13, x26 +// adcs x8, x8, x13 +// adc x11, x11, x26 +// subs x13, x7, x3 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x12, x24 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x15, x15, x22 +// eor x13, x13, x26 +// adcs x20, x20, x13 +// adcs x9, x9, x26 +// adcs x5, x5, x26 +// adcs x19, x19, x26 +// adcs x8, x8, x26 +// adc x11, x11, x26 +// subs x13, x3, x10 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x23, x12 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x5, x5, x22 +// eor x13, x13, x26 +// adcs x19, x19, x13 +// adcs x8, x8, x26 +// adc x11, x11, x26 +// subs x13, x7, x25 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x6, x24 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x20, x20, x22 +// eor x13, x13, x26 +// adcs x9, x9, x13 +// adcs x5, x5, x26 +// adcs x19, x19, x26 +// adcs x8, x8, x26 +// adc 
x11, x11, x26 +// subs x7, x7, x10 +// cneg x7, x7, cc +// csetm x10, cc +// subs x24, x23, x24 +// cneg x24, x24, cc +// mul x23, x7, x24 +// umulh x7, x7, x24 +// cinv x10, x10, cc +// cmn x10, #0x1 +// eor x24, x23, x10 +// adcs x24, x9, x24 +// eor x7, x7, x10 +// adcs x7, x5, x7 +// adcs x23, x19, x10 +// adcs x5, x8, x10 +// adc x10, x11, x10 +// subs x3, x3, x25 +// cneg x3, x3, cc +// csetm x25, cc +// subs x12, x6, x12 +// cneg x12, x12, cc +// mul x6, x3, x12 +// umulh x3, x3, x12 +// cinv x25, x25, cc +// cmn x25, #0x1 +// eor x12, x6, x25 +// adcs x24, x24, x12 +// eor x3, x3, x25 +// adcs x7, x7, x3 +// adcs x3, x23, x25 +// adcs x12, x5, x25 +// adc x25, x10, x25 +// ldp x10, x6, [sp] // @slothy:reads=stack0 +// ldp x23, x5, [sp, #16] // @slothy:reads=stack16 +// eor x21, x21, x16 +// adds x21, x21, x10 +// eor x19, x15, x16 +// adcs x19, x19, x6 +// eor x8, x20, x16 +// adcs x8, x8, x23 +// eor x24, x24, x16 +// adcs x24, x24, x5 +// eor x7, x7, x16 +// ldp x11, x15, [sp, #32] // @slothy:reads=stack32 +// ldp x20, x9, [sp, #48] // @slothy:reads=stack48 +// ldr x13, [sp, #64] // @slothy:reads=stack64 +// adcs x7, x7, x11 +// eor x3, x3, x16 +// adcs x3, x3, x15 +// eor x12, x12, x16 +// adcs x12, x12, x20 +// eor x25, x25, x16 +// adcs x25, x25, x9 +// adc x26, x13, xzr +// adds x7, x7, x10 +// adcs x3, x3, x6 +// adcs x10, x12, x23 +// adcs x25, x25, x5 +// and x12, x16, #0x1ff +// lsl x6, x21, #9 +// orr x12, x6, x12 +// adcs x12, x11, x12 +// extr x6, x19, x21, #55 +// adcs x6, x15, x6 +// extr x23, x8, x19, #55 +// adcs x23, x20, x23 +// extr x16, x24, x8, #55 +// adcs x16, x9, x16 +// lsr x24, x24, #55 +// adc x24, x24, x13 +// ldr x5, [x2, #64] +// ldp x21, x19, [x1] +// and x8, x21, #0xfffffffffffff +// mul x8, x5, x8 +// ldr x11, [x1, #64] +// ldp x15, x20, [x2] +// and x9, x15, #0xfffffffffffff +// mul x9, x11, x9 +// add x8, x8, x9 +// extr x21, x19, x21, #52 +// and x21, x21, #0xfffffffffffff +// mul x21, x5, x21 +// extr x15, x20, x15, #52 +// and x15, x15, #0xfffffffffffff +// mul x15, x11, x15 +// add x21, x21, x15 +// lsr x15, x8, #52 +// add x21, x21, x15 +// lsl x8, x8, #12 +// extr x8, x21, x8, #12 +// adds x7, x7, x8 +// ldp x8, x15, [x1, #16] +// ldp x9, x13, [x2, #16] +// extr x19, x8, x19, #40 +// and x19, x19, #0xfffffffffffff +// mul x19, x5, x19 +// extr x20, x9, x20, #40 +// and x20, x20, #0xfffffffffffff +// mul x20, x11, x20 +// add x19, x19, x20 +// lsr x20, x21, #52 +// add x19, x19, x20 +// lsl x21, x21, #12 +// extr x21, x19, x21, #24 +// adcs x3, x3, x21 +// extr x21, x15, x8, #28 +// and x21, x21, #0xfffffffffffff +// mul x21, x5, x21 +// extr x8, x13, x9, #28 +// and x8, x8, #0xfffffffffffff +// mul x8, x11, x8 +// add x21, x21, x8 +// lsr x8, x19, #52 +// add x21, x21, x8 +// lsl x19, x19, #12 +// extr x19, x21, x19, #36 +// adcs x10, x10, x19 +// and x19, x3, x10 +// ldp x8, x20, [x1, #32] +// ldp x9, x22, [x2, #32] +// extr x15, x8, x15, #16 +// and x15, x15, #0xfffffffffffff +// mul x4, x5, x15 +// extr x15, x9, x13, #16 +// and x15, x15, #0xfffffffffffff +// mul x15, x11, x15 +// add x15, x4, x15 +// lsl x13, x26, #48 +// add x15, x15, x13 +// lsr x13, x21, #52 +// add x15, x15, x13 +// lsl x21, x21, #12 +// extr x21, x15, x21, #48 +// adcs x25, x25, x21 +// and x21, x19, x25 +// lsr x19, x8, #4 +// and x19, x19, #0xfffffffffffff +// mul x19, x5, x19 +// lsr x26, x9, #4 +// and x13, x26, #0xfffffffffffff +// mul x26, x11, x13 +// add x19, x19, x26 +// lsr x13, x15, #52 +// add x19, x19, x13 +// lsl x15, x15, #12 +// extr x15, x19, x15, 
#60 +// extr x8, x20, x8, #56 +// and x8, x8, #0xfffffffffffff +// mul x8, x5, x8 +// extr x9, x22, x9, #56 +// and x9, x9, #0xfffffffffffff +// mul x9, x11, x9 +// add x8, x8, x9 +// lsr x19, x19, #52 +// add x19, x8, x19 +// lsl x8, x15, #8 +// extr x8, x19, x8, #8 +// adcs x12, x12, x8 +// and x21, x21, x12 +// ldp x1, x8, [x1, #48] +// ldp x2, x15, [x2, #48] +// extr x20, x1, x20, #44 +// and x20, x20, #0xfffffffffffff +// mul x20, x5, x20 +// extr x9, x2, x22, #44 +// and x9, x9, #0xfffffffffffff +// mul x9, x11, x9 +// add x20, x20, x9 +// lsr x9, x19, #52 +// add x22, x20, x9 +// lsl x19, x19, #12 +// extr x19, x22, x19, #20 +// adcs x6, x6, x19 +// and x21, x21, x6 +// extr x1, x8, x1, #32 +// and x1, x1, #0xfffffffffffff +// mul x1, x5, x1 +// extr x2, x15, x2, #32 +// and x2, x2, #0xfffffffffffff +// mul x2, x11, x2 +// add x2, x1, x2 +// lsr x1, x22, #52 +// add x2, x2, x1 +// lsl x1, x22, #12 +// extr x1, x2, x1, #32 +// adcs x23, x23, x1 +// and x21, x21, x23 +// lsr x1, x8, #20 +// mul x1, x5, x1 +// lsr x19, x15, #20 +// mul x19, x11, x19 +// add x1, x1, x19 +// lsr x19, x2, #52 +// add x19, x1, x19 +// lsl x2, x2, #12 +// extr x2, x19, x2, #44 +// adcs x16, x16, x2 +// and x2, x21, x16 +// mul x5, x5, x11 +// lsr x1, x19, #44 +// add x5, x5, x1 +// adc x24, x24, x5 +// lsr x5, x24, #9 +// orr x24, x24, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x7, x5 +// adcs xzr, x2, xzr +// adcs xzr, x24, xzr +// adcs x7, x7, x5 +// adcs x2, x3, xzr +// adcs x10, x10, xzr +// adcs x25, x25, xzr +// adcs x12, x12, xzr +// adcs x6, x6, xzr +// adcs x23, x23, xzr +// adcs x16, x16, xzr +// adc x3, x24, xzr +// stp x2, x10, [x0] // @slothy:writes=buffer0 +// stp x25, x12, [x0, #16] // @slothy:writes=buffer16 +// stp x6, x23, [x0, #32] // @slothy:writes=buffer32 +// lsl x25, x7, #9 +// and x3, x3, #0x1ff +// orr x3, x3, x25 +// stp x16, x3, [x0, #48] // @slothy:writes=buffer48 +// lsr x14, x7, #55 +// str x14, [x0, #64] // @slothy:writes=buffer64 +// add sp, sp, #80 +// ldp x25, x26, [sp], #16 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montmul_p521_neon): + +// Save registers and make space for the temporary buffer + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! 
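+// (The 80 bytes reserved below cover the nine 64-bit stack temporaries used
+// at [sp] through [sp, #64], i.e. 72 bytes rounded up to 80 so that sp stays
+// 16-byte aligned.)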
+ sub sp, sp, #80 + + ldr q24, [x2] + ldr q21, [x1] + ldr q1, [x2, #48] + ldp x23, x20, [x1, #16] + movi v18.2D, #0x00000000ffffffff + ldp x19, x17, [x2, #16] + uzp2 v3.4S, v24.4S, v24.4S + xtn v6.2S, v21.2D + ldp x11, x22, [x1] + rev64 v5.4S, v24.4S + xtn v24.2S, v24.2D + subs x16, x23, x20 + umull v29.2D, v6.2S, v3.2S + rev64 v31.4S, v1.4S + cneg x26, x16, cc + umull v27.2D, v6.2S, v24.2S + ldr q19, [x1, #48] + csetm x12, cc + mul x15, x20, x17 + mul v26.4S, v5.4S, v21.4S + uzp2 v28.4S, v21.4S, v21.4S + subs x6, x17, x19 + xtn v7.2S, v1.2D + cinv x10, x12, cc + cneg x3, x6, cc + uzp2 v21.4S, v1.4S, v1.4S + umull v1.2D, v28.2S, v3.2S + mul x12, x26, x3 + usra v29.2D, v27.2D, #32 + mul v25.4S, v31.4S, v19.4S + usra v1.2D, v29.2D, #32 + uaddlp v31.2D, v26.4S + umulh x14, x26, x3 + eor x12, x12, x10 + and v26.16B, v29.16B, v18.16B + uaddlp v2.2D, v25.4S + subs x16, x11, x22 + shl v0.2D, v31.2D, #32 + xtn v31.2S, v19.2D + cneg x6, x16, cc + shl v16.2D, v2.2D, #32 + umlal v26.2D, v28.2S, v24.2S + umlal v0.2D, v6.2S, v24.2S + uzp2 v30.4S, v19.4S, v19.4S + umulh x26, x20, x17 + umull v22.2D, v31.2S, v21.2S + umull v29.2D, v30.2S, v21.2S + usra v1.2D, v26.2D, #32 + mul x13, x23, x19 + eor x9, x14, x10 + ldr q5, [x2, #32] + umull v26.2D, v31.2S, v7.2S + ldp x21, x4, [x2] + csetm x8, cc + mov x16, v0.d[1] + ldr q6, [x1, #32] + umlal v16.2D, v31.2S, v7.2S + mov x3, v0.d[0] + umulh x14, x23, x19 + mov x25, v1.d[1] + mov x5, v1.d[0] + usra v22.2D, v26.2D, #32 + rev64 v3.4S, v5.4S + adds x16, x16, x5 + uzp1 v24.4S, v5.4S, v6.4S + movi v26.2D, #0x00000000ffffffff + adcs x7, x13, x25 + uzp1 v0.4S, v6.4S, v6.4S + mul v5.4S, v3.4S, v6.4S + adcs x25, x15, x14 + adc x13, x26, xzr + adds x26, x16, x3 + and v6.16B, v22.16B, v26.16B + usra v29.2D, v22.2D, #32 + adcs x16, x7, x16 + adcs x14, x25, x7 + umlal v6.2D, v30.2S, v7.2S + adcs x7, x13, x25 + uaddlp v7.2D, v5.4S + adc x13, xzr, x13 + adds x25, x16, x3 + adcs x24, x14, x26 + shl v1.2D, v7.2D, #32 + adcs x5, x7, x16 + usra v29.2D, v6.2D, #32 + adcs x16, x13, x14 + umlal v1.2D, v0.2S, v24.2S + adcs x14, xzr, x7 + adc x13, xzr, x13 + subs x7, x4, x21 + cneg x7, x7, cc + mul x15, x6, x7 + umulh x7, x6, x7 + cinv x6, x8, cc + cmn x10, #0x1 + adcs x16, x16, x12 + eor x8, x15, x6 + adcs x14, x14, x9 + adc x9, x13, x10 + subs x13, x22, x20 + cneg x13, x13, cc + csetm x10, cc + subs x12, x17, x4 + cinv x15, x10, cc + cneg x10, x12, cc + cmn x6, #0x1 + umulh x12, x13, x10 + eor x7, x7, x6 + adcs x26, x26, x8 + adcs x7, x25, x7 + adcs x8, x24, x6 + adcs x24, x5, x6 + adcs x25, x16, x6 + mul x5, x13, x10 + adcs x13, x14, x6 + adc x14, x9, x6 + subs x10, x11, x23 + csetm x16, cc + cneg x9, x10, cc + subs x6, x19, x21 + cinv x10, x16, cc + cneg x16, x6, cc + eor x5, x5, x15 + subs x20, x11, x20 + mul x6, x9, x16 + csetm x11, cc + cneg x20, x20, cc + subs x17, x17, x21 + cneg x17, x17, cc + cinv x11, x11, cc + umulh x9, x9, x16 + eor x16, x12, x15 + subs x21, x22, x23 + cneg x22, x21, cc + eor x12, x6, x10 + csetm x6, cc + cmn x15, #0x1 + eor x9, x9, x10 + adcs x5, x24, x5 + umulh x23, x20, x17 + lsl x24, x3, #9 + adcs x25, x25, x16 + adcs x21, x13, x15 + adc x16, x14, x15 + subs x13, x19, x4 + cneg x14, x13, cc + cinv x15, x6, cc + cmn x10, #0x1 + mul x13, x20, x17 + extr x17, x26, x3, #55 + adcs x12, x7, x12 + adcs x8, x8, x9 + eor x19, x23, x11 + adcs x6, x5, x10 + eor x13, x13, x11 + mov x5, v29.d[0] + adcs x25, x25, x10 + extr x26, x12, x26, #55 + mul x4, x22, x14 + adcs x7, x21, x10 + stp x24, x17, [sp, #32] + ldp x20, x21, [x1, #48] + adc x24, x16, x10 + cmn x11, 
#0x1 + mov x16, v16.d[0] + umulh x17, x22, x14 + adcs x13, x8, x13 + eor x9, x4, x15 + adcs x10, x6, x19 + ldp x22, x23, [x1, #32] + adcs x3, x25, x11 + ldp x4, x19, [x2, #32] + eor x17, x17, x15 + adcs x7, x7, x11 + adc x14, x24, x11 + subs x6, x20, x21 + csetm x11, cc + cneg x8, x6, cc + cmn x15, #0x1 + umulh x25, x22, x4 + adcs x24, x13, x9 + adcs x10, x10, x17 + extr x13, x24, x12, #55 + adcs x9, x3, x15 + ldp x17, x3, [x2, #48] + umulh x6, x23, x19 + adcs x7, x7, x15 + adc x14, x14, x15 + subs x12, x22, x23 + stp x10, x9, [sp] + mov x9, v1.d[1] + csetm x10, cc + stp x7, x14, [sp, #16] + cneg x12, x12, cc + subs x14, x3, x17 + mov x7, v16.d[1] + cneg x15, x14, cc + mov x14, v29.d[1] + cinv x11, x11, cc + adds x9, x9, x25 + mul x25, x8, x15 + stp x26, x13, [sp, #48] + lsr x24, x24, #55 + adcs x26, x16, x6 + mov x13, v1.d[0] + str x24, [sp, #64] + adcs x7, x7, x5 + adc x5, x14, xzr + umulh x6, x8, x15 + eor x15, x25, x11 + subs x25, x19, x4 + cinv x16, x10, cc + cneg x10, x25, cc + eor x6, x6, x11 + adds x8, x9, x13 + adcs x14, x26, x9 + mul x9, x12, x10 + adcs x24, x7, x26 + adcs x7, x5, x7 + umulh x25, x12, x10 + adc x12, xzr, x5 + adds x26, x14, x13 + eor x10, x9, x16 + adcs x9, x24, x8 + adcs x5, x7, x14 + adcs x14, x12, x24 + adcs x7, xzr, x7 + adc x12, xzr, x12 + eor x24, x25, x16 + cmn x11, #0x1 + adcs x25, x14, x15 + adcs x14, x7, x6 + adc x11, x12, x11 + subs x12, x23, x21 + csetm x15, cc + cneg x7, x12, cc + subs x12, x3, x19 + cneg x12, x12, cc + cinv x15, x15, cc + cmn x16, #0x1 + adcs x6, x8, x10 + mul x10, x7, x12 + adcs x26, x26, x24 + adcs x9, x9, x16 + umulh x24, x7, x12 + eor x8, x10, x15 + adcs x5, x5, x16 + adcs x25, x25, x16 + adcs x7, x14, x16 + adc x16, x11, x16 + subs x11, x22, x20 + cneg x11, x11, cc + csetm x14, cc + subs x10, x17, x4 + cinv x14, x14, cc + cneg x10, x10, cc + cmn x15, #0x1 + eor x12, x24, x15 + adcs x5, x5, x8 + mul x24, x11, x10 + adcs x8, x25, x12 + adcs x25, x7, x15 + adc x16, x16, x15 + subs x12, x22, x21 + umulh x10, x11, x10 + cneg x15, x12, cc + csetm x11, cc + subs x12, x3, x4 + cneg x12, x12, cc + cinv x7, x11, cc + mul x11, x15, x12 + eor x24, x24, x14 + cmn x14, #0x1 + eor x10, x10, x14 + adcs x24, x26, x24 + eor x26, x11, x7 + adcs x10, x9, x10 + ldp x11, x9, [x1, #16] + umulh x15, x15, x12 + adcs x5, x5, x14 + adcs x8, x8, x14 + adcs x25, x25, x14 + adc x12, x16, x14 + cmn x7, #0x1 + adcs x16, x10, x26 + eor x14, x15, x7 + adcs x26, x5, x14 + ldp x5, x10, [x1] + adcs x14, x8, x7 + adcs x15, x25, x7 + adc x7, x12, x7 + subs x25, x23, x20 + cneg x25, x25, cc + csetm x8, cc + subs x22, x22, x5 + sbcs x10, x23, x10 + ldp x23, x12, [x2] + sbcs x20, x20, x11 + sbcs x21, x21, x9 + csetm x9, cc + subs x11, x17, x19 + cneg x5, x11, cc + cinv x11, x8, cc + subs x23, x23, x4 + sbcs x19, x12, x19 + eor x20, x20, x9 + ldp x12, x4, [x2, #16] + eor x21, x21, x9 + umulh x8, x25, x5 + eor x22, x22, x9 + eor x10, x10, x9 + sbcs x17, x12, x17 + sbcs x3, x4, x3 + mul x25, x25, x5 + csetm x12, cc + subs x22, x22, x9 + eor x4, x23, x12 + sbcs x23, x10, x9 + eor x10, x3, x12 + sbcs x20, x20, x9 + eor x5, x8, x11 + eor x3, x19, x12 + sbc x21, x21, x9 + subs x4, x4, x12 + eor x25, x25, x11 + sbcs x19, x3, x12 + eor x3, x17, x12 + sbcs x17, x3, x12 + umulh x8, x23, x19 + sbc x3, x10, x12 + cmn x11, #0x1 + adcs x25, x16, x25 + adcs x26, x26, x5 + ldp x10, x5, [sp] + adcs x16, x14, x11 + mul x14, x22, x4 + adcs x15, x15, x11 + adc x7, x7, x11 + adds x11, x13, x10 + umulh x10, x21, x3 + adcs x13, x6, x5 + ldp x6, x5, [sp, #16] + stp x11, x13, [sp] + eor x13, 
x12, x9 + mul x9, x23, x19 + adcs x6, x24, x6 + ldp x11, x24, [sp, #32] + mul x12, x20, x17 + adcs x25, x25, x5 + stp x6, x25, [sp, #16] + ldp x6, x25, [sp, #48] + umulh x5, x20, x17 + adcs x11, x26, x11 + ldr x26, [sp, #64] + adcs x16, x16, x24 + stp x11, x16, [sp, #32] + adcs x11, x15, x6 + umulh x24, x22, x4 + adcs x25, x7, x25 + adc x7, x26, xzr + stp x11, x25, [sp, #48] + subs x26, x20, x21 + csetm x15, cc + cneg x25, x26, cc + str x7, [sp, #64] + mul x11, x21, x3 + subs x6, x22, x23 + cneg x6, x6, cc + csetm x16, cc + subs x26, x3, x17 + cneg x26, x26, cc + cinv x7, x15, cc + adds x24, x9, x24 + adcs x8, x12, x8 + umulh x12, x25, x26 + adcs x5, x11, x5 + adc x11, x10, xzr + subs x15, x19, x4 + cinv x9, x16, cc + mul x26, x25, x26 + eor x25, x12, x7 + cneg x12, x15, cc + adds x16, x24, x14 + eor x15, x26, x7 + umulh x26, x6, x12 + adcs x10, x8, x24 + adcs x8, x5, x8 + adcs x24, x11, x5 + adc x5, xzr, x11 + adds x11, x10, x14 + mul x12, x6, x12 + adcs x6, x8, x16 + eor x14, x14, x13 + adcs x10, x24, x10 + adcs x8, x5, x8 + adcs x24, xzr, x24 + adc x5, xzr, x5 + cmn x7, #0x1 + adcs x15, x8, x15 + adcs x24, x24, x25 + eor x25, x26, x9 + adc x8, x5, x7 + eor x5, x12, x9 + subs x26, x23, x21 + cneg x12, x26, cc + csetm x26, cc + subs x7, x3, x19 + cneg x7, x7, cc + cinv x26, x26, cc + cmn x9, #0x1 + adcs x5, x16, x5 + mul x16, x12, x7 + adcs x25, x11, x25 + umulh x7, x12, x7 + adcs x12, x6, x9 + eor x11, x16, x26 + adcs x6, x10, x9 + adcs x10, x15, x9 + adcs x24, x24, x9 + adc x8, x8, x9 + subs x15, x22, x20 + cneg x15, x15, cc + csetm x9, cc + subs x16, x17, x4 + cneg x16, x16, cc + cinv x9, x9, cc + subs x21, x22, x21 + mul x22, x15, x16 + eor x7, x7, x26 + cneg x21, x21, cc + umulh x16, x15, x16 + csetm x15, cc + subs x4, x3, x4 + cneg x3, x4, cc + eor x4, x22, x9 + cinv x15, x15, cc + cmn x26, #0x1 + eor x22, x5, x13 + adcs x5, x6, x11 + adcs x6, x10, x7 + adcs x10, x24, x26 + eor x11, x16, x9 + adc x8, x8, x26 + subs x16, x23, x20 + cneg x7, x16, cc + csetm x23, cc + cmn x9, #0x1 + adcs x16, x25, x4 + mul x4, x21, x3 + adcs x24, x12, x11 + eor x11, x16, x13 + adcs x26, x5, x9 + adcs x16, x6, x9 + umulh x20, x21, x3 + adcs x6, x10, x9 + ldp x3, x10, [x1] + adc x12, x8, x9 + subs x21, x17, x19 + cneg x8, x21, cc + eor x25, x20, x15 + eor x20, x4, x15 + mul x19, x7, x8 + cinv x17, x23, cc + cmn x15, #0x1 + adcs x4, x24, x20 + extr x21, x10, x3, #52 + umulh x9, x7, x8 + and x24, x21, #0xfffffffffffff + adcs x26, x26, x25 + eor x7, x19, x17 + adcs x5, x16, x15 + and x23, x3, #0xfffffffffffff + eor x9, x9, x17 + adcs x21, x6, x15 + adc x6, x12, x15 + cmn x17, #0x1 + adcs x25, x4, x7 + and x4, x13, #0x1ff + ldp x16, x8, [sp] + adcs x20, x26, x9 + adcs x12, x5, x17 + ldp x3, x5, [sp, #16] + eor x15, x12, x13 + adcs x12, x21, x17 + adc x9, x6, x17 + adds x21, x14, x16 + lsl x7, x21, #9 + eor x26, x12, x13 + ldp x19, x17, [sp, #32] + orr x4, x7, x4 + eor x14, x25, x13 + adcs x7, x22, x8 + adcs x12, x11, x3 + eor x11, x20, x13 + ldp x6, x25, [sp, #48] + eor x20, x9, x13 + adcs x22, x14, x5 + ldr x14, [x2, #64] + adcs x9, x11, x19 + ldr x11, [sp, #64] + adcs x13, x15, x17 + adcs x26, x26, x6 + adcs x20, x20, x25 + adc x15, x11, xzr + adds x16, x9, x16 + mul x9, x14, x23 + adcs x23, x13, x8 + extr x13, x7, x21, #55 + adcs x21, x26, x3 + ldp x3, x26, [x1, #16] + extr x8, x22, x12, #55 + adcs x20, x20, x5 + adcs x19, x19, x4 + mul x4, x14, x24 + ldp x5, x24, [x2] + adcs x17, x17, x13 + extr x13, x26, x3, #28 + extr x10, x3, x10, #40 + extr x7, x12, x7, #55 + and x12, x13, #0xfffffffffffff + adcs x3, 
x6, x7 + ldr x6, [x1, #64] + extr x7, x24, x5, #52 + and x5, x5, #0xfffffffffffff + mul x12, x14, x12 + adcs x13, x25, x8 + and x7, x7, #0xfffffffffffff + ldp x8, x25, [x2, #16] + mul x5, x6, x5 + extr x24, x8, x24, #40 + and x24, x24, #0xfffffffffffff + add x9, x9, x5 + lsr x5, x22, #55 + mul x7, x6, x7 + extr x22, x25, x8, #28 + and x10, x10, #0xfffffffffffff + mul x10, x14, x10 + lsr x8, x9, #52 + lsl x9, x9, #12 + add x7, x4, x7 + adc x4, x5, x11 + ldp x11, x5, [x2, #32] + add x8, x7, x8 + and x7, x22, #0xfffffffffffff + extr x22, x8, x9, #12 + lsl x9, x15, #48 + mul x15, x6, x24 + add x10, x10, x15 + lsr x15, x8, #52 + extr x25, x11, x25, #16 + and x25, x25, #0xfffffffffffff + mul x24, x6, x7 + add x7, x10, x15 + lsr x10, x7, #52 + lsl x8, x8, #12 + extr x8, x7, x8, #24 + adds x22, x16, x22 + ldp x16, x15, [x1, #32] + adcs x23, x23, x8 + extr x8, x5, x11, #56 + mul x25, x6, x25 + add x24, x12, x24 + add x12, x24, x10 + lsr x10, x16, #4 + lsl x7, x7, #12 + extr x24, x12, x7, #36 + and x10, x10, #0xfffffffffffff + extr x26, x16, x26, #16 + mul x10, x14, x10 + and x8, x8, #0xfffffffffffff + adcs x21, x21, x24 + and x7, x26, #0xfffffffffffff + mul x7, x14, x7 + lsr x24, x11, #4 + and x24, x24, #0xfffffffffffff + extr x11, x15, x16, #56 + lsl x26, x12, #12 + and x16, x11, #0xfffffffffffff + mul x11, x6, x24 + lsr x12, x12, #52 + ldp x2, x24, [x2, #48] + add x25, x7, x25 + add x25, x25, x9 + and x9, x23, x21 + mul x8, x6, x8 + add x12, x25, x12 + add x25, x10, x11 + extr x11, x12, x26, #48 + ldp x7, x26, [x1, #48] + extr x5, x2, x5, #44 + lsr x1, x12, #52 + mul x10, x14, x16 + lsr x16, x24, #20 + add x10, x10, x8 + extr x8, x26, x7, #32 + and x8, x8, #0xfffffffffffff + extr x24, x24, x2, #32 + mul x2, x6, x16 + add x1, x25, x1 + lsr x25, x26, #20 + and x26, x24, #0xfffffffffffff + and x24, x5, #0xfffffffffffff + extr x16, x7, x15, #44 + mul x7, x6, x24 + adcs x11, x20, x11 + and x20, x16, #0xfffffffffffff + lsl x5, x12, #12 + and x15, x9, x11 + mul x24, x14, x20 + lsr x9, x1, #52 + add x20, x10, x9 + extr x12, x1, x5, #60 + lsl x9, x20, #12 + lsl x5, x12, #8 + mul x10, x14, x8 + extr x12, x20, x5, #8 + lsr x1, x20, #52 + add x7, x24, x7 + adcs x8, x19, x12 + and x5, x15, x8 + add x7, x7, x1 + mul x20, x6, x26 + extr x24, x7, x9, #20 + lsr x19, x7, #52 + mul x25, x14, x25 + lsl x16, x7, #12 + add x20, x10, x20 + adcs x12, x17, x24 + add x19, x20, x19 + lsr x26, x19, #52 + mul x24, x14, x6 + and x5, x5, x12 + add x6, x25, x2 + lsl x17, x19, #12 + add x14, x6, x26 + extr x16, x19, x16, #32 + lsr x6, x14, #44 + extr x19, x14, x17, #44 + add x9, x24, x6 + adcs x17, x3, x16 + adcs x2, x13, x19 + and x7, x5, x17 + adc x15, x4, x9 + cmp xzr, xzr + orr x1, x15, #0xfffffffffffffe00 + lsr x3, x15, #9 + adcs xzr, x22, x3 + and x15, x7, x2 + adcs xzr, x15, xzr + adcs xzr, x1, xzr + adcs x7, x22, x3 + lsl x3, x7, #9 + lsr x15, x7, #55 + str x15, [x0, #64] + adcs x13, x23, xzr + adcs x16, x21, xzr + stp x13, x16, [x0] + adcs x13, x11, xzr + adcs x16, x8, xzr + stp x13, x16, [x0, #16] + adcs x19, x12, xzr + adcs x16, x17, xzr + adcs x13, x2, xzr + stp x19, x16, [x0, #32] + adc x16, x1, xzr + and x16, x16, #0x1ff + orr x16, x16, x3 + stp x13, x16, [x0, #48] + +// Restore regs and return + + add sp, sp, #80 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/p521/bignum_montsqr_p521_neon.S b/arm/p521/bignum_montsqr_p521_neon.S new file mode 
100644 index 0000000000..c4d1173165 --- /dev/null +++ b/arm/p521/bignum_montsqr_p521_neon.S @@ -0,0 +1,1124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^576) mod p_521 +// Input x[9]; output z[9] +// +// extern void bignum_montsqr_p521_neon +// (uint64_t z[static 9], uint64_t x[static 9]); +// +// Does z := (x^2 / 2^576) mod p_521, assuming x < p_521. This means the +// Montgomery base is the "native size" 2^{9*64} = 2^576; since p_521 is +// a Mersenne prime the basic modular squaring bignum_sqr_p521 can be +// considered a Montgomery operation to base 2^521. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_montsqr_p521_neon is functionally equivalent to bignum_montsqr_p521. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// ldp x16, x8, [x1] +// ldr q18, [x1] +// ldr q5, [x1] +// ldr q20, [x1] +// ldp x17, x13, [x1, #16] +// ldr q17, [x1, #16] +// ldr q1, [x1, #16] +// ldr q28, [x1, #16] +// ldp x9, x15, [x1, #32] +// ldr q27, [x1] +// ldr q29, [x1, #32] +// ldp x23, x2, [x1, #48] +// ldr q6, [x1, #48] +// ldr q4, [x1, #48] +// mul x24, x9, x23 +// mul x11, x15, x2 +// umulh x20, x9, x23 +// subs x4, x9, x15 +// cneg x22, x4, cc +// csetm x12, cc +// subs x4, x2, x23 +// cneg x4, x4, cc +// mul x19, x22, x4 +// umulh x4, x22, x4 +// cinv x7, x12, cc +// eor x14, x19, x7 +// eor x22, x4, x7 +// adds x12, x24, x20 +// adc x19, x20, xzr +// umulh x4, x15, x2 +// adds x12, x12, x11 +// adcs x19, x19, x4 +// adc x4, x4, xzr +// adds x19, x19, x11 +// adc x4, x4, xzr +// cmn x7, #0x1 +// adcs x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, x7 +// adds x11, x24, x24 +// adcs x20, x12, x12 +// adcs x10, x19, x19 +// adcs x3, x4, x4 +// adc x5, xzr, xzr +// ldr q30, [x1, #32] +// umull v0.2D, v30.2S, v30.2S +// umull2 v2.2D, v30.4S, v30.4S +// xtn v24.2S, v30.2D +// uzp2 v30.4S, v30.4S, v30.4S +// umull v30.2D, v30.2S, v24.2S +// mov x7, v0.d[0] +// mov x14, v0.d[1] +// mov x19, v2.d[0] +// mov x22, v2.d[1] +// mov x4, v30.d[0] +// mov x12, v30.d[1] +// adds x21, x7, x4, lsl #33 +// lsr x4, x4, #31 +// adc x14, x14, x4 +// adds x19, x19, x12, lsl #33 +// lsr x4, x12, #31 +// adc x22, x22, x4 +// mul x4, x9, x15 +// umulh x12, x9, x15 +// adds x24, x14, x4, lsl #1 +// extr x4, x12, x4, #63 +// adcs x19, x19, x4 +// lsr x4, x12, #63 +// adc x4, x22, x4 +// adds x11, x11, x19 +// adcs x20, x20, x4 +// adcs x10, x10, xzr +// adcs x3, x3, xzr +// adc x6, x5, xzr +// movi v3.2D, #0x00000000ffffffff +// uzp2 v16.4S, v4.4S, v4.4S +// xtn v25.2S, v6.2D +// xtn v23.2S, v4.2D +// rev64 v30.4S, v4.4S +// umull v24.2D, v25.2S, v23.2S +// umull v0.2D, v25.2S, v16.2S +// uzp2 v2.4S, v6.4S, v6.4S +// mul v30.4S, v30.4S, v6.4S +// usra v0.2D, v24.2D, #32 +// umull v19.2D, v2.2S, v16.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v0.16B, v3.16B +// umlal v24.2D, v2.2S, v23.2S +// shl v30.2D, v30.2D, #32 +// usra v19.2D, v0.2D, #32 +// 
umlal v30.2D, v25.2S, v23.2S +// usra v19.2D, v24.2D, #32 +// mov x5, v30.d[0] +// mov x7, v30.d[1] +// mul x14, x23, x2 +// mov x19, v19.d[0] +// mov x4, v19.d[1] +// umulh x22, x23, x2 +// adds x12, x19, x14 +// adcs x19, x7, x22 +// adc x4, x4, xzr +// adds x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, xzr +// adds x7, x5, x10 +// adcs x3, x12, x3 +// adcs x14, x19, x6 +// adc x10, x4, xzr +// ldr x4, [x1, #64] +// add x6, x4, x4 +// mul x5, x4, x4 +// and x4, x16, #0xfffffffffffff +// mul x22, x6, x4 +// extr x4, x8, x16, #52 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x4, x12, x4, #12 +// adds x21, x21, x4 +// extr x4, x17, x8, #40 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x22, x19, x4 +// lsl x4, x12, #12 +// extr x4, x22, x4, #24 +// adcs x24, x24, x4 +// extr x4, x13, x17, #28 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x4, x12, x4, #36 +// adcs x11, x11, x4 +// extr x4, x9, x13, #16 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x22, x19, x4 +// lsl x4, x12, #12 +// extr x4, x22, x4, #48 +// adcs x20, x20, x4 +// lsr x4, x9, #4 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x22, x12, x4, #60 +// extr x4, x15, x9, #56 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x12, x19, x4 +// lsl x4, x22, #8 +// extr x4, x12, x4, #8 +// adcs x7, x7, x4 +// extr x4, x23, x15, #44 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x22, x19, x4 +// lsl x4, x12, #12 +// extr x4, x22, x4, #20 +// adcs x1, x3, x4 +// extr x4, x2, x23, #32 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x4, x12, x4, #32 +// adcs x14, x14, x4 +// lsr x4, x2, #20 +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x19, x19, x4 +// lsl x4, x12, #12 +// extr x4, x19, x4, #44 +// adcs x22, x10, x4 +// lsr x4, x19, #44 +// adc x12, x5, x4 +// extr x19, x24, x21, #9 +// extr x4, x11, x24, #9 +// stp x19, x4, [x0] // @slothy:writes=buffer0 +// extr x19, x20, x11, #9 +// extr x4, x7, x20, #9 +// stp x19, x4, [x0, #16] // @slothy:writes=buffer16 +// extr x19, x1, x7, #9 +// extr x4, x14, x1, #9 +// stp x19, x4, [x0, #32] // @slothy:writes=buffer32 +// extr x19, x22, x14, #9 +// extr x4, x12, x22, #9 +// stp x19, x4, [x0, #48] // @slothy:writes=buffer48 +// and x19, x21, #0x1ff +// lsr x4, x12, #9 +// add x4, x19, x4 +// str x4, [x0, #64] +// uzp1 v2.4S, v28.4S, v18.4S +// rev64 v30.4S, v28.4S +// uzp1 v24.4S, v18.4S, v18.4S +// mul v30.4S, v30.4S, v18.4S +// uaddlp v30.2D, v30.4S +// shl v30.2D, v30.2D, #32 +// umlal v30.2D, v24.2S, v2.2S +// mov x11, v30.d[0] +// mov x20, v30.d[1] +// umulh x7, x16, x17 +// subs x4, x16, x8 +// cneg x22, x4, cc +// csetm x12, cc +// subs x4, x13, x17 +// cneg x4, x4, cc +// mul x19, x22, x4 +// umulh x4, x22, x4 +// cinv x1, x12, cc +// eor x14, x19, x1 +// eor x22, x4, x1 +// adds x12, x11, x7 +// adc x19, x7, xzr +// umulh x4, x8, x13 +// adds x12, x12, x20 +// adcs x19, x19, x4 +// adc x4, x4, xzr +// adds x19, x19, x20 +// adc x4, x4, xzr +// cmn x1, #0x1 +// adcs x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, x1 +// adds x21, x11, x11 +// adcs x24, x12, x12 +// adcs x11, x19, x19 +// adcs x20, x4, x4 +// adc x7, xzr, xzr +// 
movi v3.2D, #0x00000000ffffffff +// uzp2 v16.4S, v20.4S, v20.4S +// xtn v25.2S, v5.2D +// xtn v23.2S, v20.2D +// rev64 v30.4S, v20.4S +// umull v24.2D, v25.2S, v23.2S +// umull v0.2D, v25.2S, v16.2S +// uzp2 v2.4S, v5.4S, v5.4S +// mul v30.4S, v30.4S, v5.4S +// usra v0.2D, v24.2D, #32 +// umull v19.2D, v2.2S, v16.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v0.16B, v3.16B +// umlal v24.2D, v2.2S, v23.2S +// shl v30.2D, v30.2D, #32 +// usra v19.2D, v0.2D, #32 +// umlal v30.2D, v25.2S, v23.2S +// usra v19.2D, v24.2D, #32 +// mov x10, v30.d[0] +// mov x1, v30.d[1] +// mul x14, x16, x8 +// mov x19, v19.d[0] +// mov x4, v19.d[1] +// umulh x22, x16, x8 +// adds x12, x19, x14 +// adcs x19, x1, x22 +// adc x4, x4, xzr +// adds x3, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, xzr +// adds x5, x21, x19 +// adcs x21, x24, x4 +// adcs x24, x11, xzr +// adcs x11, x20, xzr +// adc x20, x7, xzr +// movi v3.2D, #0x00000000ffffffff +// uzp2 v16.4S, v1.4S, v1.4S +// xtn v25.2S, v17.2D +// xtn v23.2S, v1.2D +// rev64 v30.4S, v1.4S +// umull v24.2D, v25.2S, v23.2S +// umull v0.2D, v25.2S, v16.2S +// uzp2 v2.4S, v17.4S, v17.4S +// mul v30.4S, v30.4S, v17.4S +// usra v0.2D, v24.2D, #32 +// umull v19.2D, v2.2S, v16.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v0.16B, v3.16B +// umlal v24.2D, v2.2S, v23.2S +// shl v30.2D, v30.2D, #32 +// usra v19.2D, v0.2D, #32 +// umlal v30.2D, v25.2S, v23.2S +// usra v19.2D, v24.2D, #32 +// mov x7, v30.d[0] +// mov x1, v30.d[1] +// mul x14, x17, x13 +// mov x19, v19.d[0] +// mov x4, v19.d[1] +// umulh x22, x17, x13 +// adds x12, x19, x14 +// adcs x19, x1, x22 +// adc x4, x4, xzr +// adds x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, xzr +// adds x1, x7, x24 +// adcs x14, x12, x11 +// adcs x22, x19, x20 +// adc x12, x4, xzr +// ldp x19, x4, [x0] // @slothy:reads=buffer0 +// adds x19, x19, x10 +// adcs x4, x4, x3 +// stp x19, x4, [x0] // @slothy:writes=buffer0 +// ldp x19, x4, [x0, #16] // @slothy:reads=buffer16 +// adcs x19, x19, x5 +// adcs x4, x4, x21 +// stp x19, x4, [x0, #16] // @slothy:writes=buffer16 +// ldp x19, x4, [x0, #32] // @slothy:reads=buffer32 +// adcs x19, x19, x1 +// adcs x4, x4, x14 +// stp x19, x4, [x0, #32] // @slothy:writes=buffer32 +// ldp x19, x4, [x0, #48] // @slothy:reads=buffer48 +// adcs x19, x19, x22 +// adcs x4, x4, x12 +// stp x19, x4, [x0, #48] // @slothy:writes=buffer48 +// ldr x4, [x0, #64] +// adc x4, x4, xzr +// str x4, [x0, #64] +// movi v3.2D, #0x00000000ffffffff +// uzp2 v2.4S, v29.4S, v29.4S +// xtn v16.2S, v27.2D +// xtn v25.2S, v29.2D +// rev64 v30.4S, v29.4S +// umull v24.2D, v16.2S, v25.2S +// umull v23.2D, v16.2S, v2.2S +// uzp2 v0.4S, v27.4S, v27.4S +// mul v30.4S, v30.4S, v27.4S +// usra v23.2D, v24.2D, #32 +// umull v2.2D, v0.2S, v2.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v23.16B, v3.16B +// umlal v24.2D, v0.2S, v25.2S +// shl v30.2D, v30.2D, #32 +// usra v2.2D, v23.2D, #32 +// umlal v30.2D, v16.2S, v25.2S +// usra v2.2D, v24.2D, #32 +// mov x6, v30.d[0] +// mov x22, v30.d[1] +// mul x12, x17, x23 +// mul x19, x13, x2 +// mov x4, v2.d[0] +// adds x22, x22, x4 +// mov x4, v2.d[1] +// adcs x12, x12, x4 +// umulh x4, x17, x23 +// adcs x19, x19, x4 +// umulh x4, x13, x2 +// adc x4, x4, xzr +// adds x21, x22, x6 +// adcs x22, x12, x22 +// adcs x12, x19, x12 +// adcs x19, x4, x19 +// adc x4, xzr, x4 +// adds x24, x22, x6 +// adcs x11, x12, x21 +// adcs x20, x19, x22 +// adcs x1, x4, x12 +// adcs x14, xzr, x19 +// adc x7, xzr, x4 +// subs x4, x17, x13 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x2, x23 +// cneg 
x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x1, x1, x4 +// eor x4, x12, x19 +// adcs x14, x14, x4 +// adc x7, x7, x19 +// subs x4, x16, x8 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x15, x9 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x10, x21, x4 +// eor x4, x12, x19 +// adcs x24, x24, x4 +// adcs x11, x11, x19 +// adcs x20, x20, x19 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x8, x13 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x2, x15 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x20, x20, x4 +// eor x4, x12, x19 +// adcs x1, x1, x4 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x16, x17 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x23, x9 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x24, x24, x4 +// eor x4, x12, x19 +// adcs x11, x11, x4 +// adcs x20, x20, x19 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x16, x13 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x2, x9 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x11, x11, x4 +// eor x4, x12, x19 +// adcs x20, x20, x4 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x8, x17 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x23, x15 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x3, x11, x4 +// eor x4, x12, x19 +// adcs x5, x20, x4 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x22, x7, x19 +// ldp x12, x19, [x0] // @slothy:reads=buffer0 +// extr x4, x1, x5, #8 +// adds x11, x4, x12 +// extr x4, x14, x1, #8 +// adcs x20, x4, x19 +// ldp x19, x12, [x0, #16] // @slothy:reads=buffer16 +// extr x4, x22, x14, #8 +// adcs x7, x4, x19 +// and x19, x20, x7 +// lsr x4, x22, #8 +// adcs x1, x4, x12 +// and x22, x19, x1 +// ldp x19, x12, [x0, #32] // @slothy:reads=buffer32 +// lsl x4, x6, #1 +// adcs x14, x4, x19 +// and x19, x22, x14 +// extr x4, x10, x6, #63 +// adcs x21, x4, x12 +// and x22, x19, x21 +// ldp x19, x12, [x0, #48] // @slothy:reads=buffer48 +// extr x4, x24, x10, #63 +// adcs x2, x4, x19 +// and x19, x22, x2 +// extr x4, x3, x24, #63 +// adcs x24, x4, x12 +// and x12, x19, x24 +// ldr x19, [x0, #64] +// extr x4, x5, x3, #63 +// and x4, x4, #0x1ff +// adc x4, x19, x4 +// lsr x19, x4, #9 +// orr x4, x4, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x11, x19 +// adcs xzr, x12, xzr +// adcs xzr, x4, xzr +// adcs x11, x11, x19 +// adcs x20, x20, xzr +// adcs x7, x7, xzr +// adcs x1, x1, xzr +// adcs x14, x14, xzr +// adcs x22, x21, xzr +// adcs x12, x2, xzr +// adcs x24, x24, xzr +// adc x4, x4, xzr +// and x19, x4, #0x1ff +// lsl x4, x11, #9 +// extr x11, x20, x11, #55 +// extr x20, x7, x20, #55 +// extr x7, x1, x7, #55 +// extr x1, x14, x1, #55 +// orr x4, x19, x4 +// extr x14, x22, x14, #55 +// extr x22, x12, x22, #55 +// extr x12, x24, x12, #55 +// extr x19, x4, x24, #55 +// lsr x4, x4, #55 +// stp x11, x20, [x0] // @slothy:writes=buffer0 +// stp x7, x1, [x0, #16] // @slothy:writes=buffer16 +// stp x14, x22, [x0, #32] // @slothy:writes=buffer32 +// stp x12, x19, [x0, #48] // 
@slothy:writes=buffer48 +// str x4, [x0, #64] +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montsqr_p521_neon): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + +// The optimized body + + ldr q31, [x1, #48] + ldp x9, x15, [x1, #32] + ldp x23, x2, [x1, #48] + ldr q0, [x1, #48] + ldr q29, [x1, #32] + rev64 v21.4S, v31.4S + umulh x13, x9, x23 + mul v23.4S, v21.4S, v0.4S + xtn v21.2S, v0.2D + uzp2 v19.4S, v31.4S, v31.4S + xtn v2.2S, v29.2D + xtn v30.2S, v31.2D + uzp2 v3.4S, v29.4S, v29.4S + umull v6.2D, v21.2S, v19.2S + mul x10, x9, x23 + uaddlp v23.2D, v23.4S + umull v22.2D, v21.2S, v30.2S + adds x22, x10, x13 + mul x17, x9, x15 + movi v25.2D, #0x00000000ffffffff + uzp2 v1.4S, v0.4S, v0.4S + adc x8, x13, xzr + subs x19, x9, x15 + umull v28.2D, v3.2S, v2.2S + shl v31.2D, v23.2D, #32 + csetm x5, cc + cneg x3, x19, cc + umull v19.2D, v1.2S, v19.2S + ldr q4, [x1, #16] + subs x24, x2, x23 + mul x6, x15, x2 + usra v6.2D, v22.2D, #32 + ldr q23, [x1] + cneg x13, x24, cc + umulh x24, x15, x2 + umull v5.2D, v29.2S, v29.2S + rev64 v3.4S, v4.4S + cinv x19, x5, cc + adds x16, x22, x6 + mov x14, v28.d[1] + umlal v31.2D, v21.2S, v30.2S + umull2 v17.2D, v29.4S, v29.4S + mov x20, v28.d[0] + mul v29.4S, v3.4S, v23.4S + and v22.16B, v6.16B, v25.16B + mul x5, x3, x13 + mov x4, v5.d[1] + mov x7, v5.d[0] + adcs x11, x8, x24 + ldr q5, [x1] + ldr q0, [x1] + adc x22, x24, xzr + adds x8, x11, x6 + usra v19.2D, v6.2D, #32 + umlal v22.2D, v1.2S, v30.2S + adc x11, x22, xzr + adds x21, x7, x20, lsl #33 + mov x24, v17.d[1] + mov x22, v17.d[0] + lsr x12, x20, #31 + uzp1 v2.4S, v4.4S, v23.4S + uzp1 v20.4S, v23.4S, v23.4S + usra v19.2D, v22.2D, #32 + adc x4, x4, x12 + lsr x6, x14, #31 + adds x20, x22, x14, lsl #33 + ldr q17, [x1, #16] + uzp2 v22.4S, v0.4S, v0.4S + eor x12, x5, x19 + umulh x7, x3, x13 + xtn v23.2S, v0.2D + adc x5, x24, x6 + cmn x19, #0x1 + xtn v25.2S, v5.2D + ldr q27, [x1] + adcs x16, x16, x12 + uaddlp v1.2D, v29.4S + umulh x3, x9, x15 + eor x13, x7, x19 + adcs x24, x8, x13 + adc x11, x11, x19 + adds x12, x10, x10 + adcs x13, x16, x16 + mul x19, x23, x2 + umull v21.2D, v25.2S, v23.2S + adcs x7, x24, x24 + ldp x16, x8, [x1] + umull v3.2D, v25.2S, v22.2S + uzp2 v6.4S, v5.4S, v5.4S + adcs x10, x11, x11 + ldr q29, [x1, #32] + adc x14, xzr, xzr + adds x24, x4, x17, lsl #1 + mov x4, v31.d[1] + shl v30.2D, v1.2D, #32 + lsr x6, x3, #63 + extr x11, x3, x17, #63 + ldr q1, [x1, #16] + mov x22, v19.d[1] + adcs x20, x20, x11 + umulh x3, x23, x2 + movi v4.2D, #0x00000000ffffffff + usra v3.2D, v21.2D, #32 + adc x5, x5, x6 + adds x11, x12, x20 + mov x6, v19.d[0] + umull v19.2D, v6.2S, v22.2S + adcs x20, x13, x5 + 
rev64 v22.4S, v0.4S + ldr x5, [x1, #64] + ldp x17, x13, [x1, #16] + adcs x7, x7, xzr + umlal v30.2D, v20.2S, v2.2S + adcs x12, x10, xzr + and x1, x16, #0xfffffffffffff + mul v22.4S, v22.4S, v5.4S + adc x14, x14, xzr + adds x6, x6, x19 + xtn v5.2S, v1.2D + adcs x10, x4, x3 + mov x4, v31.d[0] + adc x22, x22, xzr + adds x19, x6, x19 + add x6, x5, x5 + and v21.16B, v3.16B, v4.16B + adcs x10, x10, x3 + extr x3, x8, x16, #52 + mul x1, x6, x1 + usra v19.2D, v3.2D, #32 + adc x22, x22, xzr + adds x7, x4, x7 + umlal v21.2D, v6.2S, v23.2S + and x4, x3, #0xfffffffffffff + adcs x3, x19, x12 + uzp2 v28.4S, v1.4S, v1.4S + extr x19, x17, x8, #40 + mul x12, x6, x4 + adcs x14, x10, x14 + rev64 v4.4S, v1.4S + mul x5, x5, x5 + lsr x4, x9, #4 + adc x10, x22, xzr + lsl x22, x1, #12 + lsr x1, x1, #52 + add x12, x12, x1 + and x1, x19, #0xfffffffffffff + extr x19, x12, x22, #12 + mul x1, x6, x1 + extr x22, x13, x17, #28 + adds x21, x21, x19 + mul v31.4S, v4.4S, v17.4S + and x19, x22, #0xfffffffffffff + lsr x22, x12, #52 + lsl x12, x12, #12 + mul x19, x6, x19 + add x22, x1, x22 + extr x1, x22, x12, #24 + and x4, x4, #0xfffffffffffff + adcs x12, x24, x1 + extr x1, x9, x13, #16 + mul x24, x6, x4 + and x1, x1, #0xfffffffffffff + lsr x4, x22, #52 + add x4, x19, x4 + lsl x22, x22, #12 + mul x1, x6, x1 + extr x22, x4, x22, #36 + adcs x11, x11, x22 + extr x22, x11, x12, #9 + extr x19, x12, x21, #9 + uaddlp v3.2D, v22.4S + lsl x12, x4, #12 + stp x19, x22, [x0] + umulh x19, x16, x17 + uaddlp v31.2D, v31.4S + lsr x22, x4, #52 + extr x4, x15, x9, #56 + usra v19.2D, v21.2D, #32 + add x22, x1, x22 + extr x1, x23, x15, #44 + shl v4.2D, v31.2D, #32 + extr x12, x22, x12, #48 + and x4, x4, #0xfffffffffffff + uzp2 v7.4S, v17.4S, v17.4S + adcs x20, x20, x12 + xtn v17.2S, v17.2D + lsl x12, x22, #12 + lsr x22, x22, #52 + mul x4, x6, x4 + add x22, x24, x22 + and x24, x1, #0xfffffffffffff + extr x1, x2, x23, #32 + extr x12, x22, x12, #60 + lsl x12, x12, #8 + lsr x22, x22, #52 + mul x24, x6, x24 + add x4, x4, x22 + and x22, x1, #0xfffffffffffff + extr x12, x4, x12, #8 + lsl x1, x4, #12 + lsr x4, x4, #52 + adcs x7, x7, x12 + mul x12, x6, x22 + add x24, x24, x4 + extr x1, x24, x1, #20 + extr x22, x20, x11, #9 + extr x20, x7, x20, #9 + lsr x11, x2, #20 + mul x6, x6, x11 + lsr x4, x24, #52 + add x4, x12, x4 + lsl x12, x24, #12 + adcs x3, x3, x1 + extr x24, x4, x12, #32 + lsr x11, x4, #52 + adcs x12, x14, x24 + umull v31.2D, v17.2S, v28.2S + add x24, x6, x11 + lsl x1, x4, #12 + extr x7, x3, x7, #9 + rev64 v6.4S, v29.4S + umull v22.2D, v17.2S, v5.2S + extr x11, x12, x3, #9 + extr x14, x24, x1, #44 + umlal v4.2D, v17.2S, v5.2S + adcs x3, x10, x14 + umulh x10, x8, x13 + lsr x14, x24, #44 + adc x24, x5, x14 + subs x5, x16, x8 + stp x22, x20, [x0, #16] + csetm x1, cc + shl v21.2D, v3.2D, #32 + movi v17.2D, #0x00000000ffffffff + cneg x20, x5, cc + subs x5, x13, x17 + usra v31.2D, v22.2D, #32 + cneg x14, x5, cc + lsr x6, x24, #9 + and x22, x21, #0x1ff + mov x4, v30.d[0] + add x6, x22, x6 + stp x7, x11, [x0, #32] + umulh x22, x20, x14 + mov x5, v30.d[1] + str x6, [x0, #64] + extr x12, x3, x12, #9 + umull v28.2D, v7.2S, v28.2S + mul x11, x20, x14 + mul v6.4S, v6.4S, v27.4S + and v1.16B, v31.16B, v17.16B + cinv x21, x1, cc + adds x6, x4, x19 + uzp2 v22.4S, v27.4S, v27.4S + adc x20, x19, xzr + adds x6, x6, x5 + umlal v1.2D, v7.2S, v5.2S + xtn v20.2S, v29.2D + eor x22, x22, x21 + adcs x7, x20, x10 + usra v28.2D, v31.2D, #32 + eor x20, x11, x21 + usra v28.2D, v1.2D, #32 + xtn v0.2S, v27.2D + adc x10, x10, xzr + adds x1, x7, x5 + umlal v21.2D, v25.2S, 
v23.2S + uzp2 v29.4S, v29.4S, v29.4S + adc x19, x10, xzr + cmn x21, #0x1 + umull v3.2D, v0.2S, v20.2S + adcs x5, x6, x20 + extr x10, x24, x3, #9 + umull v31.2D, v0.2S, v29.2S + adcs x1, x1, x22 + stp x12, x10, [x0, #48] + mul x24, x16, x8 + mov x3, v28.d[1] + usra v31.2D, v3.2D, #32 + adc x10, x19, x21 + adds x7, x4, x4 + umulh x14, x16, x8 + uaddlp v3.2D, v6.4S + mov x4, v28.d[0] + adcs x12, x5, x5 + mov x5, v19.d[0] + movi v23.2D, #0x00000000ffffffff + adcs x20, x1, x1 + mov x19, v21.d[1] + mov x1, v19.d[1] + adcs x22, x10, x10 + and v17.16B, v31.16B, v23.16B + adc x6, xzr, xzr + umlal v17.2D, v22.2S, v20.2S + adds x10, x5, x24 + mul x11, x17, x13 + mov x5, v21.d[0] + umull v28.2D, v22.2S, v29.2S + adcs x19, x19, x14 + shl v5.2D, v3.2D, #32 + adc x21, x1, xzr + adds x10, x10, x24 + adcs x1, x19, x14 + umulh x14, x17, x13 + adc x19, x21, xzr + adds x7, x7, x1 + adcs x1, x12, x19 + adcs x24, x20, xzr + mov x20, v4.d[1] + usra v28.2D, v31.2D, #32 + mov x21, v4.d[0] + adcs x19, x22, xzr + adc x6, x6, xzr + adds x4, x4, x11 + adcs x20, x20, x14 + adc x22, x3, xzr + adds x12, x4, x11 + umulh x11, x13, x2 + adcs x3, x20, x14 + adc x20, x22, xzr + adds x21, x21, x24 + ldp x22, x24, [x0] + adcs x4, x12, x19 + ldp x19, x14, [x0, #16] + usra v28.2D, v17.2D, #32 + adcs x3, x3, x6 + umlal v5.2D, v0.2S, v20.2S + adc x6, x20, xzr + umulh x20, x17, x23 + adds x12, x22, x5 + ldp x22, x5, [x0, #32] + adcs x10, x24, x10 + adcs x19, x19, x7 + stp x12, x10, [x0] + ldp x12, x7, [x0, #48] + adcs x10, x14, x1 + mul x14, x13, x2 + ldr x24, [x0, #64] + adcs x22, x22, x21 + adcs x5, x5, x4 + mov x21, v28.d[1] + stp x22, x5, [x0, #32] + mul x1, x17, x23 + adcs x3, x12, x3 + mov x4, v28.d[0] + mov x12, v5.d[1] + stp x19, x10, [x0, #16] + adcs x19, x7, x6 + mov x6, v5.d[0] + adc x10, x24, xzr + subs x7, x16, x8 + cneg x5, x7, cc + csetm x24, cc + subs x7, x15, x9 + cneg x22, x7, cc + cinv x7, x24, cc + adds x12, x12, x4 + umulh x4, x5, x22 + adcs x1, x1, x21 + stp x3, x19, [x0, #48] + str x10, [x0, #64] + adcs x20, x14, x20 + adc x21, x11, xzr + subs x14, x17, x13 + cneg x10, x14, cc + csetm x3, cc + subs x19, x2, x23 + cneg x19, x19, cc + cinv x11, x3, cc + adds x14, x12, x6 + mul x24, x5, x22 + adcs x22, x1, x12 + eor x3, x4, x7 + mul x4, x10, x19 + adcs x1, x20, x1 + adcs x12, x21, x20 + adc x5, xzr, x21 + umulh x19, x10, x19 + adds x20, x22, x6 + eor x10, x24, x7 + adcs x21, x1, x14 + eor x24, x4, x11 + adcs x4, x12, x22 + adcs x1, x5, x1 + adcs x12, xzr, x12 + adc x22, xzr, x5 + eor x5, x19, x11 + cmn x11, #0x1 + adcs x19, x1, x24 + adcs x5, x12, x5 + adc x24, x22, x11 + subs x1, x8, x13 + cneg x22, x1, cc + csetm x1, cc + subs x11, x2, x15 + cinv x1, x1, cc + cneg x12, x11, cc + cmn x7, #0x1 + adcs x10, x14, x10 + mul x14, x22, x12 + adcs x20, x20, x3 + eor x11, x14, x1 + adcs x3, x21, x7 + umulh x21, x22, x12 + adcs x22, x4, x7 + adcs x4, x19, x7 + adcs x12, x5, x7 + adc x7, x24, x7 + subs x14, x16, x17 + csetm x5, cc + cneg x19, x14, cc + subs x24, x23, x9 + cneg x14, x24, cc + cinv x5, x5, cc + cmn x1, #0x1 + mul x24, x19, x14 + adcs x22, x22, x11 + eor x11, x21, x1 + eor x24, x24, x5 + umulh x19, x19, x14 + adcs x4, x4, x11 + adcs x14, x12, x1 + adc x1, x7, x1 + subs x17, x8, x17 + cneg x12, x17, cc + csetm x17, cc + subs x16, x16, x13 + cneg x11, x16, cc + csetm x16, cc + subs x23, x23, x15 + cinv x7, x17, cc + cneg x13, x23, cc + mul x15, x12, x13 + subs x23, x2, x9 + cinv x8, x16, cc + cneg x17, x23, cc + eor x16, x19, x5 + mul x23, x11, x17 + cmn x5, #0x1 + adcs x20, x20, x24 + eor x15, x15, x7 + adcs 
x3, x3, x16 + adcs x2, x22, x5 + umulh x16, x11, x17 + adcs x19, x4, x5 + ldp x4, x22, [x0, #48] + extr x21, x10, x6, #63 + adcs x24, x14, x5 + eor x23, x23, x8 + adc x1, x1, x5 + cmn x8, #0x1 + umulh x9, x12, x13 + eor x14, x16, x8 + adcs x3, x3, x23 + ldp x11, x5, [x0, #16] + ldp x13, x16, [x0] + adcs x23, x2, x14 + adcs x14, x19, x8 + extr x19, x20, x10, #63 + lsl x12, x6, #1 + adcs x17, x24, x8 + adc x1, x1, x8 + cmn x7, #0x1 + adcs x24, x3, x15 + eor x9, x9, x7 + ldp x15, x3, [x0, #32] + adcs x9, x23, x9 + ldr x8, [x0, #64] + extr x20, x24, x20, #63 + adcs x23, x14, x7 + extr x2, x9, x24, #63 + adcs x14, x17, x7 + and x24, x2, #0x1ff + extr x9, x23, x9, #8 + extr x6, x14, x23, #8 + adc x23, x1, x7 + adds x10, x9, x13 + adcs x13, x6, x16 + extr x1, x23, x14, #8 + lsr x23, x23, #8 + adcs x7, x1, x11 + adcs x2, x23, x5 + and x23, x13, x7 + adcs x16, x12, x15 + and x23, x23, x2 + adcs x14, x21, x3 + and x23, x23, x16 + adcs x5, x19, x4 + and x23, x23, x14 + adcs x22, x20, x22 + and x23, x23, x5 + and x1, x23, x22 + adc x9, x8, x24 + lsr x23, x9, #9 + cmp xzr, xzr + orr x17, x9, #0xfffffffffffffe00 + adcs xzr, x10, x23 + adcs xzr, x1, xzr + adcs xzr, x17, xzr + adcs x23, x10, x23 + adcs x9, x13, xzr + lsl x4, x23, #9 + adcs x1, x7, xzr + extr x23, x9, x23, #55 + extr x9, x1, x9, #55 + adcs x10, x2, xzr + extr x1, x10, x1, #55 + stp x23, x9, [x0] + adcs x19, x16, xzr + adcs x9, x14, xzr + extr x23, x19, x10, #55 + adcs x10, x5, xzr + stp x1, x23, [x0, #16] + extr x5, x9, x19, #55 + adcs x1, x22, xzr + extr x23, x10, x9, #55 + adc x9, x17, xzr + stp x5, x23, [x0, #32] + extr x10, x1, x10, #55 + and x23, x9, #0x1ff + orr x23, x23, x4 + extr x9, x23, x1, #55 + lsr x23, x23, #55 + stp x10, x9, [x0, #48] + str x23, [x0, #64] + +// Restore regs and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif From 46c80a466b5d394d475619fa551806ec582f373c Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Thu, 20 Jun 2024 11:03:38 -0500 Subject: [PATCH 24/24] Add `bignum_{sqr,mul}_p521_neon` This adds `bignum_{sqr,mul}_p521_neon` and their proofs. The new subroutine specs are added to specification.txt, and test as well as benchmark are updated. 
Benchmark results on GV2 are: ``` bignum_mul_p521 : 135.1 ns each (var 0.2%, corr -0.01) = 7404184 ops/sec bignum_mul_p521_neon : 115.5 ns each (var 0.3%, corr 0.00) = 8660108 ops/sec bignum_sqr_p521 : 108.9 ns each (var 0.2%, corr 0.08) = 9184994 ops/sec bignum_sqr_p521_neon : 78.7 ns each (var 0.3%, corr 0.06) = 12708368 ops/sec ``` s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/02df8e4d8f57473d0f1d40def0ac14f04fc59e52 --- arm/p521/Makefile | 2 + arm/p521/bignum_montsqr_p521_neon.S | 2 +- arm/p521/bignum_mul_p521_neon.S | 1402 +++++++++++++++++++++++++++ arm/p521/bignum_sqr_p521_neon.S | 1121 +++++++++++++++++++++ 4 files changed, 2526 insertions(+), 1 deletion(-) create mode 100644 arm/p521/bignum_mul_p521_neon.S create mode 100644 arm/p521/bignum_sqr_p521_neon.S diff --git a/arm/p521/Makefile b/arm/p521/Makefile index 64db072532..3e5e0e855c 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -38,10 +38,12 @@ OBJ = bignum_add_p521.o \ bignum_montsqr_p521_neon.o \ bignum_mul_p521.o \ bignum_mul_p521_alt.o \ + bignum_mul_p521_neon.o \ bignum_neg_p521.o \ bignum_optneg_p521.o \ bignum_sqr_p521.o \ bignum_sqr_p521_alt.o \ + bignum_sqr_p521_neon.o \ bignum_sub_p521.o \ bignum_tolebytes_p521.o \ bignum_tomont_p521.o \ diff --git a/arm/p521/bignum_montsqr_p521_neon.S b/arm/p521/bignum_montsqr_p521_neon.S index c4d1173165..57cf911615 100644 --- a/arm/p521/bignum_montsqr_p521_neon.S +++ b/arm/p521/bignum_montsqr_p521_neon.S @@ -562,7 +562,7 @@ // The bash script used for step 2 is as follows: // // # Store the assembly instructions except the last 'ret', -// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. +// # callee-register store/loads as, say, 'input.S'. // export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" // export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" // /tools/external/slothy.sh input.S my_out_dir diff --git a/arm/p521/bignum_mul_p521_neon.S b/arm/p521/bignum_mul_p521_neon.S new file mode 100644 index 0000000000..c9d34151d5 --- /dev/null +++ b/arm/p521/bignum_mul_p521_neon.S @@ -0,0 +1,1402 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced +// Inputs x[9], y[9]; output z[9] +// +// extern void bignum_mul_p521_neon +// (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_mul_p521_neon is functionally equivalent to bignum_mul_p521. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// stp x25, x26, [sp, #-16]! 
+// sub sp, sp, #80 +// ldp x15, x21, [x1] +// ldp x10, x17, [x1, #16] +// ldp x13, x16, [x2] +// ldr q18, [x1] +// ldr q28, [x2] +// ldp x5, x20, [x2, #16] +// movi v16.2D, #0x00000000ffffffff +// uzp2 v7.4S, v28.4S, v28.4S +// xtn v4.2S, v18.2D +// xtn v1.2S, v28.2D +// rev64 v27.4S, v28.4S +// umull v21.2D, v4.2S, v1.2S +// umull v28.2D, v4.2S, v7.2S +// uzp2 v5.4S, v18.4S, v18.4S +// mul v18.4S, v27.4S, v18.4S +// usra v28.2D, v21.2D, #32 +// umull v29.2D, v5.2S, v7.2S +// uaddlp v18.2D, v18.4S +// and v16.16B, v28.16B, v16.16B +// umlal v16.2D, v5.2S, v1.2S +// shl v18.2D, v18.2D, #32 +// usra v29.2D, v28.2D, #32 +// umlal v18.2D, v4.2S, v1.2S +// usra v29.2D, v16.2D, #32 +// mov x8, v18.d[0] +// mov x9, v18.d[1] +// mul x6, x10, x5 +// mul x19, x17, x20 +// mov x14, v29.d[0] +// adds x9, x9, x14 +// mov x14, v29.d[1] +// adcs x6, x6, x14 +// umulh x14, x10, x5 +// adcs x19, x19, x14 +// umulh x14, x17, x20 +// adc x14, x14, xzr +// adds x11, x9, x8 +// adcs x9, x6, x9 +// adcs x6, x19, x6 +// adcs x19, x14, x19 +// adc x14, xzr, x14 +// adds x3, x9, x8 +// adcs x24, x6, x11 +// adcs x9, x19, x9 +// adcs x6, x14, x6 +// adcs x19, xzr, x19 +// adc x14, xzr, x14 +// subs x4, x10, x17 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x20, x5 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x6, x6, x23 +// eor x4, x4, x7 +// adcs x19, x19, x4 +// adc x14, x14, x7 +// subs x4, x15, x21 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x16, x13 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x11, x11, x23 +// eor x4, x4, x7 +// adcs x3, x3, x4 +// adcs x24, x24, x7 +// adcs x9, x9, x7 +// adcs x6, x6, x7 +// adcs x19, x19, x7 +// adc x14, x14, x7 +// subs x4, x21, x17 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x20, x16 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x9, x9, x23 +// eor x4, x4, x7 +// adcs x6, x6, x4 +// adcs x19, x19, x7 +// adc x14, x14, x7 +// subs x4, x15, x10 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x5, x13 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x3, x3, x23 +// eor x4, x4, x7 +// adcs x24, x24, x4 +// adcs x9, x9, x7 +// adcs x6, x6, x7 +// adcs x19, x19, x7 +// adc x14, x14, x7 +// subs x17, x15, x17 +// cneg x17, x17, cc +// csetm x4, cc +// subs x13, x20, x13 +// cneg x13, x13, cc +// mul x20, x17, x13 +// umulh x17, x17, x13 +// cinv x13, x4, cc +// cmn x13, #0x1 +// eor x20, x20, x13 +// adcs x20, x24, x20 +// eor x17, x17, x13 +// adcs x17, x9, x17 +// adcs x9, x6, x13 +// adcs x6, x19, x13 +// adc x13, x14, x13 +// subs x21, x21, x10 +// cneg x21, x21, cc +// csetm x10, cc +// subs x16, x5, x16 +// cneg x16, x16, cc +// mul x5, x21, x16 +// umulh x21, x21, x16 +// cinv x10, x10, cc +// cmn x10, #0x1 +// eor x16, x5, x10 +// adcs x16, x20, x16 +// eor x21, x21, x10 +// adcs x21, x17, x21 +// adcs x17, x9, x10 +// adcs x5, x6, x10 +// adc x10, x13, x10 +// lsl x13, x8, #9 +// extr x20, x11, x8, #55 +// extr x8, x3, x11, #55 +// extr x9, x16, x3, #55 +// lsr x16, x16, #55 +// stp x21, x17, [sp] // @slothy:writes=stack0 +// stp x5, x10, [sp, #16] // @slothy:writes=stack16 +// stp x13, x20, [sp, #32] // @slothy:writes=stack32 +// stp x8, x9, [sp, #48] // @slothy:writes=stack48 +// str x16, [sp, #64] // 
@slothy:writes=stack64 +// ldp x21, x10, [x1, #32] +// ldp x17, x13, [x1, #48] +// ldp x16, x5, [x2, #32] +// ldr q18, [x1, #32] +// ldr q28, [x2, #32] +// ldp x20, x8, [x2, #48] +// movi v16.2D, #0x00000000ffffffff +// uzp2 v7.4S, v28.4S, v28.4S +// xtn v4.2S, v18.2D +// xtn v1.2S, v28.2D +// rev64 v28.4S, v28.4S +// umull v27.2D, v4.2S, v1.2S +// umull v29.2D, v4.2S, v7.2S +// uzp2 v21.4S, v18.4S, v18.4S +// mul v28.4S, v28.4S, v18.4S +// usra v29.2D, v27.2D, #32 +// umull v18.2D, v21.2S, v7.2S +// uaddlp v28.2D, v28.4S +// and v16.16B, v29.16B, v16.16B +// umlal v16.2D, v21.2S, v1.2S +// shl v28.2D, v28.2D, #32 +// usra v18.2D, v29.2D, #32 +// umlal v28.2D, v4.2S, v1.2S +// usra v18.2D, v16.2D, #32 +// mov x9, v28.d[0] +// mov x6, v28.d[1] +// mul x19, x17, x20 +// mul x14, x13, x8 +// mov x11, v18.d[0] +// adds x6, x6, x11 +// mov x11, v18.d[1] +// adcs x19, x19, x11 +// umulh x11, x17, x20 +// adcs x14, x14, x11 +// umulh x11, x13, x8 +// adc x11, x11, xzr +// adds x3, x6, x9 +// adcs x6, x19, x6 +// adcs x19, x14, x19 +// adcs x14, x11, x14 +// adc x11, xzr, x11 +// adds x24, x6, x9 +// adcs x4, x19, x3 +// adcs x6, x14, x6 +// adcs x19, x11, x19 +// adcs x14, xzr, x14 +// adc x11, xzr, x11 +// subs x7, x17, x13 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x8, x20 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x19, x19, x22 +// eor x7, x7, x23 +// adcs x14, x14, x7 +// adc x11, x11, x23 +// subs x7, x21, x10 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x5, x16 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x3, x3, x22 +// eor x7, x7, x23 +// adcs x24, x24, x7 +// adcs x4, x4, x23 +// adcs x6, x6, x23 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x10, x13 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x8, x5 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x6, x6, x22 +// eor x7, x7, x23 +// adcs x19, x19, x7 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x21, x17 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x20, x16 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x24, x24, x22 +// eor x7, x7, x23 +// adcs x4, x4, x7 +// adcs x6, x6, x23 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x21, x13 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x8, x16 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x4, x4, x22 +// eor x7, x7, x23 +// adcs x6, x6, x7 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x10, x17 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x20, x5 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x4, x4, x22 +// eor x7, x7, x23 +// adcs x6, x6, x7 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// ldp x7, x23, [sp] // @slothy:reads=stack0 +// adds x9, x9, x7 +// adcs x3, x3, x23 +// stp x9, x3, [sp] // @slothy:writes=stack0 +// ldp x9, x3, [sp, #16] // @slothy:reads=stack16 +// adcs x9, x24, x9 +// adcs x3, x4, x3 +// stp x9, x3, [sp, #16] // @slothy:writes=stack16 +// ldp x9, x3, [sp, #32] // 
@slothy:reads=stack32 +// adcs x9, x6, x9 +// adcs x6, x19, x3 +// stp x9, x6, [sp, #32] // @slothy:writes=stack32 +// ldp x9, x6, [sp, #48] // @slothy:reads=stack48 +// adcs x9, x14, x9 +// adcs x6, x11, x6 +// stp x9, x6, [sp, #48] // @slothy:writes=stack48 +// ldr x9, [sp, #64] // @slothy:reads=stack64 +// adc x9, x9, xzr +// str x9, [sp, #64] // @slothy:writes=stack64 +// ldp x9, x6, [x1] +// subs x21, x21, x9 +// sbcs x10, x10, x6 +// ldp x9, x6, [x1, #16] +// sbcs x17, x17, x9 +// sbcs x13, x13, x6 +// csetm x9, cc +// ldp x6, x19, [x2] +// subs x16, x6, x16 +// sbcs x5, x19, x5 +// ldp x6, x19, [x2, #16] +// sbcs x20, x6, x20 +// sbcs x8, x19, x8 +// csetm x6, cc +// eor x21, x21, x9 +// subs x21, x21, x9 +// eor x10, x10, x9 +// sbcs x10, x10, x9 +// eor x17, x17, x9 +// sbcs x17, x17, x9 +// eor x13, x13, x9 +// sbc x13, x13, x9 +// eor x16, x16, x6 +// subs x16, x16, x6 +// eor x5, x5, x6 +// sbcs x5, x5, x6 +// eor x20, x20, x6 +// sbcs x20, x20, x6 +// eor x8, x8, x6 +// sbc x8, x8, x6 +// eor x9, x6, x9 +// mul x6, x21, x16 +// mul x19, x10, x5 +// mul x14, x17, x20 +// mul x11, x13, x8 +// umulh x3, x21, x16 +// adds x19, x19, x3 +// umulh x3, x10, x5 +// adcs x14, x14, x3 +// umulh x3, x17, x20 +// adcs x11, x11, x3 +// umulh x3, x13, x8 +// adc x3, x3, xzr +// adds x24, x19, x6 +// adcs x19, x14, x19 +// adcs x14, x11, x14 +// adcs x11, x3, x11 +// adc x3, xzr, x3 +// adds x4, x19, x6 +// adcs x7, x14, x24 +// adcs x19, x11, x19 +// adcs x14, x3, x14 +// adcs x11, xzr, x11 +// adc x3, xzr, x3 +// subs x23, x17, x13 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x8, x20 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x14, x14, x12 +// eor x23, x23, x22 +// adcs x11, x11, x23 +// adc x3, x3, x22 +// subs x23, x21, x10 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x5, x16 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x24, x24, x12 +// eor x23, x23, x22 +// adcs x4, x4, x23 +// adcs x7, x7, x22 +// adcs x19, x19, x22 +// adcs x14, x14, x22 +// adcs x11, x11, x22 +// adc x3, x3, x22 +// subs x23, x10, x13 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x8, x5 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x19, x19, x12 +// eor x23, x23, x22 +// adcs x14, x14, x23 +// adcs x11, x11, x22 +// adc x3, x3, x22 +// subs x23, x21, x17 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x20, x16 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x4, x4, x12 +// eor x23, x23, x22 +// adcs x7, x7, x23 +// adcs x19, x19, x22 +// adcs x14, x14, x22 +// adcs x11, x11, x22 +// adc x3, x3, x22 +// subs x21, x21, x13 +// cneg x21, x21, cc +// csetm x13, cc +// subs x16, x8, x16 +// cneg x16, x16, cc +// mul x8, x21, x16 +// umulh x21, x21, x16 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x16, x8, x13 +// adcs x16, x7, x16 +// eor x21, x21, x13 +// adcs x21, x19, x21 +// adcs x8, x14, x13 +// adcs x19, x11, x13 +// adc x13, x3, x13 +// subs x10, x10, x17 +// cneg x10, x10, cc +// csetm x17, cc +// subs x5, x20, x5 +// cneg x5, x5, cc +// mul x20, x10, x5 +// umulh x10, x10, x5 +// cinv x17, x17, cc +// cmn x17, #0x1 +// eor x5, x20, x17 +// adcs x16, x16, x5 +// eor x10, x10, x17 +// adcs x21, x21, x10 +// adcs 
x10, x8, x17 +// adcs x5, x19, x17 +// adc x17, x13, x17 +// ldp x13, x20, [sp] // @slothy:reads=stack0 +// ldp x8, x19, [sp, #16] // @slothy:reads=stack16 +// eor x6, x6, x9 +// adds x6, x6, x13 +// eor x14, x24, x9 +// adcs x14, x14, x20 +// eor x11, x4, x9 +// adcs x11, x11, x8 +// eor x16, x16, x9 +// adcs x16, x16, x19 +// eor x21, x21, x9 +// ldp x3, x24, [sp, #32] // @slothy:reads=stack32 +// ldp x4, x7, [sp, #48] // @slothy:reads=stack48 +// ldr x23, [sp, #64] // @slothy:reads=stack64 +// adcs x21, x21, x3 +// eor x10, x10, x9 +// adcs x10, x10, x24 +// eor x5, x5, x9 +// adcs x5, x5, x4 +// eor x17, x17, x9 +// adcs x17, x17, x7 +// adc x22, x23, xzr +// adds x21, x21, x13 +// adcs x10, x10, x20 +// adcs x13, x5, x8 +// adcs x17, x17, x19 +// and x5, x9, #0x1ff +// lsl x20, x6, #9 +// orr x5, x20, x5 +// adcs x5, x3, x5 +// extr x20, x14, x6, #55 +// adcs x20, x24, x20 +// extr x8, x11, x14, #55 +// adcs x8, x4, x8 +// extr x9, x16, x11, #55 +// adcs x9, x7, x9 +// lsr x16, x16, #55 +// adc x16, x16, x23 +// ldr x6, [x2, #64] +// ldp x19, x14, [x1] +// and x11, x19, #0xfffffffffffff +// mul x11, x6, x11 +// ldr x3, [x1, #64] +// ldp x24, x4, [x2] +// and x7, x24, #0xfffffffffffff +// mul x7, x3, x7 +// add x11, x11, x7 +// extr x19, x14, x19, #52 +// and x19, x19, #0xfffffffffffff +// mul x19, x6, x19 +// extr x24, x4, x24, #52 +// and x24, x24, #0xfffffffffffff +// mul x24, x3, x24 +// add x19, x19, x24 +// lsr x24, x11, #52 +// add x19, x19, x24 +// lsl x11, x11, #12 +// extr x11, x19, x11, #12 +// adds x21, x21, x11 +// ldp x11, x24, [x1, #16] +// ldp x7, x23, [x2, #16] +// extr x14, x11, x14, #40 +// and x14, x14, #0xfffffffffffff +// mul x14, x6, x14 +// extr x4, x7, x4, #40 +// and x4, x4, #0xfffffffffffff +// mul x4, x3, x4 +// add x14, x14, x4 +// lsr x4, x19, #52 +// add x14, x14, x4 +// lsl x19, x19, #12 +// extr x19, x14, x19, #24 +// adcs x10, x10, x19 +// extr x19, x24, x11, #28 +// and x19, x19, #0xfffffffffffff +// mul x19, x6, x19 +// extr x11, x23, x7, #28 +// and x11, x11, #0xfffffffffffff +// mul x11, x3, x11 +// add x19, x19, x11 +// lsr x11, x14, #52 +// add x19, x19, x11 +// lsl x14, x14, #12 +// extr x14, x19, x14, #36 +// adcs x13, x13, x14 +// and x14, x10, x13 +// ldp x11, x4, [x1, #32] +// ldp x7, x12, [x2, #32] +// extr x24, x11, x24, #16 +// and x24, x24, #0xfffffffffffff +// mul x24, x6, x24 +// extr x23, x7, x23, #16 +// and x23, x23, #0xfffffffffffff +// mul x23, x3, x23 +// add x24, x24, x23 +// lsl x23, x22, #48 +// add x24, x24, x23 +// lsr x23, x19, #52 +// add x24, x24, x23 +// lsl x19, x19, #12 +// extr x19, x24, x19, #48 +// adcs x17, x17, x19 +// and x19, x14, x17 +// lsr x14, x11, #4 +// and x14, x14, #0xfffffffffffff +// mul x14, x6, x14 +// lsr x23, x7, #4 +// and x23, x23, #0xfffffffffffff +// mul x23, x3, x23 +// add x14, x14, x23 +// lsr x23, x24, #52 +// add x14, x14, x23 +// lsl x24, x24, #12 +// extr x24, x14, x24, #60 +// extr x11, x4, x11, #56 +// and x11, x11, #0xfffffffffffff +// mul x11, x6, x11 +// extr x7, x12, x7, #56 +// and x7, x7, #0xfffffffffffff +// mul x7, x3, x7 +// add x11, x11, x7 +// lsr x14, x14, #52 +// add x14, x11, x14 +// lsl x11, x24, #8 +// extr x11, x14, x11, #8 +// adcs x5, x5, x11 +// and x19, x19, x5 +// ldp x11, x24, [x1, #48] +// ldp x2, x7, [x2, #48] +// extr x4, x11, x4, #44 +// and x4, x4, #0xfffffffffffff +// mul x4, x6, x4 +// extr x23, x2, x12, #44 +// and x23, x23, #0xfffffffffffff +// mul x23, x3, x23 +// add x4, x4, x23 +// lsr x23, x14, #52 +// add x4, x4, x23 +// lsl x14, x14, #12 +// extr 
x14, x4, x14, #20 +// adcs x20, x20, x14 +// and x19, x19, x20 +// extr x14, x24, x11, #32 +// and x14, x14, #0xfffffffffffff +// mul x14, x6, x14 +// extr x2, x7, x2, #32 +// and x2, x2, #0xfffffffffffff +// mul x2, x3, x2 +// add x2, x14, x2 +// lsr x14, x4, #52 +// add x2, x2, x14 +// lsl x14, x4, #12 +// extr x14, x2, x14, #32 +// adcs x8, x8, x14 +// and x19, x19, x8 +// lsr x14, x24, #20 +// mul x14, x6, x14 +// lsr x11, x7, #20 +// mul x11, x3, x11 +// add x14, x14, x11 +// lsr x11, x2, #52 +// add x14, x14, x11 +// lsl x2, x2, #12 +// extr x2, x14, x2, #44 +// adcs x9, x9, x2 +// and x2, x19, x9 +// mul x6, x6, x3 +// lsr x19, x14, #44 +// add x6, x6, x19 +// adc x16, x16, x6 +// lsr x6, x16, #9 +// orr x16, x16, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x21, x6 +// adcs xzr, x2, xzr +// adcs xzr, x16, xzr +// adcs x21, x21, x6 +// adcs x10, x10, xzr +// adcs x13, x13, xzr +// adcs x17, x17, xzr +// adcs x5, x5, xzr +// adcs x20, x20, xzr +// adcs x8, x8, xzr +// adcs x9, x9, xzr +// adc x16, x16, xzr +// and x2, x21, #0x1ff +// extr x21, x10, x21, #9 +// extr x10, x13, x10, #9 +// stp x21, x10, [x0] // @slothy:writes=buffer0 +// extr x21, x17, x13, #9 +// extr x10, x5, x17, #9 +// stp x21, x10, [x0, #16] // @slothy:writes=buffer16 +// extr x21, x20, x5, #9 +// extr x10, x8, x20, #9 +// stp x21, x10, [x0, #32] // @slothy:writes=buffer32 +// extr x21, x9, x8, #9 +// extr x10, x16, x9, #9 +// stp x21, x10, [x0, #48] // @slothy:writes=buffer48 +// str x2, [x0, #64] // @slothy:writes=buffer64 +// add sp, sp, #80 +// ldp x25, x26, [sp], #16 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_mul_p521_neon): + +// Save registers and make space for the temporary buffer + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! 
+ sub sp, sp, #80 + + ldr q6, [x2] + ldp x10, x17, [x1, #16] + ldr q4, [x1] + ldr q16, [x2, #32] + ldp x5, x20, [x2, #16] + ldr q2, [x1, #32] + movi v31.2D, #0x00000000ffffffff + uzp2 v17.4S, v6.4S, v6.4S + rev64 v7.4S, v6.4S + ldp x15, x21, [x1] + xtn v25.2S, v6.2D + xtn v22.2S, v4.2D + subs x14, x10, x17 + mul v7.4S, v7.4S, v4.4S + csetm x8, cc + rev64 v3.4S, v16.4S + xtn v1.2S, v16.2D + ldp x13, x16, [x2] + mul x26, x10, x5 + uzp2 v16.4S, v16.4S, v16.4S + uaddlp v26.2D, v7.4S + cneg x4, x14, cc + subs x24, x15, x21 + xtn v5.2S, v2.2D + mul v28.4S, v3.4S, v2.4S + shl v26.2D, v26.2D, #32 + mul x22, x17, x20 + umull v20.2D, v22.2S, v25.2S + uzp2 v6.4S, v4.4S, v4.4S + umull v18.2D, v22.2S, v17.2S + uzp2 v4.4S, v2.4S, v2.4S + cneg x14, x24, cc + csetm x7, cc + umulh x11, x17, x20 + usra v18.2D, v20.2D, #32 + uaddlp v7.2D, v28.4S + subs x19, x16, x13 + umlal v26.2D, v22.2S, v25.2S + cneg x19, x19, cc + shl v28.2D, v7.2D, #32 + umull v7.2D, v5.2S, v1.2S + umull v30.2D, v5.2S, v16.2S + cinv x6, x7, cc + mul x25, x14, x19 + umlal v28.2D, v5.2S, v1.2S + umull v21.2D, v6.2S, v17.2S + umulh x14, x14, x19 + usra v30.2D, v7.2D, #32 + subs x9, x20, x5 + and v29.16B, v18.16B, v31.16B + cinv x23, x8, cc + mov x8, v26.d[1] + cneg x12, x9, cc + usra v21.2D, v18.2D, #32 + umlal v29.2D, v6.2S, v25.2S + mul x24, x4, x12 + umull v18.2D, v4.2S, v16.2S + movi v25.2D, #0x00000000ffffffff + eor x9, x14, x6 + and v7.16B, v30.16B, v25.16B + usra v21.2D, v29.2D, #32 + umulh x7, x10, x5 + usra v18.2D, v30.2D, #32 + umlal v7.2D, v4.2S, v1.2S + mov x19, v21.d[0] + umulh x3, x4, x12 + mov x14, v21.d[1] + usra v18.2D, v7.2D, #32 + adds x4, x8, x19 + mov x8, v26.d[0] + adcs x19, x26, x14 + adcs x14, x22, x7 + adc x12, x11, xzr + adds x11, x4, x8 + adcs x26, x19, x4 + adcs x22, x14, x19 + eor x4, x24, x23 + adcs x14, x12, x14 + eor x7, x25, x6 + adc x25, xzr, x12 + eor x19, x3, x23 + adds x3, x26, x8 + adcs x24, x22, x11 + adcs x12, x14, x26 + adcs x22, x25, x22 + adcs x26, xzr, x14 + adc x14, xzr, x25 + cmn x23, #0x1 + adcs x22, x22, x4 + adcs x19, x26, x19 + adc x25, x14, x23 + subs x14, x21, x17 + cneg x23, x14, cc + csetm x26, cc + subs x4, x20, x16 + cneg x14, x4, cc + cinv x4, x26, cc + cmn x6, #0x1 + adcs x11, x11, x7 + mul x7, x23, x14 + adcs x9, x3, x9 + adcs x26, x24, x6 + umulh x3, x23, x14 + adcs x14, x12, x6 + adcs x22, x22, x6 + adcs x12, x19, x6 + extr x24, x11, x8, #55 + adc x6, x25, x6 + subs x19, x15, x17 + csetm x17, cc + cneg x23, x19, cc + subs x19, x20, x13 + lsl x25, x8, #9 + eor x8, x7, x4 + cneg x20, x19, cc + umulh x7, x23, x20 + cinv x19, x17, cc + subs x17, x15, x10 + csetm x15, cc + stp x25, x24, [sp, #32] + cneg x24, x17, cc + mul x20, x23, x20 + subs x25, x5, x13 + cneg x13, x25, cc + cinv x15, x15, cc + mul x25, x24, x13 + subs x21, x21, x10 + csetm x23, cc + cneg x17, x21, cc + subs x21, x5, x16 + umulh x13, x24, x13 + cinv x10, x23, cc + cneg x23, x21, cc + cmn x4, #0x1 + adcs x14, x14, x8 + eor x21, x3, x4 + adcs x21, x22, x21 + eor x5, x20, x19 + adcs x24, x12, x4 + mul x12, x17, x23 + eor x8, x25, x15 + adc x25, x6, x4 + cmn x15, #0x1 + adcs x6, x9, x8 + ldp x20, x8, [x2, #48] + eor x9, x13, x15 + adcs x4, x26, x9 + umulh x26, x17, x23 + ldp x17, x13, [x1, #48] + adcs x9, x14, x15 + adcs x16, x21, x15 + adcs x14, x24, x15 + eor x21, x7, x19 + mul x23, x17, x20 + adc x24, x25, x15 + cmn x19, #0x1 + adcs x7, x4, x5 + adcs x9, x9, x21 + umulh x3, x13, x8 + adcs x16, x16, x19 + adcs x22, x14, x19 + eor x5, x12, x10 + adc x12, x24, x19 + cmn x10, #0x1 + adcs x19, x7, x5 + eor x14, x26, x10 + 
mov x7, v28.d[1] + adcs x24, x9, x14 + extr x4, x19, x6, #55 + umulh x15, x17, x20 + mov x14, v18.d[1] + lsr x9, x19, #55 + adcs x5, x16, x10 + mov x16, v18.d[0] + adcs x19, x22, x10 + str x9, [sp, #64] + extr x25, x6, x11, #55 + adc x21, x12, x10 + subs x26, x17, x13 + stp x25, x4, [sp, #48] + stp x19, x21, [sp, #16] + csetm x6, cc + cneg x4, x26, cc + mul x19, x13, x8 + subs x11, x8, x20 + stp x24, x5, [sp] + ldp x21, x10, [x1, #32] + cinv x12, x6, cc + cneg x6, x11, cc + mov x9, v28.d[0] + umulh x25, x4, x6 + adds x22, x7, x16 + ldp x16, x5, [x2, #32] + adcs x14, x23, x14 + adcs x11, x19, x15 + adc x24, x3, xzr + adds x3, x22, x9 + adcs x15, x14, x22 + mul x22, x4, x6 + adcs x6, x11, x14 + adcs x4, x24, x11 + eor x14, x25, x12 + adc x26, xzr, x24 + subs x7, x21, x10 + csetm x23, cc + cneg x19, x7, cc + subs x24, x5, x16 + cneg x11, x24, cc + cinv x7, x23, cc + adds x25, x15, x9 + eor x23, x22, x12 + adcs x22, x6, x3 + mul x24, x19, x11 + adcs x15, x4, x15 + adcs x6, x26, x6 + umulh x19, x19, x11 + adcs x11, xzr, x4 + adc x26, xzr, x26 + cmn x12, #0x1 + adcs x4, x6, x23 + eor x6, x24, x7 + adcs x14, x11, x14 + adc x26, x26, x12 + subs x11, x10, x13 + cneg x12, x11, cc + csetm x11, cc + eor x19, x19, x7 + subs x24, x8, x5 + cinv x11, x11, cc + cneg x24, x24, cc + cmn x7, #0x1 + adcs x3, x3, x6 + mul x23, x12, x24 + adcs x25, x25, x19 + adcs x6, x22, x7 + umulh x19, x12, x24 + adcs x22, x15, x7 + adcs x12, x4, x7 + eor x24, x23, x11 + adcs x4, x14, x7 + adc x26, x26, x7 + eor x19, x19, x11 + subs x14, x21, x17 + cneg x7, x14, cc + csetm x14, cc + subs x23, x20, x16 + cinv x14, x14, cc + cneg x23, x23, cc + cmn x11, #0x1 + adcs x22, x22, x24 + mul x24, x7, x23 + adcs x15, x12, x19 + adcs x4, x4, x11 + adc x19, x26, x11 + umulh x26, x7, x23 + subs x7, x21, x13 + eor x11, x24, x14 + cneg x23, x7, cc + csetm x12, cc + subs x7, x8, x16 + cneg x7, x7, cc + cinv x12, x12, cc + cmn x14, #0x1 + eor x26, x26, x14 + adcs x11, x25, x11 + mul x25, x23, x7 + adcs x26, x6, x26 + adcs x6, x22, x14 + adcs x24, x15, x14 + umulh x23, x23, x7 + adcs x4, x4, x14 + adc x22, x19, x14 + eor x14, x25, x12 + eor x7, x23, x12 + cmn x12, #0x1 + adcs x14, x26, x14 + ldp x19, x25, [x2] + ldp x15, x23, [x2, #16] + adcs x26, x6, x7 + adcs x24, x24, x12 + adcs x7, x4, x12 + adc x4, x22, x12 + subs x19, x19, x16 + ldp x16, x22, [x1] + sbcs x6, x25, x5 + ldp x12, x25, [x1, #16] + sbcs x15, x15, x20 + sbcs x8, x23, x8 + csetm x23, cc + subs x21, x21, x16 + eor x16, x19, x23 + sbcs x19, x10, x22 + eor x22, x6, x23 + eor x8, x8, x23 + sbcs x6, x17, x12 + sbcs x13, x13, x25 + csetm x12, cc + subs x10, x10, x17 + cneg x17, x10, cc + csetm x25, cc + subs x5, x20, x5 + eor x10, x19, x12 + cneg x19, x5, cc + eor x20, x15, x23 + eor x21, x21, x12 + cinv x15, x25, cc + mul x25, x17, x19 + subs x16, x16, x23 + sbcs x5, x22, x23 + eor x6, x6, x12 + sbcs x20, x20, x23 + eor x22, x13, x12 + sbc x8, x8, x23 + subs x21, x21, x12 + umulh x19, x17, x19 + sbcs x10, x10, x12 + sbcs x17, x6, x12 + eor x6, x19, x15 + eor x19, x25, x15 + umulh x25, x17, x20 + sbc x13, x22, x12 + cmn x15, #0x1 + adcs x22, x14, x19 + adcs x19, x26, x6 + ldp x6, x26, [sp] + adcs x14, x24, x15 + umulh x24, x21, x16 + adcs x7, x7, x15 + adc x15, x4, x15 + adds x4, x9, x6 + eor x9, x23, x12 + adcs x12, x3, x26 + stp x4, x12, [sp] + ldp x4, x26, [sp, #16] + umulh x12, x10, x5 + ldp x6, x23, [sp, #32] + adcs x3, x11, x4 + mul x4, x13, x8 + adcs x26, x22, x26 + ldp x22, x11, [sp, #48] + adcs x6, x19, x6 + stp x3, x26, [sp, #16] + mul x26, x10, x5 + adcs x14, x14, x23 + 
stp x6, x14, [sp, #32] + ldr x6, [sp, #64] + adcs x22, x7, x22 + adcs x14, x15, x11 + mul x11, x17, x20 + adc x19, x6, xzr + stp x22, x14, [sp, #48] + adds x14, x26, x24 + str x19, [sp, #64] + umulh x19, x13, x8 + adcs x7, x11, x12 + adcs x22, x4, x25 + mul x6, x21, x16 + adc x19, x19, xzr + subs x11, x17, x13 + cneg x12, x11, cc + csetm x11, cc + subs x24, x8, x20 + cinv x11, x11, cc + cneg x24, x24, cc + adds x4, x14, x6 + adcs x14, x7, x14 + mul x3, x12, x24 + adcs x7, x22, x7 + adcs x22, x19, x22 + umulh x12, x12, x24 + adc x24, xzr, x19 + adds x19, x14, x6 + eor x3, x3, x11 + adcs x26, x7, x4 + adcs x14, x22, x14 + adcs x25, x24, x7 + adcs x23, xzr, x22 + eor x7, x12, x11 + adc x12, xzr, x24 + subs x22, x21, x10 + cneg x24, x22, cc + csetm x22, cc + subs x15, x5, x16 + cinv x22, x22, cc + cneg x15, x15, cc + cmn x11, #0x1 + adcs x3, x25, x3 + mul x25, x24, x15 + adcs x23, x23, x7 + adc x11, x12, x11 + subs x7, x10, x13 + umulh x15, x24, x15 + cneg x12, x7, cc + csetm x7, cc + eor x24, x25, x22 + eor x25, x15, x22 + cmn x22, #0x1 + adcs x24, x4, x24 + adcs x19, x19, x25 + adcs x15, x26, x22 + adcs x4, x14, x22 + adcs x26, x3, x22 + adcs x25, x23, x22 + adc x23, x11, x22 + subs x14, x21, x17 + cneg x3, x14, cc + csetm x11, cc + subs x14, x8, x5 + cneg x14, x14, cc + cinv x7, x7, cc + subs x13, x21, x13 + cneg x21, x13, cc + csetm x13, cc + mul x22, x12, x14 + subs x8, x8, x16 + cinv x13, x13, cc + umulh x14, x12, x14 + cneg x12, x8, cc + subs x8, x20, x16 + cneg x8, x8, cc + cinv x16, x11, cc + eor x22, x22, x7 + cmn x7, #0x1 + eor x14, x14, x7 + adcs x4, x4, x22 + mul x11, x3, x8 + adcs x22, x26, x14 + adcs x14, x25, x7 + eor x25, x24, x9 + adc x26, x23, x7 + umulh x7, x3, x8 + subs x17, x10, x17 + cneg x24, x17, cc + eor x3, x11, x16 + csetm x11, cc + subs x20, x20, x5 + cneg x5, x20, cc + cinv x11, x11, cc + cmn x16, #0x1 + mul x17, x21, x12 + eor x8, x7, x16 + adcs x10, x19, x3 + and x19, x9, #0x1ff + adcs x20, x15, x8 + umulh x15, x21, x12 + eor x12, x10, x9 + eor x8, x6, x9 + adcs x6, x4, x16 + adcs x4, x22, x16 + adcs x21, x14, x16 + adc x7, x26, x16 + mul x10, x24, x5 + cmn x13, #0x1 + ldp x3, x14, [x1] + eor x17, x17, x13 + umulh x5, x24, x5 + adcs x20, x20, x17 + eor x17, x15, x13 + adcs x16, x6, x17 + eor x22, x10, x11 + adcs x23, x4, x13 + extr x10, x14, x3, #52 + and x26, x3, #0xfffffffffffff + adcs x24, x21, x13 + and x15, x10, #0xfffffffffffff + adc x6, x7, x13 + cmn x11, #0x1 + adcs x17, x20, x22 + eor x4, x5, x11 + ldp x21, x10, [sp] + adcs x7, x16, x4 + eor x16, x17, x9 + eor x13, x7, x9 + ldp x3, x17, [sp, #16] + adcs x7, x23, x11 + eor x23, x7, x9 + ldp x5, x22, [sp, #32] + adcs x7, x24, x11 + adc x24, x6, x11 + ldr x6, [x2, #64] + adds x20, x8, x21 + lsl x11, x20, #9 + eor x4, x7, x9 + orr x7, x11, x19 + eor x8, x24, x9 + adcs x11, x25, x10 + mul x26, x6, x26 + ldp x19, x24, [sp, #48] + adcs x12, x12, x3 + adcs x16, x16, x17 + adcs x9, x13, x5 + ldr x25, [sp, #64] + extr x20, x11, x20, #55 + adcs x13, x23, x22 + adcs x4, x4, x19 + extr x23, x12, x11, #55 + adcs x8, x8, x24 + adc x11, x25, xzr + adds x21, x9, x21 + extr x9, x16, x12, #55 + lsr x12, x16, #55 + adcs x10, x13, x10 + mul x15, x6, x15 + adcs x13, x4, x3 + ldp x16, x4, [x2] + ldr x3, [x1, #64] + adcs x17, x8, x17 + adcs x5, x5, x7 + adcs x20, x22, x20 + adcs x8, x19, x23 + and x22, x16, #0xfffffffffffff + ldp x19, x7, [x1, #16] + adcs x9, x24, x9 + extr x24, x4, x16, #52 + adc x16, x12, x25 + mul x22, x3, x22 + and x25, x24, #0xfffffffffffff + extr x14, x19, x14, #40 + and x12, x14, #0xfffffffffffff + extr 
x23, x7, x19, #28 + ldp x19, x24, [x2, #16] + mul x14, x3, x25 + and x23, x23, #0xfffffffffffff + add x22, x26, x22 + lsl x11, x11, #48 + lsr x26, x22, #52 + lsl x25, x22, #12 + mul x22, x6, x12 + extr x12, x19, x4, #40 + add x4, x15, x14 + mul x15, x6, x23 + add x4, x4, x26 + extr x23, x24, x19, #28 + ldp x14, x19, [x1, #32] + and x26, x12, #0xfffffffffffff + extr x12, x4, x25, #12 + and x25, x23, #0xfffffffffffff + adds x21, x21, x12 + mul x12, x3, x26 + extr x23, x14, x7, #16 + and x23, x23, #0xfffffffffffff + mul x7, x3, x25 + ldp x25, x26, [x2, #32] + add x12, x22, x12 + extr x22, x19, x14, #56 + mul x23, x6, x23 + lsr x14, x14, #4 + extr x24, x25, x24, #16 + add x7, x15, x7 + and x15, x24, #0xfffffffffffff + and x22, x22, #0xfffffffffffff + lsr x24, x4, #52 + mul x15, x3, x15 + and x14, x14, #0xfffffffffffff + add x12, x12, x24 + lsl x24, x4, #12 + lsr x4, x12, #52 + extr x24, x12, x24, #24 + adcs x10, x10, x24 + lsl x24, x12, #12 + add x12, x7, x4 + mul x22, x6, x22 + add x4, x23, x15 + extr x7, x12, x24, #36 + adcs x13, x13, x7 + lsl x15, x12, #12 + add x7, x4, x11 + lsr x24, x12, #52 + ldp x23, x11, [x2, #48] + add x4, x7, x24 + mul x12, x6, x14 + extr x7, x26, x25, #56 + extr x14, x4, x15, #48 + and x2, x7, #0xfffffffffffff + extr x24, x11, x23, #32 + ldp x15, x7, [x1, #48] + and x1, x24, #0xfffffffffffff + lsr x24, x4, #52 + mul x2, x3, x2 + extr x26, x23, x26, #44 + lsr x23, x25, #4 + and x23, x23, #0xfffffffffffff + and x25, x26, #0xfffffffffffff + extr x26, x7, x15, #32 + extr x19, x15, x19, #44 + mul x23, x3, x23 + and x15, x26, #0xfffffffffffff + lsl x26, x4, #12 + and x4, x19, #0xfffffffffffff + lsr x11, x11, #20 + mul x19, x6, x4 + adcs x17, x17, x14 + add x14, x22, x2 + add x22, x12, x23 + lsr x7, x7, #20 + add x22, x22, x24 + extr x2, x22, x26, #60 + mul x24, x3, x25 + lsr x22, x22, #52 + add x14, x14, x22 + lsl x22, x2, #8 + extr x22, x14, x22, #8 + lsl x2, x14, #12 + mul x1, x3, x1 + adcs x12, x5, x22 + mul x5, x6, x15 + and x26, x10, x13 + and x4, x26, x17 + add x23, x19, x24 + lsr x14, x14, #52 + mul x22, x3, x11 + add x11, x23, x14 + extr x25, x11, x2, #20 + lsl x19, x11, #12 + adcs x25, x20, x25 + and x14, x4, x12 + add x1, x5, x1 + and x14, x14, x25 + mul x15, x6, x7 + add x26, x15, x22 + mul x6, x6, x3 + lsr x22, x11, #52 + add x4, x1, x22 + lsr x1, x4, #52 + extr x3, x4, x19, #32 + lsl x15, x4, #12 + add x7, x26, x1 + adcs x23, x8, x3 + extr x20, x7, x15, #44 + and x3, x14, x23 + lsr x19, x7, #44 + adcs x7, x9, x20 + add x11, x6, x19 + adc x4, x16, x11 + lsr x14, x4, #9 + cmp xzr, xzr + and x15, x3, x7 + orr x3, x4, #0xfffffffffffffe00 + adcs xzr, x21, x14 + adcs xzr, x15, xzr + adcs xzr, x3, xzr + adcs x11, x21, x14 + and x14, x11, #0x1ff + adcs x1, x10, xzr + extr x10, x1, x11, #9 + str x14, [x0, #64] + adcs x14, x13, xzr + extr x11, x14, x1, #9 + adcs x1, x17, xzr + extr x4, x1, x14, #9 + stp x10, x11, [x0] + adcs x11, x12, xzr + extr x14, x11, x1, #9 + adcs x10, x25, xzr + extr x11, x10, x11, #9 + stp x4, x14, [x0, #16] + adcs x14, x23, xzr + extr x10, x14, x10, #9 + adcs x1, x7, xzr + stp x11, x10, [x0, #32] + extr x14, x1, x14, #9 + adc x10, x3, xzr + extr x26, x10, x1, #9 + stp x14, x26, [x0, #48] + +// Restore regs and return + + add sp, sp, #80 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/p521/bignum_sqr_p521_neon.S b/arm/p521/bignum_sqr_p521_neon.S new file mode 100644 index 
0000000000..13cd1c2541
--- /dev/null
+++ b/arm/p521/bignum_sqr_p521_neon.S
@@ -0,0 +1,1121 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced
+// Input x[9]; output z[9]
+//
+// extern void bignum_sqr_p521_neon (uint64_t z[static 9],
+// uint64_t x[static 9]);
+//
+// Standard ARM ABI: X0 = z, X1 = x
+// ----------------------------------------------------------------------------
+#include "_internal_s2n_bignum.h"
+
+// bignum_sqr_p521_neon is functionally equivalent to bignum_sqr_p521.
+// It is written in a way that
+// 1. A subset of scalar multiplications in bignum_sqr_p521 is carefully
+// chosen and vectorized
+// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer.
+// https://github.com/slothy-optimizer/slothy
+//
+// The output program of step 1 is as follows:
+//
+// stp x19, x20, [sp, #-16]!
+// stp x21, x22, [sp, #-16]!
+// stp x23, x24, [sp, #-16]!
+// ldp x20, x19, [x1]
+// ldr q23, [x1]
+// ldr q1, [x1]
+// ldr q16, [x1]
+// ldp x14, x12, [x1, #16]
+// ldr q28, [x1, #16]
+// ldr q31, [x1, #16]
+// ldp x9, x2, [x1, #32]
+// ldr q29, [x1, #32]
+// ldr q4, [x1, #32]
+// ldr q5, [x1]
+// ldr q2, [x1, #32]
+// ldp x6, x13, [x1, #48]
+// ldr q24, [x1, #48]
+// ldr q27, [x1, #48]
+// ldr q0, [x1, #16]
+// ldr q30, [x1, #48]
+// mul x17, x9, x6
+// mul x10, x2, x13
+// umulh x24, x9, x6
+// subs x4, x9, x2
+// cneg x4, x4, cc
+// csetm x16, cc
+// subs x3, x13, x6
+// cneg x23, x3, cc
+// mul x3, x4, x23
+// umulh x4, x4, x23
+// cinv x22, x16, cc
+// eor x23, x3, x22
+// eor x16, x4, x22
+// adds x3, x17, x24
+// adc x24, x24, xzr
+// umulh x4, x2, x13
+// adds x3, x3, x10
+// adcs x24, x24, x4
+// adc x4, x4, xzr
+// adds x24, x24, x10
+// adc x10, x4, xzr
+// cmn x22, #0x1
+// adcs x4, x3, x23
+// adcs x24, x24, x16
+// adc x10, x10, x22
+// adds x8, x17, x17
+// adcs x22, x4, x4
+// adcs x5, x24, x24
+// adcs x11, x10, x10
+// adc x23, xzr, xzr
+// movi v25.2D, #0xffffffff
+// uzp2 v19.4S, v4.4S, v4.4S
+// xtn v26.2S, v29.2D
+// xtn v22.2S, v4.2D
+// rev64 v4.4S, v4.4S
+// umull v7.2D, v26.2S, v22.2S
+// umull v21.2D, v26.2S, v19.2S
+// uzp2 v17.4S, v29.4S, v29.4S
+// mul v4.4S, v4.4S, v29.4S
+// usra v21.2D, v7.2D, #32
+// umull v18.2D, v17.2S, v19.2S
+// uaddlp v4.2D, v4.4S
+// and v7.16B, v21.16B, v25.16B
+// umlal v7.2D, v17.2S, v22.2S
+// shl v4.2D, v4.2D, #32
+// usra v18.2D, v21.2D, #32
+// umlal v4.2D, v26.2S, v22.2S
+// usra v18.2D, v7.2D, #32
+// mov x15, v4.d[0]
+// mov x16, v4.d[1]
+// mul x3, x9, x2
+// mov x10, v18.d[0]
+// mov x17, v18.d[1]
+// umulh x4, x9, x2
+// adds x24, x10, x3
+// adcs x10, x16, x4
+// adc x17, x17, xzr
+// adds x7, x24, x3
+// adcs x10, x10, x4
+// adc x17, x17, xzr
+// adds x8, x8, x10
+// adcs x22, x22, x17
+// adcs x21, x5, xzr
+// adcs x5, x11, xzr
+// adc x11, x23, xzr
+// movi v25.2D, #0xffffffff
+// uzp2 v19.4S, v27.4S, v27.4S
+// xtn v26.2S, v24.2D
+// xtn v22.2S, v27.2D
+// rev64 v4.4S, v27.4S
+// umull v7.2D, v26.2S, v22.2S
+// umull v21.2D, v26.2S, v19.2S
+// uzp2 v17.4S, v24.4S, v24.4S
+// mul v4.4S, v4.4S, v24.4S
+// usra v21.2D, v7.2D, #32
+// umull v18.2D, v17.2S, v19.2S
+// uaddlp v4.2D, v4.4S
+// and v7.16B, v21.16B, v25.16B
+// umlal v7.2D, v17.2S, v22.2S
+// shl v4.2D, v4.2D, #32
+// usra v18.2D, v21.2D, #32
+// umlal v4.2D, v26.2S, v22.2S
+// usra v18.2D, v7.2D, 
#32 +// mov x23, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x6, x13 +// mov x10, v18.d[0] +// mov x17, v18.d[1] +// umulh x4, x6, x13 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x24, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x23, x23, x21 +// adcs x16, x24, x5 +// adcs x3, x10, x11 +// adc x21, x17, xzr +// ldr x17, [x1, #64] +// add x5, x17, x17 +// mul x11, x17, x17 +// and x17, x20, #0xfffffffffffff +// mul x4, x5, x17 +// extr x17, x19, x20, #52 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x17, x24, x17, #12 +// adds x15, x15, x17 +// extr x17, x14, x19, #40 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x4, x10, x17 +// lsl x17, x24, #12 +// extr x17, x4, x17, #24 +// adcs x7, x7, x17 +// extr x17, x12, x14, #28 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x17, x24, x17, #36 +// adcs x8, x8, x17 +// extr x17, x9, x12, #16 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x4, x10, x17 +// lsl x17, x24, #12 +// extr x17, x4, x17, #48 +// adcs x22, x22, x17 +// lsr x17, x9, #4 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x4, x24, x17, #60 +// extr x17, x2, x9, #56 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x24, x10, x17 +// lsl x17, x4, #8 +// extr x17, x24, x17, #8 +// adcs x23, x23, x17 +// extr x17, x6, x2, #44 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x4, x10, x17 +// lsl x17, x24, #12 +// extr x17, x4, x17, #20 +// adcs x16, x16, x17 +// extr x17, x13, x6, #32 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x17, x24, x17, #32 +// adcs x3, x3, x17 +// lsr x17, x13, #20 +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x10, x10, x17 +// lsl x17, x24, #12 +// extr x17, x10, x17, #44 +// adcs x4, x21, x17 +// lsr x17, x10, #44 +// adc x24, x11, x17 +// extr x10, x7, x15, #9 +// extr x17, x8, x7, #9 +// stp x10, x17, [x0] // @slothy:writes=buffer0 +// extr x10, x22, x8, #9 +// extr x17, x23, x22, #9 +// stp x10, x17, [x0, #16] // @slothy:writes=buffer16 +// extr x10, x16, x23, #9 +// extr x17, x3, x16, #9 +// stp x10, x17, [x0, #32] // @slothy:writes=buffer32 +// extr x10, x4, x3, #9 +// extr x17, x24, x4, #9 +// stp x10, x17, [x0, #48] // @slothy:writes=buffer48 +// and x10, x15, #0x1ff +// lsr x17, x24, #9 +// add x17, x10, x17 +// str x17, [x0, #64] // @slothy:writes=buffer64 +// uzp1 v17.4S, v28.4S, v23.4S +// rev64 v4.4S, v28.4S +// uzp1 v7.4S, v23.4S, v23.4S +// mul v4.4S, v4.4S, v23.4S +// uaddlp v4.2D, v4.4S +// shl v4.2D, v4.2D, #32 +// umlal v4.2D, v7.2S, v17.2S +// mov x8, v4.d[0] +// mov x22, v4.d[1] +// umulh x23, x20, x14 +// subs x17, x20, x19 +// cneg x4, x17, cc +// csetm x24, cc +// subs x17, x12, x14 +// cneg x17, x17, cc +// mul x10, x4, x17 +// umulh x17, x4, x17 +// cinv x16, x24, cc +// eor x3, x10, x16 +// eor x4, x17, x16 +// adds x24, x8, x23 +// adc x10, x23, xzr +// umulh x17, x19, x12 +// adds x24, x24, x22 +// adcs x10, x10, x17 +// adc x17, x17, xzr +// adds x10, x10, x22 +// adc x17, x17, xzr +// cmn x16, #0x1 +// adcs x24, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, x16 +// adds x15, x8, x8 +// adcs x7, x24, x24 +// 
adcs x8, x10, x10 +// adcs x22, x17, x17 +// adc x23, xzr, xzr +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v16.4S, v16.4S +// xtn v26.2S, v1.2D +// xtn v22.2S, v16.2D +// rev64 v4.4S, v16.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v1.4S, v1.4S +// mul v4.4S, v4.4S, v1.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x21, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x20, x19 +// mov x10, v18.d[0] +// mov x17, v18.d[1] +// umulh x4, x20, x19 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x5, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x11, x15, x10 +// adcs x15, x7, x17 +// adcs x7, x8, xzr +// adcs x8, x22, xzr +// adc x22, x23, xzr +// xtn v7.2S, v31.2D +// shrn v4.2S, v31.2D, #32 +// umull v4.2D, v7.2S, v4.2S +// shl v4.2D, v4.2D, #33 +// umlal v4.2D, v7.2S, v7.2S +// mov x23, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x14, x12 +// umulh x10, x14, x14 +// umulh x17, x12, x12 +// umulh x4, x14, x12 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x24, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x16, x23, x7 +// adcs x3, x24, x8 +// adcs x4, x10, x22 +// adc x24, x17, xzr +// ldp x10, x17, [x0] // @slothy:reads=buffer0 +// adds x10, x10, x21 +// adcs x17, x17, x5 +// stp x10, x17, [x0] // @slothy:writes=buffer0 +// ldp x10, x17, [x0, #16] // @slothy:reads=buffer16 +// adcs x10, x10, x11 +// adcs x17, x17, x15 +// stp x10, x17, [x0, #16] // @slothy:writes=buffer16 +// ldp x10, x17, [x0, #32] // @slothy:reads=buffer32 +// adcs x10, x10, x16 +// adcs x17, x17, x3 +// stp x10, x17, [x0, #32] // @slothy:writes=buffer32 +// ldp x10, x17, [x0, #48] // @slothy:reads=buffer48 +// adcs x10, x10, x4 +// adcs x17, x17, x24 +// stp x10, x17, [x0, #48] // @slothy:writes=buffer48 +// ldr x17, [x0, #64] // @slothy:reads=buffer64 +// adc x17, x17, xzr +// str x17, [x0, #64] // @slothy:writes=buffer64 +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v2.4S, v2.4S +// xtn v26.2S, v5.2D +// xtn v22.2S, v2.2D +// rev64 v4.4S, v2.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v5.4S, v5.4S +// mul v4.4S, v4.4S, v5.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x5, v4.d[0] +// mov x4, v4.d[1] +// movi v25.2D, #0xffffffff +// uzp2 v17.4S, v30.4S, v30.4S +// xtn v19.2S, v0.2D +// xtn v26.2S, v30.2D +// rev64 v4.4S, v30.4S +// umull v7.2D, v19.2S, v26.2S +// umull v22.2D, v19.2S, v17.2S +// uzp2 v21.4S, v0.4S, v0.4S +// mul v4.4S, v4.4S, v0.4S +// usra v22.2D, v7.2D, #32 +// umull v17.2D, v21.2S, v17.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v22.16B, v25.16B +// umlal v7.2D, v21.2S, v26.2S +// shl v4.2D, v4.2D, #32 +// usra v17.2D, v22.2D, #32 +// umlal v4.2D, v19.2S, v26.2S +// usra v17.2D, v7.2D, #32 +// mov x24, v4.d[0] +// mov x10, v4.d[1] +// mov x17, v18.d[0] +// adds x4, x4, x17 +// mov x17, v18.d[1] +// adcs x24, x24, x17 +// mov x17, v17.d[0] +// adcs x10, x10, x17 +// mov x17, v17.d[1] +// adc x17, x17, xzr +// adds x15, x4, x5 +// adcs x4, x24, x4 +// adcs x24, x10, x24 +// adcs x10, x17, x10 +// adc 
x17, xzr, x17 +// adds x7, x4, x5 +// adcs x8, x24, x15 +// adcs x22, x10, x4 +// adcs x23, x17, x24 +// adcs x16, xzr, x10 +// adc x3, xzr, x17 +// subs x17, x14, x12 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x13, x6 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x23, x23, x17 +// eor x17, x24, x10 +// adcs x16, x16, x17 +// adc x3, x3, x10 +// subs x17, x20, x19 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x2, x9 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x11, x15, x17 +// eor x17, x24, x10 +// adcs x15, x7, x17 +// adcs x7, x8, x10 +// adcs x22, x22, x10 +// adcs x23, x23, x10 +// adcs x16, x16, x10 +// adc x3, x3, x10 +// subs x17, x19, x12 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x13, x2 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x8, x22, x17 +// eor x17, x24, x10 +// adcs x23, x23, x17 +// adcs x16, x16, x10 +// adc x3, x3, x10 +// subs x17, x20, x14 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x6, x9 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x22, x15, x17 +// eor x17, x24, x10 +// adcs x4, x7, x17 +// adcs x24, x8, x10 +// adcs x23, x23, x10 +// adcs x16, x16, x10 +// adc x3, x3, x10 +// subs x12, x20, x12 +// cneg x10, x12, cc +// csetm x17, cc +// subs x12, x13, x9 +// cneg x9, x12, cc +// mul x12, x10, x9 +// umulh x13, x10, x9 +// cinv x9, x17, cc +// cmn x9, #0x1 +// eor x12, x12, x9 +// adcs x4, x4, x12 +// eor x12, x13, x9 +// adcs x24, x24, x12 +// adcs x10, x23, x9 +// adcs x17, x16, x9 +// adc x13, x3, x9 +// subs x19, x19, x14 +// cneg x12, x19, cc +// csetm x9, cc +// subs x6, x6, x2 +// cneg x14, x6, cc +// mul x19, x12, x14 +// umulh x12, x12, x14 +// cinv x14, x9, cc +// cmn x14, #0x1 +// eor x19, x19, x14 +// adcs x23, x4, x19 +// eor x19, x12, x14 +// adcs x16, x24, x19 +// adcs x6, x10, x14 +// adcs x2, x17, x14 +// adc x9, x13, x14 +// ldp x12, x14, [x0] // @slothy:reads=buffer0 +// extr x19, x6, x16, #8 +// adds x10, x19, x12 +// extr x19, x2, x6, #8 +// adcs x17, x19, x14 +// ldp x14, x12, [x0, #16] // @slothy:reads=buffer16 +// extr x19, x9, x2, #8 +// adcs x13, x19, x14 +// and x14, x17, x13 +// lsr x19, x9, #8 +// adcs x6, x19, x12 +// and x9, x14, x6 +// ldp x14, x12, [x0, #32] // @slothy:reads=buffer32 +// lsl x19, x5, #1 +// adcs x2, x19, x14 +// and x14, x9, x2 +// extr x19, x11, x5, #63 +// adcs x3, x19, x12 +// and x9, x14, x3 +// ldp x14, x12, [x0, #48] // @slothy:reads=buffer48 +// extr x19, x22, x11, #63 +// adcs x4, x19, x14 +// and x14, x9, x4 +// extr x19, x23, x22, #63 +// adcs x24, x19, x12 +// and x12, x14, x24 +// ldr x14, [x0, #64] // @slothy:reads=buffer64 +// extr x19, x16, x23, #63 +// and x19, x19, #0x1ff +// adc x19, x14, x19 +// lsr x14, x19, #9 +// orr x19, x19, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x10, x14 +// adcs xzr, x12, xzr +// adcs xzr, x19, xzr +// adcs x10, x10, x14 +// adcs x17, x17, xzr +// adcs x13, x13, xzr +// adcs x6, x6, xzr +// adcs x2, x2, xzr +// adcs x9, x3, xzr +// adcs x12, x4, xzr +// adcs x14, x24, xzr +// adc x19, x19, xzr +// and x19, x19, #0x1ff +// stp x10, x17, [x0] // @slothy:writes=buffer0 +// stp x13, x6, [x0, #16] // @slothy:writes=buffer16 +// stp x2, x9, [x0, #32] // 
@slothy:writes=buffer32 +// stp x12, x14, [x0, #48] // @slothy:writes=buffer48 +// str x19, [x0, #64] // @slothy:writes=buffer64 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_sqr_p521_neon): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + + ldr q23, [x1, #32] + ldp x9, x2, [x1, #32] + ldr q16, [x1, #32] + ldr q20, [x1, #48] + ldp x6, x13, [x1, #48] + rev64 v2.4S, v23.4S + mul x14, x9, x2 + ldr q31, [x1, #48] + subs x22, x9, x2 + uzp2 v26.4S, v23.4S, v23.4S + mul v30.4S, v2.4S, v16.4S + xtn v0.2S, v20.2D + csetm x12, cc + xtn v21.2S, v16.2D + xtn v23.2S, v23.2D + umulh x10, x9, x6 + rev64 v27.4S, v31.4S + umull v2.2D, v21.2S, v26.2S + cneg x23, x22, cc + uaddlp v25.2D, v30.4S + umull v18.2D, v21.2S, v23.2S + mul x22, x9, x6 + mul v6.4S, v27.4S, v20.4S + uzp2 v17.4S, v20.4S, v20.4S + shl v20.2D, v25.2D, #32 + uzp2 v27.4S, v31.4S, v31.4S + mul x16, x2, x13 + umlal v20.2D, v21.2S, v23.2S + usra v2.2D, v18.2D, #32 + adds x8, x22, x10 + umull v25.2D, v17.2S, v27.2S + xtn v31.2S, v31.2D + movi v1.2D, #0xffffffff + adc x3, x10, xzr + umulh x21, x2, x13 + uzp2 v21.4S, v16.4S, v16.4S + umull v18.2D, v0.2S, v27.2S + subs x19, x13, x6 + and v7.16B, v2.16B, v1.16B + umull v27.2D, v0.2S, v31.2S + cneg x20, x19, cc + movi v30.2D, #0xffffffff + umull v16.2D, v21.2S, v26.2S + umlal v7.2D, v21.2S, v23.2S + mul x19, x23, x20 + cinv x7, x12, cc + uaddlp v6.2D, v6.4S + eor x12, x19, x7 + adds x11, x8, x16 + umulh x10, x23, x20 + ldr q1, [x1] + usra v16.2D, v2.2D, #32 + adcs x19, x3, x21 + shl v2.2D, v6.2D, #32 + adc x20, x21, xzr + adds x17, x19, x16 + usra v18.2D, v27.2D, #32 + adc x19, x20, xzr + cmn x7, #0x1 + umlal v2.2D, v0.2S, v31.2S + umulh x16, x9, x2 + adcs x8, x11, x12 + usra v16.2D, v7.2D, #32 + ldr x12, [x1, #64] + eor x20, x10, x7 + umulh x10, x6, x13 + mov x23, v2.d[0] + mov x3, v2.d[1] + adcs x21, x17, x20 + usra v25.2D, v18.2D, #32 + and v23.16B, v18.16B, v30.16B + adc x7, x19, x7 + adds x22, x22, x22 + ldr q7, [x1, #16] + adcs x17, x8, x8 + umlal v23.2D, v17.2S, v31.2S + mov x19, v16.d[0] + mul x11, x12, x12 + ldr q4, [x1] + usra v25.2D, v23.2D, #32 + add x5, x12, x12 + adcs x15, x21, x21 + ldr q28, [x1] + mov x12, v20.d[1] + adcs x24, x7, x7 + mov x21, v16.d[1] + adc x4, xzr, xzr + adds x19, x19, x14 + ldr q18, [x1, #16] + xtn v26.2S, v1.2D + adcs x8, x12, x16 + adc x21, x21, xzr + adds x7, x19, x14 + xtn v23.2S, v7.2D + rev64 v21.4S, v28.4S + adcs x12, x8, x16 + ldp x20, x19, [x1] + mov x16, v25.d[1] + xtn v22.2S, v28.2D + adc x14, x21, xzr + adds x8, x22, x12 + uzp2 v24.4S, v28.4S, v28.4S + rev64 v28.4S, v18.4S + mul x12, x6, x13 + mul v16.4S, v21.4S, v1.4S + shrn v31.2S, v7.2D, #32 + adcs x22, x17, x14 + 
mov x14, v25.d[0] + and x21, x20, #0xfffffffffffff + umull v17.2D, v26.2S, v24.2S + ldr q2, [x1, #32] + adcs x17, x15, xzr + ldr q30, [x1, #48] + umull v7.2D, v26.2S, v22.2S + adcs x15, x24, xzr + ldr q0, [x1, #16] + movi v6.2D, #0xffffffff + adc x4, x4, xzr + adds x14, x14, x12 + uzp1 v27.4S, v18.4S, v4.4S + uzp2 v19.4S, v1.4S, v1.4S + adcs x24, x3, x10 + mul x3, x5, x21 + umull v29.2D, v23.2S, v31.2S + ldr q5, [x1] + adc x21, x16, xzr + adds x16, x14, x12 + extr x12, x19, x20, #52 + umull v18.2D, v19.2S, v24.2S + adcs x24, x24, x10 + and x10, x12, #0xfffffffffffff + ldp x14, x12, [x1, #16] + usra v17.2D, v7.2D, #32 + adc x21, x21, xzr + adds x23, x23, x17 + mul x17, x5, x10 + shl v21.2D, v29.2D, #33 + lsl x10, x3, #12 + lsr x1, x3, #52 + rev64 v29.4S, v2.4S + uaddlp v25.2D, v16.4S + add x17, x17, x1 + adcs x16, x16, x15 + extr x3, x14, x19, #40 + mov x15, v20.d[0] + extr x10, x17, x10, #12 + and x3, x3, #0xfffffffffffff + shl v3.2D, v25.2D, #32 + and v6.16B, v17.16B, v6.16B + mul x1, x5, x3 + usra v18.2D, v17.2D, #32 + adcs x3, x24, x4 + extr x4, x12, x14, #28 + umlal v6.2D, v19.2S, v22.2S + xtn v20.2S, v2.2D + umlal v3.2D, v26.2S, v22.2S + movi v26.2D, #0xffffffff + lsr x24, x17, #52 + and x4, x4, #0xfffffffffffff + uzp2 v19.4S, v2.4S, v2.4S + add x1, x1, x24 + mul x24, x5, x4 + lsl x4, x17, #12 + xtn v24.2S, v5.2D + extr x17, x1, x4, #24 + adc x21, x21, xzr + umlal v21.2D, v23.2S, v23.2S + adds x4, x15, x10 + lsl x10, x1, #12 + adcs x15, x7, x17 + mul v23.4S, v28.4S, v4.4S + and x7, x4, #0x1ff + lsr x17, x1, #52 + umulh x1, x19, x12 + uzp2 v17.4S, v5.4S, v5.4S + extr x4, x15, x4, #9 + add x24, x24, x17 + mul v29.4S, v29.4S, v5.4S + extr x17, x24, x10, #36 + extr x10, x9, x12, #16 + uzp1 v28.4S, v4.4S, v4.4S + adcs x17, x8, x17 + and x8, x10, #0xfffffffffffff + umull v16.2D, v24.2S, v20.2S + extr x10, x17, x15, #9 + mul x15, x5, x8 + stp x4, x10, [x0] + lsl x4, x24, #12 + lsr x8, x9, #4 + uaddlp v4.2D, v23.4S + and x8, x8, #0xfffffffffffff + umull v23.2D, v24.2S, v19.2S + mul x8, x5, x8 + extr x10, x2, x9, #56 + lsr x24, x24, #52 + and x10, x10, #0xfffffffffffff + add x15, x15, x24 + extr x4, x15, x4, #48 + mul x24, x5, x10 + lsr x10, x15, #52 + usra v23.2D, v16.2D, #32 + add x10, x8, x10 + shl v4.2D, v4.2D, #32 + adcs x22, x22, x4 + extr x4, x6, x2, #44 + lsl x15, x15, #12 + lsr x8, x10, #52 + extr x15, x10, x15, #60 + and x10, x4, #0xfffffffffffff + umlal v4.2D, v28.2S, v27.2S + add x8, x24, x8 + extr x4, x13, x6, #32 + mul x24, x5, x10 + uzp2 v16.4S, v30.4S, v30.4S + lsl x10, x15, #8 + rev64 v28.4S, v30.4S + and x15, x4, #0xfffffffffffff + extr x4, x8, x10, #8 + mul x10, x5, x15 + lsl x15, x8, #12 + adcs x23, x23, x4 + lsr x4, x8, #52 + lsr x8, x13, #20 + add x4, x24, x4 + mul x8, x5, x8 + lsr x24, x4, #52 + extr x15, x4, x15, #20 + lsl x4, x4, #12 + add x10, x10, x24 + adcs x15, x16, x15 + extr x4, x10, x4, #32 + umulh x5, x20, x14 + adcs x3, x3, x4 + usra v18.2D, v6.2D, #32 + lsl x16, x10, #12 + extr x24, x15, x23, #9 + lsr x10, x10, #52 + uzp2 v27.4S, v0.4S, v0.4S + add x8, x8, x10 + extr x10, x3, x15, #9 + extr x4, x22, x17, #9 + and v25.16B, v23.16B, v26.16B + lsr x17, x8, #44 + extr x15, x8, x16, #44 + extr x16, x23, x22, #9 + xtn v7.2S, v30.2D + mov x8, v4.d[0] + stp x24, x10, [x0, #32] + uaddlp v30.2D, v29.4S + stp x4, x16, [x0, #16] + umulh x24, x20, x19 + adcs x15, x21, x15 + adc x16, x11, x17 + subs x11, x20, x19 + xtn v5.2S, v0.2D + csetm x17, cc + extr x3, x15, x3, #9 + mov x22, v4.d[1] + cneg x21, x11, cc + subs x10, x12, x14 + mul v31.4S, v28.4S, v0.4S + cneg x10, x10, 
cc + cinv x11, x17, cc + shl v4.2D, v30.2D, #32 + umull v28.2D, v5.2S, v16.2S + extr x23, x16, x15, #9 + adds x4, x8, x5 + mul x17, x21, x10 + umull v22.2D, v5.2S, v7.2S + adc x15, x5, xzr + adds x4, x4, x22 + uaddlp v2.2D, v31.4S + lsr x5, x16, #9 + adcs x16, x15, x1 + mov x15, v18.d[0] + adc x1, x1, xzr + umulh x10, x21, x10 + adds x22, x16, x22 + umlal v4.2D, v24.2S, v20.2S + umull v30.2D, v27.2S, v16.2S + stp x3, x23, [x0, #48] + add x3, x7, x5 + adc x16, x1, xzr + usra v28.2D, v22.2D, #32 + mul x23, x20, x19 + eor x1, x17, x11 + cmn x11, #0x1 + mov x17, v18.d[1] + umull v18.2D, v17.2S, v19.2S + adcs x7, x4, x1 + eor x1, x10, x11 + umlal v25.2D, v17.2S, v20.2S + movi v16.2D, #0xffffffff + adcs x22, x22, x1 + usra v18.2D, v23.2D, #32 + umulh x4, x14, x14 + adc x1, x16, x11 + adds x10, x8, x8 + shl v23.2D, v2.2D, #32 + str x3, [x0, #64] + adcs x5, x7, x7 + and v16.16B, v28.16B, v16.16B + usra v30.2D, v28.2D, #32 + adcs x7, x22, x22 + mov x21, v3.d[1] + adcs x11, x1, x1 + umlal v16.2D, v27.2S, v7.2S + adc x22, xzr, xzr + adds x16, x15, x23 + mul x8, x14, x12 + umlal v23.2D, v5.2S, v7.2S + usra v18.2D, v25.2D, #32 + umulh x15, x14, x12 + adcs x21, x21, x24 + usra v30.2D, v16.2D, #32 + adc x1, x17, xzr + adds x3, x16, x23 + adcs x21, x21, x24 + adc x1, x1, xzr + adds x24, x10, x21 + umulh x21, x12, x12 + adcs x16, x5, x1 + adcs x10, x7, xzr + mov x17, v21.d[1] + adcs x23, x11, xzr + adc x5, x22, xzr + adds x1, x4, x8 + adcs x22, x17, x15 + ldp x17, x4, [x0] + mov x11, v21.d[0] + adc x21, x21, xzr + adds x1, x1, x8 + adcs x15, x22, x15 + adc x8, x21, xzr + adds x22, x11, x10 + mov x21, v3.d[0] + adcs x11, x1, x23 + ldp x1, x10, [x0, #16] + adcs x15, x15, x5 + adc x7, x8, xzr + adds x8, x17, x21 + mov x23, v4.d[1] + ldp x5, x21, [x0, #32] + adcs x17, x4, x3 + ldr x4, [x0, #64] + mov x3, v18.d[0] + adcs x24, x1, x24 + stp x8, x17, [x0] + adcs x17, x10, x16 + ldp x1, x16, [x0, #48] + adcs x5, x5, x22 + adcs x8, x21, x11 + stp x5, x8, [x0, #32] + adcs x1, x1, x15 + mov x15, v23.d[1] + adcs x21, x16, x7 + stp x1, x21, [x0, #48] + adc x10, x4, xzr + subs x7, x14, x12 + mov x16, v18.d[1] + cneg x5, x7, cc + csetm x4, cc + subs x11, x13, x6 + mov x8, v23.d[0] + cneg x7, x11, cc + cinv x21, x4, cc + mov x11, v30.d[0] + adds x4, x23, x3 + mul x22, x5, x7 + mov x23, v30.d[1] + adcs x8, x8, x16 + adcs x16, x15, x11 + adc x11, x23, xzr + umulh x3, x5, x7 + stp x24, x17, [x0, #16] + mov x5, v4.d[0] + subs x15, x20, x19 + cneg x7, x15, cc + str x10, [x0, #64] + csetm x1, cc + subs x24, x2, x9 + cneg x17, x24, cc + cinv x15, x1, cc + adds x23, x4, x5 + umulh x1, x7, x17 + adcs x24, x8, x4 + adcs x10, x16, x8 + eor x8, x22, x21 + adcs x16, x11, x16 + mul x22, x7, x17 + eor x17, x1, x15 + adc x1, xzr, x11 + adds x11, x24, x5 + eor x7, x3, x21 + adcs x3, x10, x23 + adcs x24, x16, x24 + adcs x4, x1, x10 + eor x10, x22, x15 + adcs x16, xzr, x16 + adc x1, xzr, x1 + cmn x21, #0x1 + adcs x8, x4, x8 + adcs x22, x16, x7 + adc x7, x1, x21 + subs x21, x19, x12 + csetm x4, cc + cneg x1, x21, cc + subs x21, x13, x2 + cinv x16, x4, cc + cneg x4, x21, cc + cmn x15, #0x1 + adcs x21, x23, x10 + mul x23, x1, x4 + adcs x11, x11, x17 + adcs x3, x3, x15 + umulh x1, x1, x4 + adcs x24, x24, x15 + adcs x8, x8, x15 + adcs x22, x22, x15 + eor x17, x23, x16 + adc x15, x7, x15 + subs x7, x20, x14 + cneg x7, x7, cc + csetm x4, cc + subs x10, x20, x12 + cneg x23, x10, cc + csetm x10, cc + subs x12, x6, x9 + cinv x20, x4, cc + cneg x12, x12, cc + cmn x16, #0x1 + eor x1, x1, x16 + adcs x17, x24, x17 + mul x4, x7, x12 + adcs x8, x8, x1 + 
umulh x1, x7, x12 + adcs x24, x22, x16 + adc x7, x15, x16 + subs x12, x13, x9 + cneg x12, x12, cc + cinv x13, x10, cc + subs x19, x19, x14 + mul x9, x23, x12 + cneg x19, x19, cc + csetm x10, cc + eor x16, x1, x20 + subs x22, x6, x2 + umulh x12, x23, x12 + eor x1, x4, x20 + cinv x4, x10, cc + cneg x22, x22, cc + cmn x20, #0x1 + adcs x15, x11, x1 + eor x6, x12, x13 + adcs x10, x3, x16 + adcs x17, x17, x20 + eor x23, x9, x13 + adcs x2, x8, x20 + mul x11, x19, x22 + adcs x24, x24, x20 + adc x7, x7, x20 + cmn x13, #0x1 + adcs x3, x10, x23 + umulh x22, x19, x22 + adcs x17, x17, x6 + eor x12, x22, x4 + extr x22, x15, x21, #63 + adcs x8, x2, x13 + extr x21, x21, x5, #63 + ldp x16, x23, [x0] + adcs x20, x24, x13 + eor x1, x11, x4 + adc x6, x7, x13 + cmn x4, #0x1 + ldp x2, x7, [x0, #16] + adcs x1, x3, x1 + extr x19, x1, x15, #63 + adcs x14, x17, x12 + extr x1, x14, x1, #63 + lsl x17, x5, #1 + adcs x8, x8, x4 + extr x12, x8, x14, #8 + ldp x15, x11, [x0, #32] + adcs x9, x20, x4 + adc x3, x6, x4 + adds x16, x12, x16 + extr x6, x9, x8, #8 + ldp x14, x12, [x0, #48] + extr x8, x3, x9, #8 + adcs x20, x6, x23 + ldr x24, [x0, #64] + lsr x6, x3, #8 + adcs x8, x8, x2 + and x2, x1, #0x1ff + and x1, x20, x8 + adcs x4, x6, x7 + adcs x3, x17, x15 + and x1, x1, x4 + adcs x9, x21, x11 + and x1, x1, x3 + adcs x6, x22, x14 + and x1, x1, x9 + and x21, x1, x6 + adcs x14, x19, x12 + adc x1, x24, x2 + cmp xzr, xzr + orr x12, x1, #0xfffffffffffffe00 + lsr x1, x1, #9 + adcs xzr, x16, x1 + and x21, x21, x14 + adcs xzr, x21, xzr + adcs xzr, x12, xzr + adcs x21, x16, x1 + adcs x1, x20, xzr + adcs x19, x8, xzr + stp x21, x1, [x0] + adcs x1, x4, xzr + adcs x21, x3, xzr + stp x19, x1, [x0, #16] + adcs x1, x9, xzr + stp x21, x1, [x0, #32] + adcs x21, x6, xzr + adcs x1, x14, xzr + stp x21, x1, [x0, #48] + adc x1, x12, xzr + and x1, x1, #0x1ff + str x1, [x0, #64] + +// Restore regs and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif
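
As a usage illustration (not part of the patch itself), the short C sketch below calls the new squaring routine through the prototype quoted in the header of bignum_sqr_p521_neon.S. It assumes an AArch64 build linked against the s2n-bignum library; the chosen input is small enough that x^2 < p_521 = 2^521 - 1, so the expected limbs can be checked by hand without any modular folding.

/* Hedged caller sketch, assuming the s2n-bignum library is linked on an
 * AArch64 target. Numbers are 9 little-endian 64-bit limbs (576 bits),
 * holding values below p_521 = 2^521 - 1. */
#include <stdint.h>
#include <stdio.h>

extern void bignum_sqr_p521_neon(uint64_t z[static 9], uint64_t x[static 9]);

int main(void) {
    /* x = 2^260 + 3, so x^2 = 2^520 + 6*2^260 + 9, which is below p_521. */
    uint64_t x[9] = {3, 0, 0, 0, 1ULL << 4, 0, 0, 0, 0};
    uint64_t z[9] = {0};

    bignum_sqr_p521_neon(z, x);

    /* Expected: z[0] = 0x9, z[4] = 0x60, z[8] = 0x100, all other limbs 0. */
    for (int i = 8; i >= 0; i--)
        printf("%016llx", (unsigned long long)z[i]);
    printf("\n");
    return 0;
}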