diff --git a/arm/p256/bignum_montinv_p256.S b/arm/p256/bignum_montinv_p256.S index 059f77e9af5..1a5a7a0ffc4 100644 --- a/arm/p256/bignum_montinv_p256.S +++ b/arm/p256/bignum_montinv_p256.S @@ -820,9 +820,9 @@ S2N_BN_SYMBOL(bignum_montinv_p256): mov i, #10 mov d, #1 - b midloop + b bignum_montinv_p256_midloop -loop: +bignum_montinv_p256_loop: // Separate the matrix elements into sign-magnitude pairs @@ -1137,7 +1137,7 @@ loop: stp x1, x3, [v] stp x2, x5, [v+16] -midloop: +bignum_montinv_p256_midloop: mov x1, d ldr x2, [f] @@ -1148,7 +1148,7 @@ midloop: // Next iteration subs i, i, #1 - bne loop + bne bignum_montinv_p256_loop // The 10th and last iteration does not need anything except the // u value and the sign of f; the latter can be obtained from the diff --git a/arm/p256/p256_montjscalarmul.S b/arm/p256/p256_montjscalarmul.S index 23bc20971e8..246421ff37d 100644 --- a/arm/p256/p256_montjscalarmul.S +++ b/arm/p256/p256_montjscalarmul.S @@ -56,6 +56,31 @@ #define NSPACE #(31*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmp x14, #(1*I); \ + ldp x12, x13, [x15]; \ + csel x0, x12, x0, eq; \ + csel x1, x13, x1, eq; \ + ldp x12, x13, [x15, #16]; \ + csel x2, x12, x2, eq; \ + csel x3, x13, x3, eq; \ + ldp x12, x13, [x15, #32]; \ + csel x4, x12, x4, eq; \ + csel x5, x13, x5, eq; \ + ldp x12, x13, [x15, #48]; \ + csel x6, x12, x6, eq; \ + csel x7, x13, x7, eq; \ + ldp x12, x13, [x15, #64]; \ + csel x8, x12, x8, eq; \ + csel x9, x13, x9, eq; \ + ldp x12, x13, [x15, #80]; \ + csel x10, x12, x10, eq; \ + csel x11, x13, x11, eq; \ + add x15, x15, #96 + // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ @@ -173,34 +198,34 @@ S2N_BN_SYMBOL(p256_montjscalarmul): add x0, tab+96*1 add x1, tab - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, tab+96*2 add x1, tab+96*1 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_p256_montjadd add x0, tab+96*3 add x1, tab+96*1 - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, tab+96*4 add x1, tab+96*3 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_p256_montjadd add x0, tab+96*5 add x1, tab+96*2 - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, tab+96*6 add x1, tab+96*5 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_p256_montjadd add x0, tab+96*7 add x1, tab+96*3 - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble // Initialize the accumulator as a table entry for top 4 bits (unrecoded) @@ -221,30 +246,15 @@ S2N_BN_SYMBOL(p256_montjscalarmul): mov x11, xzr add x15, tab - .set i, 1 -.rep 8 - cmp x14, #i - ldp x12, x13, [x15] - csel x0, x12, x0, eq - csel x1, x13, x1, eq - ldp x12, x13, [x15, #16] - csel x2, x12, x2, eq - csel x3, x13, x3, eq - ldp x12, x13, [x15, #32] - csel x4, x12, x4, eq - csel x5, x13, x5, eq - ldp x12, x13, [x15, #48] - csel x6, x12, x6, eq - csel x7, x13, x7, eq - ldp x12, x13, [x15, #64] - csel x8, x12, x8, eq - csel x9, x13, x9, eq - ldp x12, x13, [x15, #80] - csel x10, x12, x10, eq - csel x11, x13, x11, eq - add x15, x15, #96 - .set i, (i+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + stp x0, x1, [acc] stp x2, x3, [acc+16] stp x4, x5, [acc+32] @@ -256,24 +266,24 @@ S2N_BN_SYMBOL(p256_montjscalarmul): // Main loop over size-4 bitfields: double 4 times then add signed digit 
-loop: +p256_montjscalarmul_mainloop: sub j, j, #4 add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble lsr x2, j, #6 ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly @@ -299,30 +309,15 @@ loop: mov x10, xzr mov x11, xzr add x15, tab - .set i, 1 -.rep 8 - cmp x14, #i - ldp x12, x13, [x15] - csel x0, x12, x0, eq - csel x1, x13, x1, eq - ldp x12, x13, [x15, #16] - csel x2, x12, x2, eq - csel x3, x13, x3, eq - ldp x12, x13, [x15, #32] - csel x4, x12, x4, eq - csel x5, x13, x5, eq - ldp x12, x13, [x15, #48] - csel x6, x12, x6, eq - csel x7, x13, x7, eq - ldp x12, x13, [x15, #64] - csel x8, x12, x8, eq - csel x9, x13, x9, eq - ldp x12, x13, [x15, #80] - csel x10, x12, x10, eq - csel x11, x13, x11, eq - add x15, x15, #96 - .set i, (i+1) -.endr + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) // Store it to "tabent" with the y coordinate optionally negated // Again, do it carefully to give coordinates < p_256 even in @@ -357,9 +352,9 @@ loop: add x0, acc add x1, acc add x2, tabent - bl local_p256_montjadd + bl p256_montjscalarmul_p256_montjadd - cbnz j, loop + cbnz j, p256_montjscalarmul_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. @@ -386,7 +381,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p256_montjadd: +p256_montjscalarmul_p256_montjadd: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! @@ -3506,7 +3501,7 @@ local_p256_montjadd: ldp x19, x20, [sp], #16 ret -local_p256_montjdouble: +p256_montjscalarmul_p256_montjdouble: sub sp, sp, #0x110 stp x19, x20, [sp, #192] stp x21, x22, [sp, #208] diff --git a/arm/p256/p256_montjscalarmul_alt.S b/arm/p256/p256_montjscalarmul_alt.S index 0e453f5bae2..8ac5806a725 100644 --- a/arm/p256/p256_montjscalarmul_alt.S +++ b/arm/p256/p256_montjscalarmul_alt.S @@ -56,6 +56,31 @@ #define NSPACE #(31*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. 
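As a rough functional model of the selectblock macro defined next (and of its identical copy in p256_montjscalarmul.S above): it scans all eight 96-byte table entries and keeps the one whose index equals the secret digit, with no digit-dependent control flow. The C below is only a sketch of that data flow with hypothetical names (p256_jac_words, select_entry) that are not part of the patch; the cmp/csel chain in the assembly is what actually provides the branch-free guarantee.

#include <stdint.h>

typedef struct { uint64_t w[12]; } p256_jac_words;   /* one 96-byte table entry */

/* Hypothetical helper, not part of the patch: functional model of
   selectblock(1)..selectblock(8). digit is 1..8; digit == 0 keeps the
   all-zero value set up beforehand (the mov ...,xzr instructions). */
static void select_entry(p256_jac_words *out,
                         const p256_jac_words table[8], uint64_t digit)
{
    for (int i = 0; i < 12; i++) out->w[i] = 0;
    for (uint64_t k = 1; k <= 8; k++) {
        /* mask is all-ones exactly when digit == k, all-zeros otherwise */
        uint64_t mask = (uint64_t)0 - (uint64_t)(digit == k);
        for (int i = 0; i < 12; i++)
            out->w[i] = (table[k - 1].w[i] & mask) | (out->w[i] & ~mask);
    }
}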
+ +#define selectblock(I) \ + cmp x14, #(1*I); \ + ldp x12, x13, [x15]; \ + csel x0, x12, x0, eq; \ + csel x1, x13, x1, eq; \ + ldp x12, x13, [x15, #16]; \ + csel x2, x12, x2, eq; \ + csel x3, x13, x3, eq; \ + ldp x12, x13, [x15, #32]; \ + csel x4, x12, x4, eq; \ + csel x5, x13, x5, eq; \ + ldp x12, x13, [x15, #48]; \ + csel x6, x12, x6, eq; \ + csel x7, x13, x7, eq; \ + ldp x12, x13, [x15, #64]; \ + csel x8, x12, x8, eq; \ + csel x9, x13, x9, eq; \ + ldp x12, x13, [x15, #80]; \ + csel x10, x12, x10, eq; \ + csel x11, x13, x11, eq; \ + add x15, x15, #96 + // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ @@ -173,34 +198,34 @@ S2N_BN_SYMBOL(p256_montjscalarmul_alt): add x0, tab+96*1 add x1, tab - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, tab+96*2 add x1, tab+96*1 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_alt_p256_montjadd add x0, tab+96*3 add x1, tab+96*1 - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, tab+96*4 add x1, tab+96*3 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_alt_p256_montjadd add x0, tab+96*5 add x1, tab+96*2 - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, tab+96*6 add x1, tab+96*5 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_alt_p256_montjadd add x0, tab+96*7 add x1, tab+96*3 - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble // Initialize the accumulator as a table entry for top 4 bits (unrecoded) @@ -221,30 +246,15 @@ S2N_BN_SYMBOL(p256_montjscalarmul_alt): mov x11, xzr add x15, tab - .set i, 1 -.rep 8 - cmp x14, #i - ldp x12, x13, [x15] - csel x0, x12, x0, eq - csel x1, x13, x1, eq - ldp x12, x13, [x15, #16] - csel x2, x12, x2, eq - csel x3, x13, x3, eq - ldp x12, x13, [x15, #32] - csel x4, x12, x4, eq - csel x5, x13, x5, eq - ldp x12, x13, [x15, #48] - csel x6, x12, x6, eq - csel x7, x13, x7, eq - ldp x12, x13, [x15, #64] - csel x8, x12, x8, eq - csel x9, x13, x9, eq - ldp x12, x13, [x15, #80] - csel x10, x12, x10, eq - csel x11, x13, x11, eq - add x15, x15, #96 - .set i, (i+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + stp x0, x1, [acc] stp x2, x3, [acc+16] stp x4, x5, [acc+32] @@ -256,24 +266,24 @@ S2N_BN_SYMBOL(p256_montjscalarmul_alt): // Main loop over size-4 bitfields: double 4 times then add signed digit -loop: +p256_montjscalarmul_alt_mainloop: sub j, j, #4 add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble lsr x2, j, #6 ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly @@ -299,30 +309,15 @@ loop: mov x10, xzr mov x11, xzr add x15, tab - .set i, 1 -.rep 8 - cmp x14, #i - ldp x12, x13, [x15] - csel x0, x12, x0, eq - csel x1, x13, x1, eq - ldp x12, x13, [x15, #16] - csel x2, x12, x2, eq - csel x3, x13, x3, eq - ldp x12, x13, [x15, #32] - csel x4, x12, x4, eq - csel x5, x13, x5, eq - ldp x12, x13, [x15, #48] - csel x6, x12, x6, eq - csel x7, x13, x7, eq - ldp x12, x13, [x15, #64] - csel x8, x12, x8, eq - csel x9, x13, x9, eq - ldp x12, x13, [x15, #80] - csel x10, x12, x10, eq - csel x11, x13, x11, eq - add x15, x15, #96 - .set i, 
(i+1) -.endr + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) // Store it to "tabent" with the y coordinate optionally negated // Again, do it carefully to give coordinates < p_256 even in @@ -357,9 +352,9 @@ loop: add x0, acc add x1, acc add x2, tabent - bl local_p256_montjadd + bl p256_montjscalarmul_alt_p256_montjadd - cbnz j, loop + cbnz j, p256_montjscalarmul_alt_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. @@ -386,7 +381,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p256_montjadd: +p256_montjscalarmul_alt_p256_montjadd: sub sp, sp, #0xe0 mov x15, x0 mov x16, x1 @@ -2316,7 +2311,7 @@ local_p256_montjadd: add sp, sp, #0xe0 ret -local_p256_montjdouble: +p256_montjscalarmul_alt_p256_montjdouble: sub sp, sp, #0xc0 mov x15, x0 mov x16, x1 diff --git a/arm/p384/Makefile b/arm/p384/Makefile index 138a0da5c77..5d64426750c 100644 --- a/arm/p384/Makefile +++ b/arm/p384/Makefile @@ -28,11 +28,13 @@ OBJ = bignum_add_p384.o \ bignum_demont_p384.o \ bignum_double_p384.o \ bignum_half_p384.o \ + bignum_inv_p384.o \ bignum_littleendian_6.o \ bignum_mod_n384.o \ bignum_mod_n384_6.o \ bignum_mod_p384.o \ bignum_mod_p384_6.o \ + bignum_montinv_p384.o \ bignum_montmul_p384.o \ bignum_montmul_p384_alt.o \ bignum_montmul_p384_neon.o \ diff --git a/arm/p384/bignum_inv_p384.S b/arm/p384/bignum_inv_p384.S new file mode 100644 index 00000000000..085224172ea --- /dev/null +++ b/arm/p384/bignum_inv_p384.S @@ -0,0 +1,1469 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +// +// extern void bignum_inv_p384(uint64_t z[static 6],uint64_t x[static 6]); +// +// If the 6-digit input x is coprime to p_384, i.e. is not divisible +// by it, returns z < p_384 such that x * z == 1 (mod p_384). Note that +// x does not need to be reduced modulo p_384, but the output always is. +// If the input is divisible (i.e. is 0 or p_384), then there can be no +// modular inverse and z = 0 is returned. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p384) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack +// The u and v variables are 6 words each as expected, but the f and g +// variables are 8 words each -- they need to have at least one extra +// word for a sign word, and to preserve alignment we "round up" to 8. +// In fact, we currently keep an extra word in u and v as well. + +#define f sp, #0 +#define g sp, #(8*N) +#define u sp, #(16*N) +#define v sp, #(24*N) + +// Total size to reserve on the stack + +#define NSPACE #(32*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5d4;d3;d2;d1], adding +// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary +// internally as well as t0, t1, t2. This is almost-Montgomery, i.e. the +// result fits in 6 digits but is not necessarily strictly reduced mod p_384. +// --------------------------------------------------------------------------- + +#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ +/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_384. */ \ + mov t1, #0xe000000000000000; \ + adds d0, d0, t1; \ + mov t2, #0x000000001fffffff; \ + adcs d1, d1, t2; \ + mov t3, #0xffffffffe0000000; \ + bic t3, t3, #0x2000000000000000; \ + adcs d2, d2, t3; \ + sbcs d3, d3, xzr; \ + sbcs d4, d4, xzr; \ + sbcs d5, d5, xzr; \ + mov t1, #0x1fffffffffffffff; \ + adc d6, d6, t1; \ +/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ +/* Store it back into d0 since we no longer need that digit. 
*/ \ + add d0, d0, d0, lsl #32; \ +/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ +/* We know the lowest word will cancel d0 so we don't need it */ \ + mov t1, #0xffffffff00000001; \ + umulh t1, t1, d0; \ + mov t2, #0x00000000ffffffff; \ + mul t3, t2, d0; \ + umulh t2, t2, d0; \ + adds t1, t1, t3; \ + adcs t2, t2, d0; \ + cset t3, cs; \ +/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ +/* We catch the net top carry from add-subtract in the digit d0 */ \ + adds d6, d6, d0; \ + cset d0, cs; \ + subs d1, d1, t1; \ + sbcs d2, d2, t2; \ + sbcs d3, d3, t3; \ + sbcs d4, d4, xzr; \ + sbcs d5, d5, xzr; \ + sbcs d6, d6, xzr; \ + sbcs d0, d0, xzr; \ +/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ + neg d0, d0; \ + and t1, d0, #0x00000000ffffffff; \ + and t2, d0, #0xffffffff00000000; \ + and t3, d0, #0xfffffffffffffffe; \ + subs d1, d1, t1; \ + sbcs d2, d2, t2; \ + sbcs d3, d3, t3; \ + sbcs d4, d4, d0; \ + sbcs d5, d5, d0; \ + sbc d6, d6, d0 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr 
x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x8, x4, #0x100, lsl #12; \ + sbfx x8, x8, #21, #21; \ + mov x11, #0x100000; \ + add x11, x11, x11, lsl #21; \ + add x9, x4, x11; \ + asr x9, x9, #42; \ + add x10, x5, #0x100, lsl #12; \ + sbfx x10, x10, #21, #21; \ + add x11, x5, x11; \ + asr x11, x11, #42; \ + mul x6, x8, x2; \ + mul x7, x9, x3; \ + mul x2, x10, x2; \ + mul x3, x11, x3; \ + add x4, x6, x7; \ + add x5, x2, x3; \ + asr x2, x4, #20; \ + asr x3, x5, #20; \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst 
x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x12, x4, #0x100, lsl #12; \ + sbfx x12, x12, #21, #21; \ + mov x15, #0x100000; \ + add x15, x15, x15, lsl #21; \ + add x13, x4, x15; \ + asr x13, x13, #42; \ + add x14, x5, #0x100, lsl #12; \ + sbfx x14, x14, #21, #21; \ + add x15, x5, x15; \ + asr x15, x15, #42; \ + mul x6, x12, x2; \ + mul x7, x13, x3; \ + mul x2, x14, x2; \ + mul x3, x15, x3; \ + add x4, x6, x7; \ + add x5, x2, x3; \ + asr x2, x4, #20; \ + asr x3, x5, #20; \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + 
add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + mul x2, x12, x8; \ + mul x3, x12, x9; \ + mul x6, x14, x8; \ + mul x7, x14, x9; \ + madd x8, x13, x10, x2; \ + madd x9, x13, x11, x3; \ + madd x16, x15, x10, x6; \ + madd x17, x15, x11, x7; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, 
x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x12, x4, #0x100, lsl #12; \ + sbfx x12, x12, #22, #21; \ + mov x15, #0x100000; \ + add x15, x15, x15, lsl #21; \ + add x13, x4, x15; \ + asr x13, x13, #43; \ + add x14, x5, #0x100, lsl #12; \ + sbfx x14, x14, #22, #21; \ + add x15, x5, x15; \ + asr x15, x15, #43; \ + mneg x2, x12, x8; \ + mneg x3, x12, x9; \ + mneg x4, x14, x8; \ + mneg x5, x14, x9; \ + msub m00, x13, x16, x2; \ + msub m01, x13, x17, x3; \ + msub m10, x15, x16, x4; \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_inv_p384): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0x00000000ffffffff + mov x11, #0xffffffff00000000 + mov x12, #0xfffffffffffffffe + mov x15, #0xffffffffffffffff + stp x10, x11, [f] + stp x12, x15, [f+2*N] + stp x15, x15, [f+4*N] + str xzr, [f+6*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, x12 + sbcs x13, x5, x15 + ldp x6, x7, [x1, #(4*N)] + sbcs x14, x6, x15 + sbcs x15, x7, x15 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + csel x6, x6, x14, cc + csel x7, x7, x15, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + stp x6, x7, [g+4*N] + str xzr, [g+6*N] + +// Also maintain reduced < 2^384 vector [u,v] such that +// [f,g] == x * 2^{5*i-75} * [u,v] (mod p_384) +// starting with [p_384,x] == x * 2^{5*0-75} * [0,2^75] (mod p_384) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + stp xzr, xzr, [u+4*N] + + mov x10, #2048 + stp xzr, x10, [v] + stp xzr, xzr, [v+2*N] + stp xzr, xzr, [v+4*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special fifteenth iteration after a uniform +// first 14. + + mov i, #15 + mov d, #1 + b midloop + +loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
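Before the digit-by-digit accumulation that follows, the update it implements can be stated in a few lines of conceptual C. Here __int128 values stand in for the 7-word signed f and g; the real code works in 64-bit limbs with sign-magnitude operands (m00/s00 etc.) and the 2-word carry described above, and the division by 2^59 is exact because the 59 divsteps construct the matrix that way. The names below (wide_t, update_fg) are illustrative only and do not appear in the patch.

#include <stdint.h>

typedef __int128 wide_t;                /* stand-in for the multi-word f and g */

/* Hypothetical sketch, not part of the patch */
static void update_fg(wide_t *f, wide_t *g,
                      int64_t m00, int64_t m01, int64_t m10, int64_t m11)
{
    wide_t fnew = (wide_t)m00 * *f + (wide_t)m01 * *g;
    wide_t gnew = (wide_t)m10 * *f + (wide_t)m11 * *g;
    *f = fnew / ((wide_t)1 << 59);      /* exact division: low 59 bits are zero */
    *g = gnew / ((wide_t)1 << 59);
}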
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digit 3 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + +// Digit 4 of [f,g] + + ldr x7, [f+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [g+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [g+3*N] + +// Digits 5 and 6 of [f,g] + + ldr x7, [f+5*N] + eor x1, x7, s00 + ldr x23, [f+6*N] + eor x2, x23, s00 + and x2, x2, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [g+5*N] + eor x1, x8, s01 + ldr x24, [g+6*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [f+4*N] + extr x4, x2, x4, #59 + str x4, [f+5*N] + asr x2, x2, #59 + str x2, [f+6*N] + + eor x1, x7, s10 + eor x4, x23, s10 + and x4, x4, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x4, x4, x1 + extr x6, x5, x6, #59 + str x6, [g+4*N] + extr x5, x4, x5, #59 + str x5, [g+5*N] + asr x4, x4, #59 + str x4, [g+6*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. 
A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digit 3 of [u,v] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + str x2, [v+3*N] + adc x6, x6, x1 + +// Digit 4 of [u,v] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + str x6, [v+4*N] + adc x5, x5, x1 + +// Digits 5 and 6 of [u,v] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + + eor x1, x7, s10 + and x4, s10, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v+5*N] + adc x4, x4, x1 + str x4, [v+6*N] + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldp x2, x3, [u+16] + ldp x4, x5, [u+32] + ldr x6, [u+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [u] + stp x3, x4, [u+16] + stp x5, x6, [u+32] + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + ldp x4, x5, [v+32] + ldr x6, [v+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [v] + stp x3, x4, [v+16] + stp x5, x6, [v+32] + +midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + 
divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne loop + +// The 15th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_384) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digit 3 of [u] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + +// Digit 4 of [u] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + +// Digits 5 and 6 of [u] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + +// Montgomery reduction of u. 
This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_384 + + ldp x10, x0, [u] + ldp x1, x2, [u+16] + ldp x3, x4, [u+32] + ldr x5, [u+48] + amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) + + mov x10, #0x00000000ffffffff + subs x10, x0, x10 + mov x11, #0xffffffff00000000 + sbcs x11, x1, x11 + mov x12, #0xfffffffffffffffe + sbcs x12, x2, x12 + mov x15, #0xffffffffffffffff + sbcs x13, x3, x15 + sbcs x14, x4, x15 + sbcs x15, x5, x15 + + csel x0, x0, x10, cc + csel x1, x1, x11, cc + csel x2, x2, x12, cc + csel x3, x3, x13, cc + csel x4, x4, x14, cc + csel x5, x5, x15, cc + +// Store it back to the final output + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x4, x5, [res, #32] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p384/bignum_montinv_p384.S b/arm/p384/bignum_montinv_p384.S new file mode 100644 index 00000000000..79d59781196 --- /dev/null +++ b/arm/p384/bignum_montinv_p384.S @@ -0,0 +1,1487 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +// +// extern void bignum_montinv_p384(uint64_t z[static 6],uint64_t x[static 6]); +// +// If the 6-digit input x is coprime to p_384, i.e. is not divisible +// by it, returns z < p_384 such that x * z == 2^768 (mod p_384). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^384 * X and z == 2^384 * Z +// (both mod p_384) then X * Z == 1 (mod p_384). That is, this function +// gives the analog of the modular inverse bignum_inv_p384 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_384, but the output always is. If the input +// is divisible (i.e. is 0 or p_384), then there can be no solution to +// the congruence x * z == 2^768 (mod p_384), and z = 0 is returned. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p384) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack +// The u and v variables are 6 words each as expected, but the f and g +// variables are 8 words each -- they need to have at least one extra +// word for a sign word, and to preserve alignment we "round up" to 8. 
+// In fact, we currently keep an extra word in u and v as well. + +#define f sp, #0 +#define g sp, #(8*N) +#define u sp, #(16*N) +#define v sp, #(24*N) + +// Total size to reserve on the stack + +#define NSPACE #(32*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5d4;d3;d2;d1], adding +// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary +// internally as well as t0, t1, t2. This is almost-Montgomery, i.e. the +// result fits in 6 digits but is not necessarily strictly reduced mod p_384. +// --------------------------------------------------------------------------- + +#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ +/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_384. */ \ + mov t1, #0xe000000000000000; \ + adds d0, d0, t1; \ + mov t2, #0x000000001fffffff; \ + adcs d1, d1, t2; \ + mov t3, #0xffffffffe0000000; \ + bic t3, t3, #0x2000000000000000; \ + adcs d2, d2, t3; \ + sbcs d3, d3, xzr; \ + sbcs d4, d4, xzr; \ + sbcs d5, d5, xzr; \ + mov t1, #0x1fffffffffffffff; \ + adc d6, d6, t1; \ +/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ +/* Store it back into d0 since we no longer need that digit. */ \ + add d0, d0, d0, lsl #32; \ +/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ +/* We know the lowest word will cancel d0 so we don't need it */ \ + mov t1, #0xffffffff00000001; \ + umulh t1, t1, d0; \ + mov t2, #0x00000000ffffffff; \ + mul t3, t2, d0; \ + umulh t2, t2, d0; \ + adds t1, t1, t3; \ + adcs t2, t2, d0; \ + cset t3, cs; \ +/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ +/* We catch the net top carry from add-subtract in the digit d0 */ \ + adds d6, d6, d0; \ + cset d0, cs; \ + subs d1, d1, t1; \ + sbcs d2, d2, t2; \ + sbcs d3, d3, t3; \ + sbcs d4, d4, xzr; \ + sbcs d5, d5, xzr; \ + sbcs d6, d6, xzr; \ + sbcs d0, d0, xzr; \ +/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ + neg d0, d0; \ + and t1, d0, #0x00000000ffffffff; \ + and t2, d0, #0xffffffff00000000; \ + and t3, d0, #0xfffffffffffffffe; \ + subs d1, d1, t1; \ + sbcs d2, d2, t2; \ + sbcs d3, d3, t3; \ + sbcs d4, d4, d0; \ + sbcs d5, d5, d0; \ + sbc d6, d6, d0 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, 
xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x8, x4, #0x100, lsl #12; \ + sbfx x8, x8, #21, #21; \ + mov x11, #0x100000; \ + add x11, x11, x11, lsl #21; \ + add x9, x4, x11; \ + asr x9, x9, #42; \ + add x10, x5, #0x100, lsl #12; \ + sbfx x10, x10, #21, #21; \ + add x11, x5, x11; \ + asr x11, x11, #42; \ + mul x6, x8, x2; \ + mul x7, x9, x3; \ + mul x2, x10, x2; \ + mul x3, x11, x3; \ + add x4, x6, x7; \ + add x5, x2, x3; \ + asr x2, x4, #20; \ + asr x3, x5, #20; \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, 
ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x12, x4, #0x100, lsl #12; \ + sbfx x12, x12, #21, #21; \ + mov x15, #0x100000; \ + add x15, x15, x15, lsl #21; \ + add x13, x4, x15; \ + asr x13, x13, #42; \ + add x14, x5, #0x100, lsl #12; \ + sbfx x14, x14, #21, #21; \ + add x15, x5, x15; \ + asr x15, x15, #42; \ + mul x6, x12, x2; \ + mul x7, x13, x3; \ + mul x2, x14, x2; \ + mul x3, x15, x3; \ + add x4, x6, x7; \ + add x5, x2, x3; \ + asr x2, x4, #20; \ + asr x3, x5, #20; \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, 
#1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + mul x2, x12, x8; \ + mul x3, x12, x9; \ + mul x6, x14, x8; \ + mul x7, x14, x9; \ + madd x8, x13, x10, x2; \ + madd x9, x13, x11, x3; \ + madd x16, x15, x10, x6; \ + madd x17, x15, x11, x7; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x12, x4, #0x100, lsl #12; \ + sbfx x12, x12, #22, #21; \ + mov x15, #0x100000; \ + add x15, x15, x15, lsl #21; \ + add x13, x4, x15; \ + asr x13, x13, #43; \ + add x14, x5, #0x100, lsl #12; \ + sbfx x14, x14, #22, #21; \ + add x15, x5, x15; \ + asr x15, x15, #43; \ + mneg x2, x12, x8; \ + mneg x3, x12, x9; \ + mneg x4, x14, x8; \ + mneg x5, x14, x9; \ + msub m00, x13, x16, x2; \ + msub m01, x13, x17, x3; \ + msub m10, x15, x16, x4; \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_montinv_p384): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! 
+ sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0x00000000ffffffff + mov x11, #0xffffffff00000000 + mov x12, #0xfffffffffffffffe + mov x15, #0xffffffffffffffff + stp x10, x11, [f] + stp x12, x15, [f+2*N] + stp x15, x15, [f+4*N] + str xzr, [f+6*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, x12 + sbcs x13, x5, x15 + ldp x6, x7, [x1, #(4*N)] + sbcs x14, x6, x15 + sbcs x15, x7, x15 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + csel x6, x6, x14, cc + csel x7, x7, x15, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + stp x6, x7, [g+4*N] + str xzr, [g+6*N] + +// Also maintain reduced < 2^384 vector [u,v] such that +// [f,g] == x * 2^{5*i-843} * [u,v] (mod p_384) +// starting with [p_384,x] == x * 2^{5*0-843} * [0,2^843] (mod p_384) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 15th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{75-843} * u == 1, i.e. +// x * u == 2^768 as required. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + stp xzr, xzr, [u+4*N] + +// The starting constant 2^843 mod p_384 is +// 0x0000000000000800:00001000000007ff:fffff00000000000 +// :00001000000007ff:fffff00000000800:0000000000000000 +// where colons separate 64-bit subwords, least significant at the right. +// Not all of these are single loads on ARM so this is a bit dynamic + + mov x12, #0xfffff00000000000 + orr x10, x12, #0x0000000000000800 + stp xzr, x10, [v] + mov x11, #0x00000000000007ff + orr x11, x11, #0x0000100000000000 + stp x11, x12, [v+2*N] + mov x12, #0x0000000000000800 + stp x11, x12, [v+4*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special fifteenth iteration after a uniform +// first 14. + + mov i, #15 + mov d, #1 + b bignum_montinv_p384_midloop + +bignum_montinv_p384_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
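A couple of small Python checks may help when reading the next blocks (p_384, v_words and the helper name update_fg exist only for this sketch; m00..m11 are the register aliases used above):

p_384 = 2**384 - 2**128 - 2**96 + 2**32 - 1

# The starting value stored into v above, least significant word first,
# matches the constant 2^843 mod p_384 quoted in the comment.
v_words = [0x0000000000000000, 0xfffff00000000800, 0x00001000000007ff,
           0xfffff00000000000, 0x00001000000007ff, 0x0000000000000800]
assert sum(w << (64 * i) for i, w in enumerate(v_words)) == pow(2, 843, p_384)

# Whole-number model of the digit-by-digit [f,g] update done below: apply
# the matrix from divstep59 and arithmetic-shift the results right by 59.
def update_fg(f, g, m00, m01, m10, m11):
    return ((m00 * f + m01 * g) >> 59,
            (m10 * f + m11 * g) >> 59)
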
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digit 3 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + +// Digit 4 of [f,g] + + ldr x7, [f+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [g+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [g+3*N] + +// Digits 5 and 6 of [f,g] + + ldr x7, [f+5*N] + eor x1, x7, s00 + ldr x23, [f+6*N] + eor x2, x23, s00 + and x2, x2, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [g+5*N] + eor x1, x8, s01 + ldr x24, [g+6*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [f+4*N] + extr x4, x2, x4, #59 + str x4, [f+5*N] + asr x2, x2, #59 + str x2, [f+6*N] + + eor x1, x7, s10 + eor x4, x23, s10 + and x4, x4, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x4, x4, x1 + extr x6, x5, x6, #59 + str x6, [g+4*N] + extr x5, x4, x5, #59 + str x5, [g+5*N] + asr x4, x4, #59 + str x4, [g+6*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. 
A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digit 3 of [u,v] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + str x2, [v+3*N] + adc x6, x6, x1 + +// Digit 4 of [u,v] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + str x6, [v+4*N] + adc x5, x5, x1 + +// Digits 5 and 6 of [u,v] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + + eor x1, x7, s10 + and x4, s10, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v+5*N] + adc x4, x4, x1 + str x4, [v+6*N] + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldp x2, x3, [u+16] + ldp x4, x5, [u+32] + ldr x6, [u+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [u] + stp x3, x4, [u+16] + stp x5, x6, [u+32] + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + ldp x4, x5, [v+32] + ldr x6, [v+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [v] + stp x3, x4, [v+16] + stp x5, x6, [v+32] + +bignum_montinv_p384_midloop: + + mov x1, d + ldr x2, 
[f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_montinv_p384_loop + +// The 15th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-768} [u,v] (mod p_384) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digit 3 of [u] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + +// Digit 4 of [u] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + +// Digits 5 and 6 of [u] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + +// Montgomery reduction of u. 
This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_384 + + ldp x10, x0, [u] + ldp x1, x2, [u+16] + ldp x3, x4, [u+32] + ldr x5, [u+48] + amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) + + mov x10, #0x00000000ffffffff + subs x10, x0, x10 + mov x11, #0xffffffff00000000 + sbcs x11, x1, x11 + mov x12, #0xfffffffffffffffe + sbcs x12, x2, x12 + mov x15, #0xffffffffffffffff + sbcs x13, x3, x15 + sbcs x14, x4, x15 + sbcs x15, x5, x15 + + csel x0, x0, x10, cc + csel x1, x1, x11, cc + csel x2, x2, x12, cc + csel x3, x3, x13, cc + csel x4, x4, x14, cc + csel x5, x5, x15, cc + +// Store it back to the final output + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x4, x5, [res, #32] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/arm/p384/p384_montjscalarmul.S b/arm/p384/p384_montjscalarmul.S index f2413477d3e..f73128c1441 100644 --- a/arm/p384/p384_montjscalarmul.S +++ b/arm/p384/p384_montjscalarmul.S @@ -57,6 +57,40 @@ #define NSPACE #(55*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmp bf, #(1*I); \ + ldp x20, x21, [x19]; \ + csel x0, x20, x0, eq; \ + csel x1, x21, x1, eq; \ + ldp x20, x21, [x19, #16]; \ + csel x2, x20, x2, eq; \ + csel x3, x21, x3, eq; \ + ldp x20, x21, [x19, #32]; \ + csel x4, x20, x4, eq; \ + csel x5, x21, x5, eq; \ + ldp x20, x21, [x19, #48]; \ + csel x6, x20, x6, eq; \ + csel x7, x21, x7, eq; \ + ldp x20, x21, [x19, #64]; \ + csel x8, x20, x8, eq; \ + csel x9, x21, x9, eq; \ + ldp x20, x21, [x19, #80]; \ + csel x10, x20, x10, eq; \ + csel x11, x21, x11, eq; \ + ldp x20, x21, [x19, #96]; \ + csel x12, x20, x12, eq; \ + csel x13, x21, x13, eq; \ + ldp x20, x21, [x19, #112]; \ + csel x14, x20, x14, eq; \ + csel x15, x21, x15, eq; \ + ldp x20, x21, [x19, #128]; \ + csel x16, x20, x16, eq; \ + csel x17, x21, x17, eq; \ + add x19, x19, #JACSIZE + // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ @@ -77,7 +111,7 @@ S2N_BN_SYMBOL(p384_montjscalarmul): mov res, x0 -// Reduce the input scalar mod n_256, i.e. conditionally subtract n_256. +// Reduce the input scalar mod n_384, i.e. conditionally subtract n_384. // Store it to "scalarb". 
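As a rough functional model of the selectblock expansion defined above (select_entry is a name made up for this sketch; in the assembly the choice is branch-free via csel and every table entry is loaded, so the memory access pattern does not depend on the secret index bf):

def select_entry(table, bf):
    # Returns table[bf-1] (a list of 64-bit words) when 1 <= bf <= len(table);
    # otherwise the previously zeroed accumulator is left unchanged, as with
    # the unrolled selectblock(1)..selectblock(16) sequence.
    acc = [0] * len(table[0])
    for i, entry in enumerate(table, start=1):
        for j, w in enumerate(entry):
            acc[j] = w if i == bf else acc[j]
    return acc
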
ldp x3, x4, [x1] @@ -132,70 +166,70 @@ S2N_BN_SYMBOL(p384_montjscalarmul): add x0, tab+JACSIZE*1 add x1, tab - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, tab+JACSIZE*2 add x1, tab+JACSIZE*1 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_p384_montjadd add x0, tab+JACSIZE*3 add x1, tab+JACSIZE*1 - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, tab+JACSIZE*4 add x1, tab+JACSIZE*3 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_p384_montjadd add x0, tab+JACSIZE*5 add x1, tab+JACSIZE*2 - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, tab+JACSIZE*6 add x1, tab+JACSIZE*5 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_p384_montjadd add x0, tab+JACSIZE*7 add x1, tab+JACSIZE*3 - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, tab+JACSIZE*8 add x1, tab+JACSIZE*7 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_p384_montjadd add x0, tab+JACSIZE*9 add x1, tab+JACSIZE*4 - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, tab+JACSIZE*10 add x1, tab+JACSIZE*9 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_p384_montjadd add x0, tab+JACSIZE*11 add x1, tab+JACSIZE*5 - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, tab+JACSIZE*12 add x1, tab+JACSIZE*11 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_p384_montjadd add x0, tab+JACSIZE*13 add x1, tab+JACSIZE*6 - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, tab+JACSIZE*14 add x1, tab+JACSIZE*13 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_p384_montjadd add x0, tab+JACSIZE*15 add x1, tab+JACSIZE*7 - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble // Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed // digits. The digits of the constant, in lowest-to-highest order, are as @@ -263,39 +297,23 @@ S2N_BN_SYMBOL(p384_montjscalarmul): add x19, tab - .set i, 1 -.rep 16 - cmp bf, #i - ldp x20, x21, [x19] - csel x0, x20, x0, eq - csel x1, x21, x1, eq - ldp x20, x21, [x19, #16] - csel x2, x20, x2, eq - csel x3, x21, x3, eq - ldp x20, x21, [x19, #32] - csel x4, x20, x4, eq - csel x5, x21, x5, eq - ldp x20, x21, [x19, #48] - csel x6, x20, x6, eq - csel x7, x21, x7, eq - ldp x20, x21, [x19, #64] - csel x8, x20, x8, eq - csel x9, x21, x9, eq - ldp x20, x21, [x19, #80] - csel x10, x20, x10, eq - csel x11, x21, x11, eq - ldp x20, x21, [x19, #96] - csel x12, x20, x12, eq - csel x13, x21, x13, eq - ldp x20, x21, [x19, #112] - csel x14, x20, x14, eq - csel x15, x21, x15, eq - ldp x20, x21, [x19, #128] - csel x16, x20, x16, eq - csel x17, x21, x17, eq - add x19, x19, #JACSIZE - .set i, (i+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + stp x0, x1, [acc] stp x2, x3, [acc+16] stp x4, x5, [acc+32] @@ -312,28 +330,28 @@ S2N_BN_SYMBOL(p384_montjscalarmul): // At each stage we shift the scalar left by 5 bits so we can simply pick // the top 5 bits as the bitfield, saving some fiddle over indexing. 
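The recoding trick referred to above can be sketched in a few lines of Python (signed_digits and the 16-window example are only illustrative; the real code works on the full 384-bit scalar in place and initializes the accumulator from the topmost window separately, as in the selection just above):

def signed_digits(scalar, nwindows):
    # Adding 16*(1 + 32 + 32^2 + ...) bumps every base-32 digit up by 16, so
    # subtracting 16 from each digit of the sum recovers the scalar with
    # signed digits in [-16, 15]; only multiples 1..16 of the point need
    # precomputed table entries.
    shifted = scalar + sum(16 << (5 * i) for i in range(nwindows))
    return [((shifted >> (5 * i)) & 31) - 16 for i in range(nwindows)]

k = 0x1234567890abcdef
digits = signed_digits(k, 16)
assert sum(d << (5 * i) for i, d in enumerate(digits)) == k
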
-loop: +p384_montjscalarmul_mainloop: sub j, j, #5 add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_p384_montjdouble // Choose the bitfield and adjust it to sign and magnitude @@ -377,40 +395,23 @@ loop: mov x17, xzr add x19, tab - .set i, 1 -.rep 16 - cmp bf, #i - ldp x20, x21, [x19] - csel x0, x20, x0, eq - csel x1, x21, x1, eq - ldp x20, x21, [x19, #16] - csel x2, x20, x2, eq - csel x3, x21, x3, eq - ldp x20, x21, [x19, #32] - csel x4, x20, x4, eq - csel x5, x21, x5, eq - ldp x20, x21, [x19, #48] - csel x6, x20, x6, eq - csel x7, x21, x7, eq - ldp x20, x21, [x19, #64] - csel x8, x20, x8, eq - csel x9, x21, x9, eq - ldp x20, x21, [x19, #80] - csel x10, x20, x10, eq - csel x11, x21, x11, eq - ldp x20, x21, [x19, #96] - csel x12, x20, x12, eq - csel x13, x21, x13, eq - ldp x20, x21, [x19, #112] - csel x14, x20, x14, eq - csel x15, x21, x15, eq - ldp x20, x21, [x19, #128] - csel x16, x20, x16, eq - csel x17, x21, x17, eq - add x19, x19, #JACSIZE - .set i, (i+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) // Store it to "tabent" with the y coordinate optionally negated. // This is done carefully to give coordinates < p_384 even in @@ -459,9 +460,9 @@ loop: add x0, acc add x1, acc add x2, tabent - bl local_p384_montjadd + bl p384_montjscalarmul_p384_montjadd - cbnz j, loop + cbnz j, p384_montjscalarmul_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. @@ -496,7 +497,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p384_montjadd: +p384_montjscalarmul_p384_montjadd: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! @@ -6352,7 +6353,7 @@ local_p384_montjadd: ldp x19, x20, [sp], #16 ret -local_p384_montjdouble: +p384_montjscalarmul_p384_montjdouble: sub sp, sp, #400 stp x19, x20, [sp, #336] stp x21, x22, [sp, #352] diff --git a/arm/p384/p384_montjscalarmul_alt.S b/arm/p384/p384_montjscalarmul_alt.S index 6e2dd56a8c3..9f47090a8c0 100644 --- a/arm/p384/p384_montjscalarmul_alt.S +++ b/arm/p384/p384_montjscalarmul_alt.S @@ -57,6 +57,40 @@ #define NSPACE #(55*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. 
+ +#define selectblock(I) \ + cmp bf, #(1*I); \ + ldp x20, x21, [x19]; \ + csel x0, x20, x0, eq; \ + csel x1, x21, x1, eq; \ + ldp x20, x21, [x19, #16]; \ + csel x2, x20, x2, eq; \ + csel x3, x21, x3, eq; \ + ldp x20, x21, [x19, #32]; \ + csel x4, x20, x4, eq; \ + csel x5, x21, x5, eq; \ + ldp x20, x21, [x19, #48]; \ + csel x6, x20, x6, eq; \ + csel x7, x21, x7, eq; \ + ldp x20, x21, [x19, #64]; \ + csel x8, x20, x8, eq; \ + csel x9, x21, x9, eq; \ + ldp x20, x21, [x19, #80]; \ + csel x10, x20, x10, eq; \ + csel x11, x21, x11, eq; \ + ldp x20, x21, [x19, #96]; \ + csel x12, x20, x12, eq; \ + csel x13, x21, x13, eq; \ + ldp x20, x21, [x19, #112]; \ + csel x14, x20, x14, eq; \ + csel x15, x21, x15, eq; \ + ldp x20, x21, [x19, #128]; \ + csel x16, x20, x16, eq; \ + csel x17, x21, x17, eq; \ + add x19, x19, #JACSIZE + // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ @@ -77,7 +111,7 @@ S2N_BN_SYMBOL(p384_montjscalarmul_alt): mov res, x0 -// Reduce the input scalar mod n_256, i.e. conditionally subtract n_256. +// Reduce the input scalar mod n_384, i.e. conditionally subtract n_384. // Store it to "scalarb". ldp x3, x4, [x1] @@ -132,70 +166,70 @@ S2N_BN_SYMBOL(p384_montjscalarmul_alt): add x0, tab+JACSIZE*1 add x1, tab - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, tab+JACSIZE*2 add x1, tab+JACSIZE*1 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_alt_p384_montjadd add x0, tab+JACSIZE*3 add x1, tab+JACSIZE*1 - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, tab+JACSIZE*4 add x1, tab+JACSIZE*3 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_alt_p384_montjadd add x0, tab+JACSIZE*5 add x1, tab+JACSIZE*2 - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, tab+JACSIZE*6 add x1, tab+JACSIZE*5 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_alt_p384_montjadd add x0, tab+JACSIZE*7 add x1, tab+JACSIZE*3 - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, tab+JACSIZE*8 add x1, tab+JACSIZE*7 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_alt_p384_montjadd add x0, tab+JACSIZE*9 add x1, tab+JACSIZE*4 - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, tab+JACSIZE*10 add x1, tab+JACSIZE*9 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_alt_p384_montjadd add x0, tab+JACSIZE*11 add x1, tab+JACSIZE*5 - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, tab+JACSIZE*12 add x1, tab+JACSIZE*11 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_alt_p384_montjadd add x0, tab+JACSIZE*13 add x1, tab+JACSIZE*6 - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, tab+JACSIZE*14 add x1, tab+JACSIZE*13 add x2, tab - bl local_p384_montjadd + bl p384_montjscalarmul_alt_p384_montjadd add x0, tab+JACSIZE*15 add x1, tab+JACSIZE*7 - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble // Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed // digits. 
The digits of the constant, in lowest-to-highest order, are as @@ -263,39 +297,23 @@ S2N_BN_SYMBOL(p384_montjscalarmul_alt): add x19, tab - .set i, 1 -.rep 16 - cmp bf, #i - ldp x20, x21, [x19] - csel x0, x20, x0, eq - csel x1, x21, x1, eq - ldp x20, x21, [x19, #16] - csel x2, x20, x2, eq - csel x3, x21, x3, eq - ldp x20, x21, [x19, #32] - csel x4, x20, x4, eq - csel x5, x21, x5, eq - ldp x20, x21, [x19, #48] - csel x6, x20, x6, eq - csel x7, x21, x7, eq - ldp x20, x21, [x19, #64] - csel x8, x20, x8, eq - csel x9, x21, x9, eq - ldp x20, x21, [x19, #80] - csel x10, x20, x10, eq - csel x11, x21, x11, eq - ldp x20, x21, [x19, #96] - csel x12, x20, x12, eq - csel x13, x21, x13, eq - ldp x20, x21, [x19, #112] - csel x14, x20, x14, eq - csel x15, x21, x15, eq - ldp x20, x21, [x19, #128] - csel x16, x20, x16, eq - csel x17, x21, x17, eq - add x19, x19, #JACSIZE - .set i, (i+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + stp x0, x1, [acc] stp x2, x3, [acc+16] stp x4, x5, [acc+32] @@ -312,28 +330,28 @@ S2N_BN_SYMBOL(p384_montjscalarmul_alt): // At each stage we shift the scalar left by 5 bits so we can simply pick // the top 5 bits as the bitfield, saving some fiddle over indexing. -loop: +p384_montjscalarmul_alt_mainloop: sub j, j, #5 add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble add x0, acc add x1, acc - bl local_p384_montjdouble + bl p384_montjscalarmul_alt_p384_montjdouble // Choose the bitfield and adjust it to sign and magnitude @@ -377,40 +395,23 @@ loop: mov x17, xzr add x19, tab - .set i, 1 -.rep 16 - cmp bf, #i - ldp x20, x21, [x19] - csel x0, x20, x0, eq - csel x1, x21, x1, eq - ldp x20, x21, [x19, #16] - csel x2, x20, x2, eq - csel x3, x21, x3, eq - ldp x20, x21, [x19, #32] - csel x4, x20, x4, eq - csel x5, x21, x5, eq - ldp x20, x21, [x19, #48] - csel x6, x20, x6, eq - csel x7, x21, x7, eq - ldp x20, x21, [x19, #64] - csel x8, x20, x8, eq - csel x9, x21, x9, eq - ldp x20, x21, [x19, #80] - csel x10, x20, x10, eq - csel x11, x21, x11, eq - ldp x20, x21, [x19, #96] - csel x12, x20, x12, eq - csel x13, x21, x13, eq - ldp x20, x21, [x19, #112] - csel x14, x20, x14, eq - csel x15, x21, x15, eq - ldp x20, x21, [x19, #128] - csel x16, x20, x16, eq - csel x17, x21, x17, eq - add x19, x19, #JACSIZE - .set i, (i+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) // Store it to "tabent" with the y coordinate optionally negated. // This is done carefully to give coordinates < p_384 even in @@ -459,9 +460,9 @@ loop: add x0, acc add x1, acc add x2, tabent - bl local_p384_montjadd + bl p384_montjscalarmul_alt_p384_montjadd - cbnz j, loop + cbnz j, p384_montjscalarmul_alt_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. 
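Functionally the loop that just ended is the usual signed-window double-and-add; a minimal sketch of its shape (point_double, point_add, point_neg and table are placeholders, and unlike this sketch the assembly always performs the table selection and merely negates the y coordinate conditionally, so the work per digit is uniform):

def scalar_mul(digits, table, acc, point_double, point_add, point_neg):
    # digits: signed 5-bit digits, most significant first; table[m] holds m*P.
    for d in digits:
        for _ in range(5):
            acc = point_double(acc)
        if d > 0:
            acc = point_add(acc, table[d])
        elif d < 0:
            acc = point_add(acc, point_neg(table[-d]))
    return acc
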
@@ -496,7 +497,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p384_montjadd: +p384_montjscalarmul_alt_p384_montjadd: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! @@ -4882,7 +4883,7 @@ local_p384_montjadd: ldp x19, x20, [sp], #16 ret -local_p384_montjdouble: +p384_montjscalarmul_alt_p384_montjdouble: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! diff --git a/x86_att/p256/bignum_montinv_p256.S b/x86_att/p256/bignum_montinv_p256.S index 1ae2eabe65a..36f5d376e0c 100644 --- a/x86_att/p256/bignum_montinv_p256.S +++ b/x86_att/p256/bignum_montinv_p256.S @@ -1116,9 +1116,9 @@ S2N_BN_SYMBOL(bignum_montinv_p256): movq $10, i movq $1, d - jmp midloop + jmp bignum_montinv_p256_midloop -loop: +bignum_montinv_p256_loop: // Separate out the matrix into sign-magnitude pairs @@ -1447,7 +1447,7 @@ loop: amontred(v) -midloop: +bignum_montinv_p256_midloop: divstep59(d,ff,gg) movq %rsi, d @@ -1455,7 +1455,7 @@ midloop: // Next iteration decq i - jnz loop + jnz bignum_montinv_p256_loop // The 10th and last iteration does not need anything except the // u value and the sign of f; the latter can be obtained from the diff --git a/x86_att/p256/p256_montjscalarmul.S b/x86_att/p256/p256_montjscalarmul.S index 1a36a4c784c..4569646cd31 100644 --- a/x86_att/p256/p256_montjscalarmul.S +++ b/x86_att/p256/p256_montjscalarmul.S @@ -57,6 +57,24 @@ #define NSPACE (32*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+96*(I-1)(%rsp), %rax ; \ + cmovzq TAB+96*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+96*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+96*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+96*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+96*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+96*(I-1)+48(%rsp), %r10 ; \ + cmovzq TAB+96*(I-1)+56(%rsp), %r11 ; \ + cmovzq TAB+96*(I-1)+64(%rsp), %r12 ; \ + cmovzq TAB+96*(I-1)+72(%rsp), %r13 ; \ + cmovzq TAB+96*(I-1)+80(%rsp), %r14 ; \ + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + S2N_BN_SYMBOL(p256_montjscalarmul): // The Windows version literally calls the standard ABI version. 
@@ -207,34 +225,34 @@ p256_montjscalarmul_standard: leaq TAB+96*1(%rsp), %rdi leaq TAB(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq TAB+96*2(%rsp), %rdi leaq TAB+96*1(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_p256_montjadd leaq TAB+96*3(%rsp), %rdi leaq TAB+96*1(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq TAB+96*4(%rsp), %rdi leaq TAB+96*3(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_p256_montjadd leaq TAB+96*5(%rsp), %rdi leaq TAB+96*2(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq TAB+96*6(%rsp), %rdi leaq TAB+96*5(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_p256_montjadd leaq TAB+96*7(%rsp), %rdi leaq TAB+96*3(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble // Set up accumulator as table entry for top 4 bits (constant-time indexing) @@ -254,24 +272,15 @@ p256_montjscalarmul_standard: xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 8 - cmpq $I, %rdi - - cmovzq TAB+96*(I-1)(%rsp), %rax - cmovzq TAB+96*(I-1)+8(%rsp), %rbx - cmovzq TAB+96*(I-1)+16(%rsp), %rcx - cmovzq TAB+96*(I-1)+24(%rsp), %rdx - cmovzq TAB+96*(I-1)+32(%rsp), %r8 - cmovzq TAB+96*(I-1)+40(%rsp), %r9 - cmovzq TAB+96*(I-1)+48(%rsp), %r10 - cmovzq TAB+96*(I-1)+56(%rsp), %r11 - cmovzq TAB+96*(I-1)+64(%rsp), %r12 - cmovzq TAB+96*(I-1)+72(%rsp), %r13 - cmovzq TAB+96*(I-1)+80(%rsp), %r14 - cmovzq TAB+96*(I-1)+88(%rsp), %r15 - .set I, (I+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + movq %rax, ACC(%rsp) movq %rbx, ACC+8(%rsp) movq %rcx, ACC+16(%rsp) @@ -289,24 +298,24 @@ p256_montjscalarmul_standard: movl $252, %ebp -loop: +p256_montjscalarmul_mainloop: subq $4, %rbp leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble movq %rbp, %rax shrq $6, %rax @@ -333,24 +342,14 @@ loop: xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 8 - cmpq $I, %rdi - - cmovzq TAB+96*(I-1)(%rsp), %rax - cmovzq TAB+96*(I-1)+8(%rsp), %rbx - cmovzq TAB+96*(I-1)+16(%rsp), %rcx - cmovzq TAB+96*(I-1)+24(%rsp), %rdx - cmovzq TAB+96*(I-1)+32(%rsp), %r8 - cmovzq TAB+96*(I-1)+40(%rsp), %r9 - cmovzq TAB+96*(I-1)+48(%rsp), %r10 - cmovzq TAB+96*(I-1)+56(%rsp), %r11 - cmovzq TAB+96*(I-1)+64(%rsp), %r12 - cmovzq TAB+96*(I-1)+72(%rsp), %r13 - cmovzq TAB+96*(I-1)+80(%rsp), %r14 - cmovzq TAB+96*(I-1)+88(%rsp), %r15 - .set I, (I+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) // Store it to "tabent" with the y coordinate optionally negated // Again, do it carefully to give coordinates < p_256 even in @@ -397,10 +396,10 @@ loop: leaq TABENT(%rsp), %rdx leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjadd + callq p256_montjscalarmul_p256_montjadd testq %rbp, %rbp - jne loop + jne p256_montjscalarmul_mainloop // That's the end of 
the main loop, and we just need to copy the // result in "acc" to the output. @@ -446,7 +445,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p256_montjadd: +p256_montjscalarmul_p256_montjadd: pushq %rbx pushq %rbp pushq %r12 @@ -2428,7 +2427,7 @@ local_p256_montjadd: popq %rbx ret -local_p256_montjdouble: +p256_montjscalarmul_p256_montjdouble: pushq %rbx pushq %rbp pushq %r12 diff --git a/x86_att/p256/p256_montjscalarmul_alt.S b/x86_att/p256/p256_montjscalarmul_alt.S index 51d55eee862..b68d857e76b 100644 --- a/x86_att/p256/p256_montjscalarmul_alt.S +++ b/x86_att/p256/p256_montjscalarmul_alt.S @@ -57,6 +57,24 @@ #define NSPACE (32*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+96*(I-1)(%rsp), %rax ; \ + cmovzq TAB+96*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+96*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+96*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+96*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+96*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+96*(I-1)+48(%rsp), %r10 ; \ + cmovzq TAB+96*(I-1)+56(%rsp), %r11 ; \ + cmovzq TAB+96*(I-1)+64(%rsp), %r12 ; \ + cmovzq TAB+96*(I-1)+72(%rsp), %r13 ; \ + cmovzq TAB+96*(I-1)+80(%rsp), %r14 ; \ + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + S2N_BN_SYMBOL(p256_montjscalarmul_alt): // The Windows version literally calls the standard ABI version. @@ -207,34 +225,34 @@ p256_montjscalarmul_alt_standard: leaq TAB+96*1(%rsp), %rdi leaq TAB(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq TAB+96*2(%rsp), %rdi leaq TAB+96*1(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_alt_p256_montjadd leaq TAB+96*3(%rsp), %rdi leaq TAB+96*1(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq TAB+96*4(%rsp), %rdi leaq TAB+96*3(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_alt_p256_montjadd leaq TAB+96*5(%rsp), %rdi leaq TAB+96*2(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq TAB+96*6(%rsp), %rdi leaq TAB+96*5(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_alt_p256_montjadd leaq TAB+96*7(%rsp), %rdi leaq TAB+96*3(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble // Set up accumulator as table entry for top 4 bits (constant-time indexing) @@ -254,24 +272,15 @@ p256_montjscalarmul_alt_standard: xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 8 - cmpq $I, %rdi + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) - cmovzq TAB+96*(I-1)(%rsp), %rax - cmovzq TAB+96*(I-1)+8(%rsp), %rbx - cmovzq TAB+96*(I-1)+16(%rsp), %rcx - cmovzq TAB+96*(I-1)+24(%rsp), %rdx - cmovzq TAB+96*(I-1)+32(%rsp), %r8 - cmovzq TAB+96*(I-1)+40(%rsp), %r9 - cmovzq TAB+96*(I-1)+48(%rsp), %r10 - cmovzq TAB+96*(I-1)+56(%rsp), %r11 - cmovzq TAB+96*(I-1)+64(%rsp), %r12 - cmovzq TAB+96*(I-1)+72(%rsp), %r13 - cmovzq TAB+96*(I-1)+80(%rsp), %r14 - cmovzq TAB+96*(I-1)+88(%rsp), %r15 - .set I, (I+1) -.endr movq %rax, ACC(%rsp) movq %rbx, ACC+8(%rsp) movq %rcx, ACC+16(%rsp) @@ -289,24 +298,24 @@ p256_montjscalarmul_alt_standard: movl $252, %ebp -loop: +p256_montjscalarmul_alt_mainloop: subq $4, %rbp leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq 
p256_montjscalarmul_alt_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble movq %rbp, %rax shrq $6, %rax @@ -333,24 +342,14 @@ loop: xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 8 - cmpq $I, %rdi - - cmovzq TAB+96*(I-1)(%rsp), %rax - cmovzq TAB+96*(I-1)+8(%rsp), %rbx - cmovzq TAB+96*(I-1)+16(%rsp), %rcx - cmovzq TAB+96*(I-1)+24(%rsp), %rdx - cmovzq TAB+96*(I-1)+32(%rsp), %r8 - cmovzq TAB+96*(I-1)+40(%rsp), %r9 - cmovzq TAB+96*(I-1)+48(%rsp), %r10 - cmovzq TAB+96*(I-1)+56(%rsp), %r11 - cmovzq TAB+96*(I-1)+64(%rsp), %r12 - cmovzq TAB+96*(I-1)+72(%rsp), %r13 - cmovzq TAB+96*(I-1)+80(%rsp), %r14 - cmovzq TAB+96*(I-1)+88(%rsp), %r15 - .set I, (I+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) // Store it to "tabent" with the y coordinate optionally negated // Again, do it carefully to give coordinates < p_256 even in @@ -397,10 +396,10 @@ loop: leaq TABENT(%rsp), %rdx leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjadd + callq p256_montjscalarmul_alt_p256_montjadd testq %rbp, %rbp - jne loop + jne p256_montjscalarmul_alt_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. @@ -446,7 +445,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p256_montjadd: +p256_montjscalarmul_alt_p256_montjadd: pushq %rbx pushq %rbp pushq %r12 @@ -3218,7 +3217,7 @@ local_p256_montjadd: popq %rbx ret -local_p256_montjdouble: +p256_montjscalarmul_alt_p256_montjdouble: pushq %rbx pushq %r12 pushq %r13 diff --git a/x86_att/p384/bignum_inv_p384.S b/x86_att/p384/bignum_inv_p384.S new file mode 100644 index 00000000000..e1dfecfa2ea --- /dev/null +++ b/x86_att/p384/bignum_inv_p384.S @@ -0,0 +1,1810 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +// +// extern void bignum_inv_p384(uint64_t z[static 6],uint64_t x[static 6]); +// +// If the 6-digit input x is coprime to p_384, i.e. is not divisible +// by it, returns z < p_384 such that x * z == 1 (mod p_384). Note that +// x does not need to be reduced modulo p_384, but the output always is. +// If the input is divisible (i.e. is 0 or p_384), then there can be no +// modular inverse and z = 0 is returned. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p384) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack +// The u and v variables are 6 words each as expected, but the f and g +// variables are 8 words each -- they need to have at least one extra +// word for a sign word, and to preserve alignment we "round up" to 8. 
+// In fact, we currently keep an extra word in u and v as well. + +#define f 0(%rsp) +#define g (8*N)(%rsp) +#define u (16*N)(%rsp) +#define v (24*N)(%rsp) +#define tmp (32*N)(%rsp) +#define tmp2 (33*N)(%rsp) +#define i (34*N)(%rsp) +#define d (35*N)(%rsp) + +#define mat (36*N)(%rsp) + +// Backup for the input pointer + +#define res (40*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (42*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (8*N) +#define U (16*N) +#define V (24*N) +#define MAT (36*N) + +#define ff (%rsp) +#define gg (8*N)(%rsp) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro from P[6..0] to P[5..0]. +// --------------------------------------------------------------------------- + +#define amontred(P) \ +/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_384. */ \ + movq $0xe000000000000000, %r8 ; \ + xorl %eax, %eax ; \ + addq P, %r8 ; \ + movq $0x000000001fffffff, %r9 ; \ + leaq -1(%rax), %rax ; \ + adcq N+P, %r9 ; \ + movq $0xdfffffffe0000000, %r10 ; \ + adcq 2*N+P, %r10 ; \ + movq 3*N+P, %r11 ; \ + adcq %rax, %r11 ; \ + movq 4*N+P, %r12 ; \ + adcq %rax, %r12 ; \ + movq 5*N+P, %r13 ; \ + adcq %rax, %r13 ; \ + movq $0x1fffffffffffffff, %r14 ; \ + adcq 6*N+P, %r14 ; \ +/* Correction multiplier is %rbx = w = [d0 + (d0<<32)] mod 2^64 */ \ + movq %r8, %rbx ; \ + shlq $32, %rbx ; \ + addq %r8, %rbx ; \ +/* Construct [%rbp;%rdx;%rax;-] = (2^384 - p_384) * w */ \ +/* We know lowest word will cancel so can re-use %r8 as a temp */ \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0x00000000ffffffff, %rax ; \ + mulq %rbx; \ + addq %r8, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ +/* Now subtract that and add 2^384 * w, catching carry in %rax */ \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0, %r12 ; \ + sbbq $0, %r13 ; \ + sbbq $0, %r14 ; \ + sbbq %rax, %rax ; \ + addq %rbx, %r14 ; \ + adcq $0, %rax ; \ +/* Now if top is nonzero we subtract p_384 (almost-Montgomery) */ \ + negq %rax; \ + movq $0x00000000ffffffff, %rbx ; \ + andq %rax, %rbx ; \ + movq $0xffffffff00000000, %rcx ; \ + andq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rdx ; \ + andq %rax, %rdx ; \ + subq %rbx, %r9 ; \ + movq %r9, P ; \ + sbbq %rcx, %r10 ; \ + movq %r10, N+P ; \ + sbbq %rdx, %r11 ; \ + movq %r11, 2*N+P ; \ + sbbq %rax, %r12 ; \ + movq %r12, 3*N+P ; \ + sbbq %rax, %r13 ; \ + movq %r13, 4*N+P ; \ + sbbq %rax, %r14 ; \ + movq %r14, 5*N+P + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; 
\ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq 
$0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, 
%r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq 
MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + 
subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_inv_p384): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Copy the constant p_384 into f including the 7th zero digit + + movl $0xffffffff, %eax + 
movq %rax, F(%rsp) + movq %rax, %rbx + notq %rbx + movq %rbx, F+N(%rsp) + xorl %ebp, %ebp + leaq -2(%rbp), %rcx + movq %rcx, F+2*N(%rsp) + leaq -1(%rbp), %rdx + movq %rdx, F+3*N(%rsp) + movq %rdx, F+4*N(%rsp) + movq %rdx, F+5*N(%rsp) + movq %rbp, F+6*N(%rsp) + +// Copy input but to g, reduced mod p_384 so that g <= f as assumed +// in the divstep bound proof. + + movq (%rsi), %r8 + subq %rax, %r8 + movq N(%rsi), %r9 + sbbq %rbx, %r9 + movq 2*N(%rsi), %r10 + sbbq %rcx, %r10 + movq 3*N(%rsi), %r11 + sbbq %rdx, %r11 + movq 4*N(%rsi), %r12 + sbbq %rdx, %r12 + movq 5*N(%rsi), %r13 + sbbq %rdx, %r13 + + cmovcq (%rsi), %r8 + cmovcq N(%rsi), %r9 + cmovcq 2*N(%rsi), %r10 + cmovcq 3*N(%rsi), %r11 + cmovcq 4*N(%rsi), %r12 + cmovcq 5*N(%rsi), %r13 + + movq %r8, G(%rsp) + movq %r9, G+N(%rsp) + movq %r10, G+2*N(%rsp) + movq %r11, G+3*N(%rsp) + movq %r12, G+4*N(%rsp) + movq %r13, G+5*N(%rsp) + movq %rbp, G+6*N(%rsp) + +// Also maintain reduced < 2^384 vector [u,v] such that +// [f,g] == x * 2^{5*i-75} * [u,v] (mod p_384) +// starting with [p_384,x] == x * 2^{5*0-75} * [0,2^75] (mod p_384) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+N(%rsp) + movq %rax, U+2*N(%rsp) + movq %rax, U+3*N(%rsp) + movq %rax, U+4*N(%rsp) + movq %rax, U+5*N(%rsp) + + movl $2048, %ecx + movq %rax, V(%rsp) + movq %rcx, V+N(%rsp) + movq %rax, V+2*N(%rsp) + movq %rax, V+3*N(%rsp) + movq %rax, V+4*N(%rsp) + movq %rax, V+5*N(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special fifteenth iteration after a uniform +// first 14. + + movq $15, i + movq $1, d + jmp midloop + +loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
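+//
+// In rough terms, writing the matrix returned by divstep59 informally as
+// [m00 m01; m10 m11] (sign conventions aside), with magnitudes now in
+// %r8,%r10,%r12,%r14 and sign masks in %r9,%r11,%r13,%r15, each digit
+// stage below accumulates m00*f + m01*g for the new f and m10*f + m11*g
+// for the new g. A negative entry is applied by XORing the source digit
+// with its sign mask, i.e. using the one's complement, and the additive
+// correction computed just above (and saved in tmp/tmp2 for the [u,v]
+// pass) supplies the missing +|m| for each negative entry, since over
+// the whole multi-word value (~x)*|m| + |m| == (-x)*|m|; it only needs
+// adding once at the bottom digit because the carries propagate it up.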
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digit 3 of [f,g] + + xorl %ebp, %ebp + movq F+3*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + + xorl %ecx, %ecx + movq F+3*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+3*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + +// Digit 4 of [f,g] + + xorl %edi, %edi + movq F+4*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+4*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, F+3*N(%rsp) + + xorl %esi, %esi + movq F+4*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+4*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, G+3*N(%rsp) + +// Digits 5 and 6 of [f,g] + + movq F+5*N(%rsp), %rax + xorq %r9, %rax + movq F+6*N(%rsp), %rbx + xorq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+5*N(%rsp), %rax + xorq %r11, %rax + movq G+6*N(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, F+4*N(%rsp) + shrdq $59, %rbx, %rdi + sarq $59, %rbx + + movq F+5*N(%rsp), %rax + movq %rdi, F+5*N(%rsp) + + movq F+6*N(%rsp), %rdi + movq %rbx, F+6*N(%rsp) + + xorq %r13, %rax + xorq %r13, %rdi + andq %r12, %rdi + negq %rdi + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rdi + movq G+5*N(%rsp), %rax + xorq %r15, %rax + movq G+6*N(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rdi + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rdi + shrdq $59, %rsi, %rcx + movq %rcx, G+4*N(%rsp) + shrdq $59, %rdi, %rsi + movq %rsi, G+5*N(%rsp) + sarq $59, %rdi + movq %rdi, G+6*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq 
tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digit 3 of [u,v] + + xorl %ebx, %ebx + movq U+3*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+3*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+3*N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+3*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+3*N(%rsp) + +// Digit 4 of [u,v] + + xorl %ecx, %ecx + movq U+4*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+4*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+4*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+4*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+4*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+4*N(%rsp) + +// Digits 5 and 6 of u (top is unsigned) + + movq U+5*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+5*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Preload for last use of old u digit 3 + + movq U+5*N(%rsp), %rax + movq %rcx, U+5*N(%rsp) + movq %rdx, U+6*N(%rsp) + +// Digits 5 and 6 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+5*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, V+5*N(%rsp) + movq %rdx, V+6*N(%rsp) + +// Montgomery reduction of u + + amontred(u) + +// Montgomery reduction of v + + amontred(v) + +midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz loop + +// The 15th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. 
So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_384) +// we want to flip the sign of u according to that of f. + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + movq %r12, U(%rsp) + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + movq %r13, U+N(%rsp) + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + movq %r14, U+2*N(%rsp) + adcq %rdx, %r15 + +// Digit 3 of [u] + + xorl %r14d, %r14d + movq U+3*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r14 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r15 + movq %r15, U+3*N(%rsp) + adcq %rdx, %r14 + +// Digit 4 of [u] + + xorl %r15d, %r15d + movq U+4*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+4*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + movq %r14, U+4*N(%rsp) + adcq %rdx, %r15 + +// Digits 5 and 6 of u (top is unsigned) + + movq U+5*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+5*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + movq %r15, U+5*N(%rsp) + adcq %rdx, %r9 + movq %r9, U+6*N(%rsp) + +// Montgomery reduce u + + amontred(u) + +// Perform final strict reduction mod p_384 and copy to output + + movl $0xffffffff, %eax + movq %rax, %rbx + notq %rbx + xorl %ebp, %ebp + leaq -2(%rbp), %rcx + leaq -1(%rbp), %rdx + + movq U(%rsp), %r8 + subq %rax, %r8 + movq U+N(%rsp), %r9 + sbbq %rbx, %r9 + movq U+2*N(%rsp), %r10 + sbbq %rcx, %r10 + movq U+3*N(%rsp), %r11 + sbbq %rdx, %r11 + movq U+4*N(%rsp), %r12 + sbbq %rdx, %r12 + movq U+5*N(%rsp), %r13 + sbbq %rdx, %r13 + + cmovcq U(%rsp), %r8 + cmovcq U+N(%rsp), %r9 + cmovcq U+2*N(%rsp), %r10 + cmovcq U+3*N(%rsp), %r11 + cmovcq U+4*N(%rsp), %r12 + cmovcq U+5*N(%rsp), %r13 + + movq res, %rdi + movq %r8, (%rdi) + movq %r9, 
N(%rdi) + movq %r10, 2*N(%rdi) + movq %r11, 3*N(%rdi) + movq %r12, 4*N(%rdi) + movq %r13, 5*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p384/bignum_montinv_p384.S b/x86_att/p384/bignum_montinv_p384.S new file mode 100644 index 00000000000..81928ed59dc --- /dev/null +++ b/x86_att/p384/bignum_montinv_p384.S @@ -0,0 +1,1827 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +// +// extern void bignum_montinv_p384(uint64_t z[static 6],uint64_t x[static 6]); +// +// If the 6-digit input x is coprime to p_384, i.e. is not divisible +// by it, returns z < p_384 such that x * z == 2^768 (mod p_384). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^384 * X and z == 2^384 * Z +// (both mod p_384) then X * Z == 1 (mod p_384). That is, this function +// gives the analog of the modular inverse bignum_inv_p384 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_384, but the output always is. If the input +// is divisible (i.e. is 0 or p_384), then there can be no solution to +// the congruence x * z == 2^768 (mod p_384), and z = 0 is returned. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p384) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack +// The u and v variables are 6 words each as expected, but the f and g +// variables are 8 words each -- they need to have at least one extra +// word for a sign word, and to preserve alignment we "round up" to 8. +// In fact, we currently keep an extra word in u and v as well. + +#define f 0(%rsp) +#define g (8*N)(%rsp) +#define u (16*N)(%rsp) +#define v (24*N)(%rsp) +#define tmp (32*N)(%rsp) +#define tmp2 (33*N)(%rsp) +#define i (34*N)(%rsp) +#define d (35*N)(%rsp) + +#define mat (36*N)(%rsp) + +// Backup for the input pointer + +#define res (40*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (42*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (8*N) +#define U (16*N) +#define V (24*N) +#define MAT (36*N) + +#define ff (%rsp) +#define gg (8*N)(%rsp) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro from P[6..0] to P[5..0]. +// --------------------------------------------------------------------------- + +#define amontred(P) \ +/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_384. 
*/ \ + movq $0xe000000000000000, %r8 ; \ + xorl %eax, %eax ; \ + addq P, %r8 ; \ + movq $0x000000001fffffff, %r9 ; \ + leaq -1(%rax), %rax ; \ + adcq N+P, %r9 ; \ + movq $0xdfffffffe0000000, %r10 ; \ + adcq 2*N+P, %r10 ; \ + movq 3*N+P, %r11 ; \ + adcq %rax, %r11 ; \ + movq 4*N+P, %r12 ; \ + adcq %rax, %r12 ; \ + movq 5*N+P, %r13 ; \ + adcq %rax, %r13 ; \ + movq $0x1fffffffffffffff, %r14 ; \ + adcq 6*N+P, %r14 ; \ +/* Correction multiplier is %rbx = w = [d0 + (d0<<32)] mod 2^64 */ \ + movq %r8, %rbx ; \ + shlq $32, %rbx ; \ + addq %r8, %rbx ; \ +/* Construct [%rbp;%rdx;%rax;-] = (2^384 - p_384) * w */ \ +/* We know lowest word will cancel so can re-use %r8 as a temp */ \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0x00000000ffffffff, %rax ; \ + mulq %rbx; \ + addq %r8, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ +/* Now subtract that and add 2^384 * w, catching carry in %rax */ \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0, %r12 ; \ + sbbq $0, %r13 ; \ + sbbq $0, %r14 ; \ + sbbq %rax, %rax ; \ + addq %rbx, %r14 ; \ + adcq $0, %rax ; \ +/* Now if top is nonzero we subtract p_384 (almost-Montgomery) */ \ + negq %rax; \ + movq $0x00000000ffffffff, %rbx ; \ + andq %rax, %rbx ; \ + movq $0xffffffff00000000, %rcx ; \ + andq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rdx ; \ + andq %rax, %rdx ; \ + subq %rbx, %r9 ; \ + movq %r9, P ; \ + sbbq %rcx, %r10 ; \ + movq %r10, N+P ; \ + sbbq %rdx, %r11 ; \ + movq %r11, 2*N+P ; \ + sbbq %rax, %r12 ; \ + movq %r12, 3*N+P ; \ + sbbq %rax, %r13 ; \ + movq %r13, 4*N+P ; \ + sbbq %rax, %r14 ; \ + movq %r14, 5*N+P + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, 
%rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + 
cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + 
movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx 
; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + 
leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs 
%rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_montinv_p384): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Copy the constant p_384 into f including the 7th zero digit + + movl $0xffffffff, %eax + movq %rax, F(%rsp) + movq %rax, %rbx + notq %rbx + movq %rbx, F+N(%rsp) + xorl %ebp, %ebp + leaq -2(%rbp), %rcx + movq %rcx, F+2*N(%rsp) + leaq -1(%rbp), %rdx + movq %rdx, F+3*N(%rsp) + movq %rdx, F+4*N(%rsp) + movq %rdx, F+5*N(%rsp) + movq %rbp, F+6*N(%rsp) + +// Copy input but to g, reduced mod p_384 so that g <= f as assumed +// in the divstep bound proof. + + movq (%rsi), %r8 + subq %rax, %r8 + movq N(%rsi), %r9 + sbbq %rbx, %r9 + movq 2*N(%rsi), %r10 + sbbq %rcx, %r10 + movq 3*N(%rsi), %r11 + sbbq %rdx, %r11 + movq 4*N(%rsi), %r12 + sbbq %rdx, %r12 + movq 5*N(%rsi), %r13 + sbbq %rdx, %r13 + + cmovcq (%rsi), %r8 + cmovcq N(%rsi), %r9 + cmovcq 2*N(%rsi), %r10 + cmovcq 3*N(%rsi), %r11 + cmovcq 4*N(%rsi), %r12 + cmovcq 5*N(%rsi), %r13 + + movq %r8, G(%rsp) + movq %r9, G+N(%rsp) + movq %r10, G+2*N(%rsp) + movq %r11, G+3*N(%rsp) + movq %r12, G+4*N(%rsp) + movq %r13, G+5*N(%rsp) + movq %rbp, G+6*N(%rsp) + +// Also maintain reduced < 2^384 vector [u,v] such that +// [f,g] == x * 2^{5*i-843} * [u,v] (mod p_384) +// starting with [p_384,x] == x * 2^{5*0-843} * [0,2^843] (mod p_384) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 15th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{75-843} * u == 1, i.e. +// x * u == 2^768 as required. 
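+//
+// Concretely 843 = 768 + 75, i.e. the 2^75 starting constant used by
+// the plain bignum_inv_p384 scaled by the extra 2^768 we want to keep
+// in the Montgomery-domain result. As a quick offline sanity check of
+// the starting value 2^843 mod p_384 loaded into v below (a Python
+// sketch, not part of the build):
+//
+//   p = 2**384 - 2**128 - 2**96 + 2**32 - 1
+//   words = [0, 0xfffff00000000800, 0x00001000000007ff,
+//            0xfffff00000000000, 0x00001000000007ff, 0x0000000000000800]
+//   assert pow(2, 843, p) == sum(w << (64*k) for k, w in enumerate(words))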
+ + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+N(%rsp) + movq %rax, U+2*N(%rsp) + movq %rax, U+3*N(%rsp) + movq %rax, U+4*N(%rsp) + movq %rax, U+5*N(%rsp) + +// The starting constant 2^843 mod p_384 is +// 0x0000000000000800:00001000000007ff:fffff00000000000 +// :00001000000007ff:fffff00000000800:0000000000000000 +// where colons separate 64-bit subwords, least significant at the right. +// These are constructed dynamically to reduce large constant loads. + + movq %rax, V(%rsp) + movq $0xfffff00000000800, %rcx + movq %rcx, V+N(%rsp) + movq $0x00001000000007ff, %rdx + movq %rdx, V+2*N(%rsp) + btr $11, %rcx + movq %rcx, V+3*N(%rsp) + movq %rdx, V+4*N(%rsp) + bts $11, %rax + movq %rax, V+5*N(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special fifteenth iteration after a uniform +// first 14. + + movq $15, i + movq $1, d + jmp bignum_montinv_p384_midloop + +bignum_montinv_p384_loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
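+//
+// In outline, each "Digit k" stage below multiplies the k-th digits of
+// the old f and g (each XORed with the corresponding sign mask) by the
+// matrix magnitudes, adds the four products into two 2-word windows
+// (one for the new f, one for the new g), and then a shrdq $59 peels
+// off the finished digit k-1 of each, so the shift right by 59 comes
+// essentially for free and the new digit k-1 is only stored after the
+// old digit k has been read.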
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digit 3 of [f,g] + + xorl %ebp, %ebp + movq F+3*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + + xorl %ecx, %ecx + movq F+3*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+3*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + +// Digit 4 of [f,g] + + xorl %edi, %edi + movq F+4*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+4*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, F+3*N(%rsp) + + xorl %esi, %esi + movq F+4*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+4*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, G+3*N(%rsp) + +// Digits 5 and 6 of [f,g] + + movq F+5*N(%rsp), %rax + xorq %r9, %rax + movq F+6*N(%rsp), %rbx + xorq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+5*N(%rsp), %rax + xorq %r11, %rax + movq G+6*N(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, F+4*N(%rsp) + shrdq $59, %rbx, %rdi + sarq $59, %rbx + + movq F+5*N(%rsp), %rax + movq %rdi, F+5*N(%rsp) + + movq F+6*N(%rsp), %rdi + movq %rbx, F+6*N(%rsp) + + xorq %r13, %rax + xorq %r13, %rdi + andq %r12, %rdi + negq %rdi + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rdi + movq G+5*N(%rsp), %rax + xorq %r15, %rax + movq G+6*N(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rdi + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rdi + shrdq $59, %rsi, %rcx + movq %rcx, G+4*N(%rsp) + shrdq $59, %rdi, %rsi + movq %rsi, G+5*N(%rsp) + sarq $59, %rdi + movq %rdi, G+6*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq 
tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digit 3 of [u,v] + + xorl %ebx, %ebx + movq U+3*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+3*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+3*N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+3*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+3*N(%rsp) + +// Digit 4 of [u,v] + + xorl %ecx, %ecx + movq U+4*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+4*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+4*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+4*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+4*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+4*N(%rsp) + +// Digits 5 and 6 of u (top is unsigned) + + movq U+5*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+5*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Preload for last use of old u digit 3 + + movq U+5*N(%rsp), %rax + movq %rcx, U+5*N(%rsp) + movq %rdx, U+6*N(%rsp) + +// Digits 5 and 6 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+5*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, V+5*N(%rsp) + movq %rdx, V+6*N(%rsp) + +// Montgomery reduction of u + + amontred(u) + +// Montgomery reduction of v + + amontred(v) + +bignum_montinv_p384_midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz bignum_montinv_p384_loop + +// The 15th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. 
So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-768} [u,v] (mod p_384) +// we want to flip the sign of u according to that of f. + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + movq %r12, U(%rsp) + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + movq %r13, U+N(%rsp) + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + movq %r14, U+2*N(%rsp) + adcq %rdx, %r15 + +// Digit 3 of [u] + + xorl %r14d, %r14d + movq U+3*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r14 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r15 + movq %r15, U+3*N(%rsp) + adcq %rdx, %r14 + +// Digit 4 of [u] + + xorl %r15d, %r15d + movq U+4*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+4*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + movq %r14, U+4*N(%rsp) + adcq %rdx, %r15 + +// Digits 5 and 6 of u (top is unsigned) + + movq U+5*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+5*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + movq %r15, U+5*N(%rsp) + adcq %rdx, %r9 + movq %r9, U+6*N(%rsp) + +// Montgomery reduce u + + amontred(u) + +// Perform final strict reduction mod p_384 and copy to output + + movl $0xffffffff, %eax + movq %rax, %rbx + notq %rbx + xorl %ebp, %ebp + leaq -2(%rbp), %rcx + leaq -1(%rbp), %rdx + + movq U(%rsp), %r8 + subq %rax, %r8 + movq U+N(%rsp), %r9 + sbbq %rbx, %r9 + movq U+2*N(%rsp), %r10 + sbbq %rcx, %r10 + movq U+3*N(%rsp), %r11 + sbbq %rdx, %r11 + movq U+4*N(%rsp), %r12 + sbbq %rdx, %r12 + movq U+5*N(%rsp), %r13 + sbbq %rdx, %r13 + + cmovcq U(%rsp), %r8 + cmovcq U+N(%rsp), %r9 + cmovcq U+2*N(%rsp), %r10 + cmovcq U+3*N(%rsp), %r11 + cmovcq U+4*N(%rsp), %r12 + cmovcq U+5*N(%rsp), %r13 + + movq res, %rdi + movq %r8, (%rdi) + movq 
%r9, N(%rdi) + movq %r10, 2*N(%rdi) + movq %r11, 3*N(%rdi) + movq %r12, 4*N(%rdi) + movq %r13, 5*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/x86_att/p384/p384_montjscalarmul.S b/x86_att/p384/p384_montjscalarmul.S index 46fd5ebf4e3..38bea41d878 100644 --- a/x86_att/p384/p384_montjscalarmul.S +++ b/x86_att/p384/p384_montjscalarmul.S @@ -59,6 +59,33 @@ #define NSPACE (56*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock_xz(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+JACSIZE*(I-1)(%rsp), %rax ; \ + cmovzq TAB+JACSIZE*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+JACSIZE*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+JACSIZE*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+JACSIZE*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+JACSIZE*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+JACSIZE*(I-1)+96(%rsp), %r10 ; \ + cmovzq TAB+JACSIZE*(I-1)+104(%rsp), %r11 ; \ + cmovzq TAB+JACSIZE*(I-1)+112(%rsp), %r12 ; \ + cmovzq TAB+JACSIZE*(I-1)+120(%rsp), %r13 ; \ + cmovzq TAB+JACSIZE*(I-1)+128(%rsp), %r14 ; \ + cmovzq TAB+JACSIZE*(I-1)+136(%rsp), %r15 + +#define selectblock_y(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+JACSIZE*(I-1)+48(%rsp), %rax ; \ + cmovzq TAB+JACSIZE*(I-1)+56(%rsp), %rbx ; \ + cmovzq TAB+JACSIZE*(I-1)+64(%rsp), %rcx ; \ + cmovzq TAB+JACSIZE*(I-1)+72(%rsp), %rdx ; \ + cmovzq TAB+JACSIZE*(I-1)+80(%rsp), %r8 ; \ + cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 + S2N_BN_SYMBOL(p384_montjscalarmul): // The Windows version literally calls the standard ABI version. @@ -93,7 +120,7 @@ p384_montjscalarmul_standard: movq %rdi, res -// Reduce the input scalar mod n_256, i.e. conditionally subtract n_256. +// Reduce the input scalar mod n_384, i.e. conditionally subtract n_384. // Store it to "scalarb". 
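
The scalar reduction mod n_384 just described, like the strict reduction of u mod p_384 earlier in this excerpt, is the usual conditional-subtraction idiom: subtract the modulus with borrow propagation, then use the final borrow to decide whether to keep the original value. A Python model of that logic, illustrative only; it is shown with p_384, whose limbs appear explicitly above, and the scalar path does the same thing with n_384:

# Conditional subtraction: reduce x into [0, m) given x < 2*m.
# Mirrors the subq/sbbq ... cmovcq pattern: compute x - m, and if the
# subtraction borrows (x < m) keep x, otherwise keep the difference.

P384 = 2**384 - 2**128 - 2**96 + 2**32 - 1

def cond_subtract(x, m):
    t = x - m
    return x if t < 0 else t      # cmovcq keeps x when the subtraction borrows

assert cond_subtract(P384 + 123, P384) == 123
assert cond_subtract(123, P384) == 123
assert cond_subtract(2 * P384 - 1, P384) == P384 - 1
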
movq (%rsi), %r8 @@ -172,70 +199,70 @@ p384_montjscalarmul_standard: leaq TAB+JACSIZE*1(%rsp), %rdi leaq TAB(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq TAB+JACSIZE*2(%rsp), %rdi leaq TAB+JACSIZE*1(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_p384_montjadd leaq TAB+JACSIZE*3(%rsp), %rdi leaq TAB+JACSIZE*1(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq TAB+JACSIZE*4(%rsp), %rdi leaq TAB+JACSIZE*3(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_p384_montjadd leaq TAB+JACSIZE*5(%rsp), %rdi leaq TAB+JACSIZE*2(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq TAB+JACSIZE*6(%rsp), %rdi leaq TAB+JACSIZE*5(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_p384_montjadd leaq TAB+JACSIZE*7(%rsp), %rdi leaq TAB+JACSIZE*3(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq TAB+JACSIZE*8(%rsp), %rdi leaq TAB+JACSIZE*7(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_p384_montjadd leaq TAB+JACSIZE*9(%rsp), %rdi leaq TAB+JACSIZE*4(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq TAB+JACSIZE*10(%rsp), %rdi leaq TAB+JACSIZE*9(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_p384_montjadd leaq TAB+JACSIZE*11(%rsp), %rdi leaq TAB+JACSIZE*5(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq TAB+JACSIZE*12(%rsp), %rdi leaq TAB+JACSIZE*11(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_p384_montjadd leaq TAB+JACSIZE*13(%rsp), %rdi leaq TAB+JACSIZE*6(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq TAB+JACSIZE*14(%rsp), %rdi leaq TAB+JACSIZE*13(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_p384_montjadd leaq TAB+JACSIZE*15(%rsp), %rdi leaq TAB+JACSIZE*7(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble // Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed // digits. 
The digits of the constant, in lowest-to-highest order, are as @@ -306,23 +333,24 @@ p384_montjscalarmul_standard: xorl %r13d, %r13d xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 16 - cmpq $I, %rdi - cmovzq TAB+JACSIZE*(I-1)(%rsp), %rax - cmovzq TAB+JACSIZE*(I-1)+8(%rsp), %rbx - cmovzq TAB+JACSIZE*(I-1)+16(%rsp), %rcx - cmovzq TAB+JACSIZE*(I-1)+24(%rsp), %rdx - cmovzq TAB+JACSIZE*(I-1)+32(%rsp), %r8 - cmovzq TAB+JACSIZE*(I-1)+40(%rsp), %r9 - cmovzq TAB+JACSIZE*(I-1)+96(%rsp), %r10 - cmovzq TAB+JACSIZE*(I-1)+104(%rsp), %r11 - cmovzq TAB+JACSIZE*(I-1)+112(%rsp), %r12 - cmovzq TAB+JACSIZE*(I-1)+120(%rsp), %r13 - cmovzq TAB+JACSIZE*(I-1)+128(%rsp), %r14 - cmovzq TAB+JACSIZE*(I-1)+136(%rsp), %r15 - .set I, (I+1) -.endr + + selectblock_xz(1) + selectblock_xz(2) + selectblock_xz(3) + selectblock_xz(4) + selectblock_xz(5) + selectblock_xz(6) + selectblock_xz(7) + selectblock_xz(8) + selectblock_xz(9) + selectblock_xz(10) + selectblock_xz(11) + selectblock_xz(12) + selectblock_xz(13) + selectblock_xz(14) + selectblock_xz(15) + selectblock_xz(16) + movq %rax, ACC(%rsp) movq %rbx, ACC+8(%rsp) movq %rcx, ACC+16(%rsp) @@ -342,17 +370,24 @@ p384_montjscalarmul_standard: xorl %edx, %edx xorl %r8d, %r8d xorl %r9d, %r9d - .set I, 1 -.rep 16 - cmpq $I, %rdi - cmovzq TAB+JACSIZE*(I-1)+48(%rsp), %rax - cmovzq TAB+JACSIZE*(I-1)+56(%rsp), %rbx - cmovzq TAB+JACSIZE*(I-1)+64(%rsp), %rcx - cmovzq TAB+JACSIZE*(I-1)+72(%rsp), %rdx - cmovzq TAB+JACSIZE*(I-1)+80(%rsp), %r8 - cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 - .set I, (I+1) -.endr + + selectblock_y(1) + selectblock_y(2) + selectblock_y(3) + selectblock_y(4) + selectblock_y(5) + selectblock_y(6) + selectblock_y(7) + selectblock_y(8) + selectblock_y(9) + selectblock_y(10) + selectblock_y(11) + selectblock_y(12) + selectblock_y(13) + selectblock_y(14) + selectblock_y(15) + selectblock_y(16) + movq %rax, ACC+48(%rsp) movq %rbx, ACC+56(%rsp) movq %rcx, ACC+64(%rsp) @@ -366,28 +401,28 @@ p384_montjscalarmul_standard: movl $380, %ebp -loop: +p384_montjscalarmul_mainloop: subq $5, %rbp leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_p384_montjdouble // Choose the bitfield and adjust it to sign and magnitude @@ -434,23 +469,24 @@ loop: xorl %r13d, %r13d xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 16 - cmpq $I, %rdi - cmovzq TAB+JACSIZE*(I-1)(%rsp), %rax - cmovzq TAB+JACSIZE*(I-1)+8(%rsp), %rbx - cmovzq TAB+JACSIZE*(I-1)+16(%rsp), %rcx - cmovzq TAB+JACSIZE*(I-1)+24(%rsp), %rdx - cmovzq TAB+JACSIZE*(I-1)+32(%rsp), %r8 - cmovzq TAB+JACSIZE*(I-1)+40(%rsp), %r9 - cmovzq TAB+JACSIZE*(I-1)+96(%rsp), %r10 - cmovzq TAB+JACSIZE*(I-1)+104(%rsp), %r11 - cmovzq TAB+JACSIZE*(I-1)+112(%rsp), %r12 - cmovzq TAB+JACSIZE*(I-1)+120(%rsp), %r13 - cmovzq TAB+JACSIZE*(I-1)+128(%rsp), %r14 - cmovzq TAB+JACSIZE*(I-1)+136(%rsp), %r15 - .set I, (I+1) -.endr + + selectblock_xz(1) + selectblock_xz(2) + selectblock_xz(3) + selectblock_xz(4) + selectblock_xz(5) + selectblock_xz(6) + selectblock_xz(7) + selectblock_xz(8) + selectblock_xz(9) + selectblock_xz(10) + 
selectblock_xz(11) + selectblock_xz(12) + selectblock_xz(13) + selectblock_xz(14) + selectblock_xz(15) + selectblock_xz(16) + movq %rax, TABENT(%rsp) movq %rbx, TABENT+8(%rsp) movq %rcx, TABENT+16(%rsp) @@ -470,17 +506,23 @@ loop: xorl %edx, %edx xorl %r8d, %r8d xorl %r9d, %r9d - .set I, 1 -.rep 16 - cmpq $I, %rdi - cmovzq TAB+JACSIZE*(I-1)+48(%rsp), %rax - cmovzq TAB+JACSIZE*(I-1)+56(%rsp), %rbx - cmovzq TAB+JACSIZE*(I-1)+64(%rsp), %rcx - cmovzq TAB+JACSIZE*(I-1)+72(%rsp), %rdx - cmovzq TAB+JACSIZE*(I-1)+80(%rsp), %r8 - cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 - .set I, (I+1) -.endr + + selectblock_y(1) + selectblock_y(2) + selectblock_y(3) + selectblock_y(4) + selectblock_y(5) + selectblock_y(6) + selectblock_y(7) + selectblock_y(8) + selectblock_y(9) + selectblock_y(10) + selectblock_y(11) + selectblock_y(12) + selectblock_y(13) + selectblock_y(14) + selectblock_y(15) + selectblock_y(16) // Store it to "tabent" with the y coordinate optionally negated. // This is done carefully to give coordinates < p_384 even in @@ -532,10 +574,10 @@ loop: leaq TABENT(%rsp), %rdx leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjadd + callq p384_montjscalarmul_p384_montjadd testq %rbp, %rbp - jne loop + jne p384_montjscalarmul_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. @@ -591,7 +633,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p384_montjadd: +p384_montjscalarmul_p384_montjadd: pushq %rbx pushq %rbp pushq %r12 @@ -5006,7 +5048,7 @@ local_p384_montjadd: popq %rbx ret -local_p384_montjdouble: +p384_montjscalarmul_p384_montjdouble: pushq %rbx pushq %rbp pushq %r12 diff --git a/x86_att/p384/p384_montjscalarmul_alt.S b/x86_att/p384/p384_montjscalarmul_alt.S index a3920e9a407..c666db6dbe9 100644 --- a/x86_att/p384/p384_montjscalarmul_alt.S +++ b/x86_att/p384/p384_montjscalarmul_alt.S @@ -59,6 +59,33 @@ #define NSPACE (56*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock_xz(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+JACSIZE*(I-1)(%rsp), %rax ; \ + cmovzq TAB+JACSIZE*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+JACSIZE*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+JACSIZE*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+JACSIZE*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+JACSIZE*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+JACSIZE*(I-1)+96(%rsp), %r10 ; \ + cmovzq TAB+JACSIZE*(I-1)+104(%rsp), %r11 ; \ + cmovzq TAB+JACSIZE*(I-1)+112(%rsp), %r12 ; \ + cmovzq TAB+JACSIZE*(I-1)+120(%rsp), %r13 ; \ + cmovzq TAB+JACSIZE*(I-1)+128(%rsp), %r14 ; \ + cmovzq TAB+JACSIZE*(I-1)+136(%rsp), %r15 + +#define selectblock_y(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+JACSIZE*(I-1)+48(%rsp), %rax ; \ + cmovzq TAB+JACSIZE*(I-1)+56(%rsp), %rbx ; \ + cmovzq TAB+JACSIZE*(I-1)+64(%rsp), %rcx ; \ + cmovzq TAB+JACSIZE*(I-1)+72(%rsp), %rdx ; \ + cmovzq TAB+JACSIZE*(I-1)+80(%rsp), %r8 ; \ + cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 + S2N_BN_SYMBOL(p384_montjscalarmul_alt): // The Windows version literally calls the standard ABI version. @@ -93,7 +120,7 @@ p384_montjscalarmul_alt_standard: movq %rdi, res -// Reduce the input scalar mod n_256, i.e. conditionally subtract n_256. +// Reduce the input scalar mod n_384, i.e. conditionally subtract n_384. // Store it to "scalarb". 
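
The selectblock_xz/selectblock_y macros defined above replace the former .rep loop with sixteen explicit compare-and-cmovzq passes, so every table entry is read and the secret index never feeds an address calculation. The Python sketch below models only the selection logic against a dummy table; it says nothing about constant-time execution, and the entry layout is simplified:

# Model of the cmov-based table lookup: scan all entries, conditionally
# "move" the matching one into the accumulator via masking rather than
# indexing directly with the secret value.

MASK64 = 2**64 - 1

def select_entry(table, index):
    """table: list of equal-length tuples of 64-bit words; index is 1-based,
    and index 0 selects nothing (the accumulator stays zero)."""
    acc = [0] * len(table[0])                 # registers start zeroed (xorl)
    for i, entry in enumerate(table, start=1):
        keep = MASK64 if i == index else 0    # models the cmpq/cmovzq condition
        acc = [(a & ~keep) | (w & keep) for a, w in zip(acc, entry)]
    return tuple(acc)

tab = [(i, 10 * i, 100 * i) for i in range(1, 17)]   # 16 dummy entries
assert select_entry(tab, 7) == (7, 70, 700)
assert select_entry(tab, 0) == (0, 0, 0)             # zero digit keeps the zero point
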
movq (%rsi), %r8 @@ -172,70 +199,70 @@ p384_montjscalarmul_alt_standard: leaq TAB+JACSIZE*1(%rsp), %rdi leaq TAB(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq TAB+JACSIZE*2(%rsp), %rdi leaq TAB+JACSIZE*1(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_alt_p384_montjadd leaq TAB+JACSIZE*3(%rsp), %rdi leaq TAB+JACSIZE*1(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq TAB+JACSIZE*4(%rsp), %rdi leaq TAB+JACSIZE*3(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_alt_p384_montjadd leaq TAB+JACSIZE*5(%rsp), %rdi leaq TAB+JACSIZE*2(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq TAB+JACSIZE*6(%rsp), %rdi leaq TAB+JACSIZE*5(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_alt_p384_montjadd leaq TAB+JACSIZE*7(%rsp), %rdi leaq TAB+JACSIZE*3(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq TAB+JACSIZE*8(%rsp), %rdi leaq TAB+JACSIZE*7(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_alt_p384_montjadd leaq TAB+JACSIZE*9(%rsp), %rdi leaq TAB+JACSIZE*4(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq TAB+JACSIZE*10(%rsp), %rdi leaq TAB+JACSIZE*9(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_alt_p384_montjadd leaq TAB+JACSIZE*11(%rsp), %rdi leaq TAB+JACSIZE*5(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq TAB+JACSIZE*12(%rsp), %rdi leaq TAB+JACSIZE*11(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_alt_p384_montjadd leaq TAB+JACSIZE*13(%rsp), %rdi leaq TAB+JACSIZE*6(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq TAB+JACSIZE*14(%rsp), %rdi leaq TAB+JACSIZE*13(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p384_montjadd + callq p384_montjscalarmul_alt_p384_montjadd leaq TAB+JACSIZE*15(%rsp), %rdi leaq TAB+JACSIZE*7(%rsp), %rsi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble // Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed // digits. 
The digits of the constant, in lowest-to-highest order, are as @@ -306,23 +333,24 @@ p384_montjscalarmul_alt_standard: xorl %r13d, %r13d xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 16 - cmpq $I, %rdi - cmovzq TAB+JACSIZE*(I-1)(%rsp), %rax - cmovzq TAB+JACSIZE*(I-1)+8(%rsp), %rbx - cmovzq TAB+JACSIZE*(I-1)+16(%rsp), %rcx - cmovzq TAB+JACSIZE*(I-1)+24(%rsp), %rdx - cmovzq TAB+JACSIZE*(I-1)+32(%rsp), %r8 - cmovzq TAB+JACSIZE*(I-1)+40(%rsp), %r9 - cmovzq TAB+JACSIZE*(I-1)+96(%rsp), %r10 - cmovzq TAB+JACSIZE*(I-1)+104(%rsp), %r11 - cmovzq TAB+JACSIZE*(I-1)+112(%rsp), %r12 - cmovzq TAB+JACSIZE*(I-1)+120(%rsp), %r13 - cmovzq TAB+JACSIZE*(I-1)+128(%rsp), %r14 - cmovzq TAB+JACSIZE*(I-1)+136(%rsp), %r15 - .set I, (I+1) -.endr + + selectblock_xz(1) + selectblock_xz(2) + selectblock_xz(3) + selectblock_xz(4) + selectblock_xz(5) + selectblock_xz(6) + selectblock_xz(7) + selectblock_xz(8) + selectblock_xz(9) + selectblock_xz(10) + selectblock_xz(11) + selectblock_xz(12) + selectblock_xz(13) + selectblock_xz(14) + selectblock_xz(15) + selectblock_xz(16) + movq %rax, ACC(%rsp) movq %rbx, ACC+8(%rsp) movq %rcx, ACC+16(%rsp) @@ -342,17 +370,24 @@ p384_montjscalarmul_alt_standard: xorl %edx, %edx xorl %r8d, %r8d xorl %r9d, %r9d - .set I, 1 -.rep 16 - cmpq $I, %rdi - cmovzq TAB+JACSIZE*(I-1)+48(%rsp), %rax - cmovzq TAB+JACSIZE*(I-1)+56(%rsp), %rbx - cmovzq TAB+JACSIZE*(I-1)+64(%rsp), %rcx - cmovzq TAB+JACSIZE*(I-1)+72(%rsp), %rdx - cmovzq TAB+JACSIZE*(I-1)+80(%rsp), %r8 - cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 - .set I, (I+1) -.endr + + selectblock_y(1) + selectblock_y(2) + selectblock_y(3) + selectblock_y(4) + selectblock_y(5) + selectblock_y(6) + selectblock_y(7) + selectblock_y(8) + selectblock_y(9) + selectblock_y(10) + selectblock_y(11) + selectblock_y(12) + selectblock_y(13) + selectblock_y(14) + selectblock_y(15) + selectblock_y(16) + movq %rax, ACC+48(%rsp) movq %rbx, ACC+56(%rsp) movq %rcx, ACC+64(%rsp) @@ -366,28 +401,28 @@ p384_montjscalarmul_alt_standard: movl $380, %ebp -loop: +p384_montjscalarmul_alt_mainloop: subq $5, %rbp leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjdouble + callq p384_montjscalarmul_alt_p384_montjdouble // Choose the bitfield and adjust it to sign and magnitude @@ -434,23 +469,24 @@ loop: xorl %r13d, %r13d xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 16 - cmpq $I, %rdi - cmovzq TAB+JACSIZE*(I-1)(%rsp), %rax - cmovzq TAB+JACSIZE*(I-1)+8(%rsp), %rbx - cmovzq TAB+JACSIZE*(I-1)+16(%rsp), %rcx - cmovzq TAB+JACSIZE*(I-1)+24(%rsp), %rdx - cmovzq TAB+JACSIZE*(I-1)+32(%rsp), %r8 - cmovzq TAB+JACSIZE*(I-1)+40(%rsp), %r9 - cmovzq TAB+JACSIZE*(I-1)+96(%rsp), %r10 - cmovzq TAB+JACSIZE*(I-1)+104(%rsp), %r11 - cmovzq TAB+JACSIZE*(I-1)+112(%rsp), %r12 - cmovzq TAB+JACSIZE*(I-1)+120(%rsp), %r13 - cmovzq TAB+JACSIZE*(I-1)+128(%rsp), %r14 - cmovzq TAB+JACSIZE*(I-1)+136(%rsp), %r15 - .set I, (I+1) -.endr + +selectblock_xz(1) + selectblock_xz(2) + selectblock_xz(3) + selectblock_xz(4) + selectblock_xz(5) + selectblock_xz(6) + selectblock_xz(7) + selectblock_xz(8) + 
selectblock_xz(9) + selectblock_xz(10) + selectblock_xz(11) + selectblock_xz(12) + selectblock_xz(13) + selectblock_xz(14) + selectblock_xz(15) + selectblock_xz(16) + movq %rax, TABENT(%rsp) movq %rbx, TABENT+8(%rsp) movq %rcx, TABENT+16(%rsp) @@ -470,17 +506,23 @@ loop: xorl %edx, %edx xorl %r8d, %r8d xorl %r9d, %r9d - .set I, 1 -.rep 16 - cmpq $I, %rdi - cmovzq TAB+JACSIZE*(I-1)+48(%rsp), %rax - cmovzq TAB+JACSIZE*(I-1)+56(%rsp), %rbx - cmovzq TAB+JACSIZE*(I-1)+64(%rsp), %rcx - cmovzq TAB+JACSIZE*(I-1)+72(%rsp), %rdx - cmovzq TAB+JACSIZE*(I-1)+80(%rsp), %r8 - cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 - .set I, (I+1) -.endr + + selectblock_y(1) + selectblock_y(2) + selectblock_y(3) + selectblock_y(4) + selectblock_y(5) + selectblock_y(6) + selectblock_y(7) + selectblock_y(8) + selectblock_y(9) + selectblock_y(10) + selectblock_y(11) + selectblock_y(12) + selectblock_y(13) + selectblock_y(14) + selectblock_y(15) + selectblock_y(16) // Store it to "tabent" with the y coordinate optionally negated. // This is done carefully to give coordinates < p_384 even in @@ -532,10 +574,10 @@ loop: leaq TABENT(%rsp), %rdx leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p384_montjadd + callq p384_montjscalarmul_alt_p384_montjadd testq %rbp, %rbp - jne loop + jne p384_montjscalarmul_alt_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. @@ -591,7 +633,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p384_montjadd: +p384_montjscalarmul_alt_p384_montjadd: pushq %rbx pushq %rbp pushq %r12 @@ -6422,7 +6464,7 @@ local_p384_montjadd: popq %rbx ret -local_p384_montjdouble: +p384_montjscalarmul_alt_p384_montjdouble: pushq %rbx pushq %rbp pushq %r12
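
For context on the "y coordinate optionally negated" step mentioned above (the hunk that actually performs it is not shown in this excerpt): with Jacobian coordinates the negative of a point (X, Y, Z) is (X, p_384 - Y, Z), and the negation has to map Y = 0 back to 0 so the stored coordinate stays strictly below p_384. A generic, illustrative Python model of that arithmetic; the real code does it branch-free on the 6-limb representation:

# Optionally negate a y coordinate mod p_384, keeping the result in [0, p_384).
# A negative recoded digit selects the negated table entry (x, -y, z).

P384 = 2**384 - 2**128 - 2**96 + 2**32 - 1

def maybe_negate_y(y, negate):
    """y in [0, p_384); negate is a boolean derived from the digit's sign."""
    neg_y = (P384 - y) % P384     # the % folds the y == 0 case back to 0
    return neg_y if negate else y

assert maybe_negate_y(0, True) == 0
assert maybe_negate_y(1, True) == P384 - 1
assert maybe_negate_y(5, False) == 5
assert all(maybe_negate_y(y, True) < P384 for y in (0, 1, P384 - 1))
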