aws · dkostic · Jul 24, 2024 · Aug 28, 2023 · Sep 16, 2023 · Sep 16, 2023
@@ -1242,7 +1242,7 @@ curve25519_x25519_scalarloop:
         usra    v20.2d, v25.2d, #25
         and     v27.16b, v25.16b, v23.16b       // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5
         bfi     x17, x7, #32, #25               // ubignum_of_preglist 1 // INTERMEDIATE z4
-        mov     v5.d[0], x3                     // depth 86
+        mov     v5.d[0], x3
         mov     v1.d[0], x5                     // FINAL z2
         usra    v26.2d, v20.2d, #26             // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5
         and     v28.16b, v20.16b, v30.16b       // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5

@@ -593,8 +593,7 @@ curve25519_x25519_alt_scalarloop:
 
 // Multiplex directly into (xn,zn) then do three pure doubling steps;
 // this accounts for the implicit zeroing of the three lowest bits
-// of the scalar. On the very last doubling we *fully* reduce zn mod
-// p_25519 to ease checking for degeneracy below.
+// of the scalar.
 
         cmp     swap, xzr
         mux_4(xn,xm,xn)
@@ -631,20 +630,20 @@ curve25519_x25519_alt_scalarloop:
         orr     x1, x1, 0x10000
         cmadd_4(e,p,d)
         mul_4(xn,s,d)
-        mul_p25519(zn,p,e)
+        mul_4(zn,p,e)
 
 // The projective result of the scalar multiplication is now (xn,zn).
-// Prepare to call the modular inverse function to get xm = 1/zn
+// Prepare to call the modular inverse function, replacing zn by 1/zn in place
 
-        add     x0, xm
+        add     x0, zn
         add     x1, zn
 
 // Inline copy of bignum_inv_p25519, identical except for stripping out
 // the prologue and epilogue saving and restoring registers and making
 // and reclaiming room on the stack. For more details and explanations see
 // "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for
 // its own temporaries is 128 bytes, so it has no effect on variables
-// that are needed in the rest of our computation here: res, xm and zn.
+// that are needed in the rest of our computation here: res, xn and zn.
 
         mov     x20, x0
         mov     x10, #0xffffffffffffffed
@@ -1675,28 +1674,13 @@ curve25519_x25519_alt_invmidloop:
         stp     x0, x1, [x4]
         stp     x2, x5, [x4, #16]
 
-// Since we eventually want to return 0 when the result is the point at
-// infinity, we force xn = 0 whenever zn = 0. This avoids building in a
-// dependency on the behavior of modular inverse in out-of-scope cases.
-
-        ldp     x0, x1, [zn]
-        ldp     x2, x3, [zn+16]
-        orr     x0, x0, x1
-        orr     x2, x2, x3
-        orr     x4, x0, x2
-        cmp     x4, xzr
-        ldp     x0, x1, [xn]
-        csel    x0, x0, xzr, ne
-        csel    x1, x1, xzr, ne
-        ldp     x2, x3, [xn+16]
-        stp     x0, x1, [xn]
-        csel    x2, x2, xzr, ne
-        csel    x3, x3, xzr, ne
-        stp     x2, x3, [xn+16]
-
 // Now the result is xn * (1/zn), fully reduced modulo p.
+// Note that in the degenerate case zn = 0 (mod p_25519), the
+// modular inverse code above will produce 1/zn = 0, giving
+// the correct overall X25519 result of zero for the point at
+// infinity.
 
-        mul_p25519(resx,xn,xm)
+        mul_p25519(resx,xn,zn)
 
 // Restore stack and registers
 

@@ -1360,7 +1360,7 @@ curve25519_x25519_byte_scalarloop:
         usra    v20.2d, v25.2d, #25
         and     v27.16b, v25.16b, v23.16b       // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5
         bfi     x17, x7, #32, #25               // ubignum_of_preglist 1 // INTERMEDIATE z4
-        mov     v5.d[0], x3                     // depth 86
+        mov     v5.d[0], x3
         mov     v1.d[0], x5                     // FINAL z2
         usra    v26.2d, v20.2d, #26             // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5
         and     v28.16b, v20.16b, v30.16b       // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5

@@ -711,8 +711,7 @@ curve25519_x25519_byte_alt_scalarloop:
 
 // Multiplex directly into (xn,zn) then do three pure doubling steps;
 // this accounts for the implicit zeroing of the three lowest bits
-// of the scalar. On the very last doubling we *fully* reduce zn mod
-// p_25519 to ease checking for degeneracy below.
+// of the scalar.
 
         cmp     swap, xzr
         mux_4(xn,xm,xn)
@@ -749,20 +748,20 @@ curve25519_x25519_byte_alt_scalarloop:
         orr     x1, x1, 0x10000
         cmadd_4(e,p,d)
         mul_4(xn,s,d)
-        mul_p25519(zn,p,e)
+        mul_4(zn,p,e)
 
 // The projective result of the scalar multiplication is now (xn,zn).
-// Prepare to call the modular inverse function to get xm = 1/zn
+// Prepare to call the modular inverse function, replacing zn by 1/zn in place
 
-        add     x0, xm
+        add     x0, zn
         add     x1, zn
 
 // Inline copy of bignum_inv_p25519, identical except for stripping out
 // the prologue and epilogue saving and restoring registers and making
 // and reclaiming room on the stack. For more details and explanations see
 // "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for
 // its own temporaries is 128 bytes, so it has no effect on variables
-// that are needed in the rest of our computation here: res, xm and zn.
+// that are needed in the rest of our computation here: res, xn and zn.
 
         mov     x20, x0
         mov     x10, #0xffffffffffffffed
@@ -1793,28 +1792,13 @@ curve25519_x25519_byte_alt_invmidloop:
         stp     x0, x1, [x4]
         stp     x2, x5, [x4, #16]
 
-// Since we eventually want to return 0 when the result is the point at
-// infinity, we force xn = 0 whenever zn = 0. This avoids building in a
-// dependency on the behavior of modular inverse in out-of-scope cases.
-
-        ldp     x0, x1, [zn]
-        ldp     x2, x3, [zn+16]
-        orr     x0, x0, x1
-        orr     x2, x2, x3
-        orr     x4, x0, x2
-        cmp     x4, xzr
-        ldp     x0, x1, [xn]
-        csel    x0, x0, xzr, ne
-        csel    x1, x1, xzr, ne
-        ldp     x2, x3, [xn+16]
-        stp     x0, x1, [xn]
-        csel    x2, x2, xzr, ne
-        csel    x3, x3, xzr, ne
-        stp     x2, x3, [xn+16]
-
 // Now the result is xn * (1/zn), fully reduced modulo p.
+// Note that in the degenerate case zn = 0 (mod p_25519), the
+// modular inverse code above will produce 1/zn = 0, giving
+// the correct overall X25519 result of zero for the point at
+// infinity.
 
-        mul_p25519(zn,xn,xm)
+        mul_p25519(zn,xn,zn)
 
         ldp     x10, x11, [zn]
         strb    w10, [resx]

@@ -1,5 +1,5 @@
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0 OR ISC
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
 
 // ----------------------------------------------------------------------------
 // Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]

@@ -1,5 +1,5 @@
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0 OR ISC
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
 
 // ----------------------------------------------------------------------------
 // Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1]

@@ -1,5 +1,5 @@
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0 OR ISC
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
 
 // ----------------------------------------------------------------------------
 // Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1]

@@ -1,9 +1,9 @@
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0 OR ISC
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
 
 // ----------------------------------------------------------------------------
 // Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
-// into z[0..width-1]. width must be a mutiple of 8.
+// into z[0..width-1]. width must be a multiple of 8.
 // This function is constant-time with respect to the value of `idx`. This is
 // achieved by reading the whole table and using the bit-masking to get the
 // `idx`-th row.

@@ -35,8 +35,10 @@ OBJ = bignum_add_p384.o \
       bignum_mod_p384_6.o \
       bignum_montmul_p384.o \
       bignum_montmul_p384_alt.o \
+      bignum_montmul_p384_neon.o \
       bignum_montsqr_p384.o \
       bignum_montsqr_p384_alt.o \
+      bignum_montsqr_p384_neon.o \
       bignum_mux_6.o \
       bignum_neg_p384.o \
       bignum_nonzero_6.o \
@@ -45,8 +47,11 @@ OBJ = bignum_add_p384.o \
       bignum_tomont_p384.o \
       bignum_triple_p384.o \
       p384_montjadd.o \
+      p384_montjadd_alt.o \
       p384_montjdouble.o \
-      p384_montjmixadd.o
+      p384_montjdouble_alt.o \
+      p384_montjmixadd.o \
+      p384_montjmixadd_alt.o
 
 %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ -