aws · nebeid · Aug 17, 2023 · Aug 3, 2023 · Aug 3, 2023 · Aug 7, 2023
@@ -229,6 +229,20 @@ if((ARCH STREQUAL "x86_64" OR ARCH STREQUAL "aarch64") AND
                 curve25519/curve25519_x25519base_byte.S
                 curve25519/curve25519_x25519base_byte_alt.S
                 )
+
+    # Big-integer arithmetic using s2n-bignum
+    list(APPEND S2N_BIGNUM_ASM_SOURCES
+                fastmul/bignum_kmul_16_32.S
+                fastmul/bignum_kmul_32_64.S
+                fastmul/bignum_ksqr_16_32.S
+                fastmul/bignum_ksqr_32_64.S
+                fastmul/bignum_emontredc_8n.S
+
+                generic/bignum_ge.S
+                generic/bignum_mul.S
+                generic/bignum_optsub.S
+                generic/bignum_sqr.S
+                )
   endif()
 endif()
 

@@ -119,8 +119,47 @@
 #include <openssl/type_check.h>
 
 #include "internal.h"
+#include "../cpucap/internal.h"
 #include "../../internal.h"
 
+#if !defined(OPENSSL_NO_ASM) &&                                                \
+    (defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) &&                      \
+    defined(OPENSSL_AARCH64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) &&   \
+    defined(OPENSSL_BN_ASM_MONT)
+// Enable s2n-bignum only on AArch64 Linux/Apple builds that use assembly.
+#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h"
+// Compile-time marker that the s2n-bignum Montgomery path is available.
+#define BN_MONTGOMERY_USE_S2N_BIGNUM 1
+// NOTE(review): MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX names an x86 feature; confirm.
+#endif  // s2n-bignum Montgomery support
+
+// Reports whether this build was compiled with s2n-bignum Montgomery support.
+// Returns 1 when BN_MONTGOMERY_USE_S2N_BIGNUM is defined and 0 otherwise; the
+// answer is fixed at compile time.
+OPENSSL_INLINE int montgomery_s2n_bignum_capable(void) {
+#if defined(BN_MONTGOMERY_USE_S2N_BIGNUM)
+  const int built_with_s2n_bignum = 1;
+#else
+  const int built_with_s2n_bignum = 0;
+#endif
+  return built_with_s2n_bignum;
+}
+
+// Decides at run time whether a |num|-word Montgomery multiplication should be
+// routed to s2n-bignum. Returns 1 to use s2n-bignum and 0 to use the default
+// path; always 0 when BN_MONTGOMERY_USE_S2N_BIGNUM is not defined.
+OPENSSL_INLINE int montgomery_use_s2n_bignum(unsigned int num) {
+#if defined(BN_MONTGOMERY_USE_S2N_BIGNUM)
+  // s2n-bignum is used only when the ARM core has slow multipliers.
+  if (CRYPTO_is_ARMv8_wide_multiplier_capable()) {
+    return 0;
+  }
+  // Only word counts that are a multiple of 8, with 64-bit words, are routed.
+  if (num % 8 != 0 || BN_BITS2 != 64) {
+    return 0;
+  }
+  // The double-width product (2 * num words) plus 96 words of scratch space
+  // must not exceed BN_MONTGOMERY_MAX_WORDS.
+  const uint64_t words_needed = 2 * (uint64_t)num + 96;
+  return words_needed <= BN_MONTGOMERY_MAX_WORDS;
+#else
+  return 0;
+#endif
+}
 
 BN_MONT_CTX *BN_MONT_CTX_new(void) {
   BN_MONT_CTX *ret = OPENSSL_malloc(sizeof(BN_MONT_CTX));
@@ -418,6 +457,77 @@ static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a,
   return ret;
 }
 
+
+#if defined(OPENSSL_BN_ASM_MONT)
+
+static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
+                                           const BN_ULONG *bp,
+                                           const BN_ULONG *np,
+                                           const BN_ULONG *n0, size_t num) {
+  // Montgomery-multiplies ap by bp modulo np; writes the num-word result to rp.
+#if defined(BN_MONTGOMERY_USE_S2N_BIGNUM)
+
+  // t is a temporary buffer used by the fixed-size multiply/square routines.
+  // Its largest user, bignum_kmul_32_64, requires 96 words.
+  uint64_t t[96];
+  // mulres is the output buffer of big-int multiplication.
+  // Only its low num*2 elements are used.
+  // It is montgomery_use_s2n_bignum() that checks whether num*2 fits in the
+  // size of the mulres array.
+  uint64_t mulres[BN_MONTGOMERY_MAX_WORDS - 96];
+
+  // BN_ULONG is uint64_t since BN_BITS2 is 64.
+  // m is the modulus, and m * w = -1 mod 2^64.
+  uint64_t *m = (uint64_t *)np;
+  uint64_t w = (uint64_t)n0[0];
+  uint64_t *src = (uint64_t *)ap, *src2 = (uint64_t *)bp;
+  uint64_t *dest = (uint64_t *)rp;
+  uint64_t c;
+  // Compute the double-width product (a square when ap and bp alias).
+  if (num == 32) {
+    if (ap == bp)
+      bignum_ksqr_32_64(mulres, src, t);
+    else
+      bignum_kmul_32_64(mulres, src2, src, t);
+  } else if (num == 16) {
+    if (ap == bp)
+      bignum_ksqr_16_32(mulres, src, t);
+    else
+      bignum_kmul_16_32(mulres, src2, src, t);
+  } else {
+    if (ap == bp)
+      bignum_sqr(num * 2, mulres, num, src);
+    else
+      bignum_mul(num * 2, mulres, num, src2, num, src);
+  }
+
+  // Do montgomery reduction. We follow the definition of montgomery reduction
+  // which is:
+  // 1. Calculate (mulres + ((mulres mod R) * (-m^-1 mod R) mod R) * m) / R
+  //    using bignum_emontredc_8n, where R is 2^(64*num).
+  //    The calculated result is stored in the upper half elements of the
+  //    mulres buffer.
+  //    If the result overflows num words, bignum_emontredc_8n returns 1.
+  // 2. Subtract m from the result of step 1 if that result is >= m.
+  //    The comparison is true if (1) there is an overflow (bignum_emontredc_8n
+  //    returns 1), or (2) the upper half of mulres is greater than or equal
+  //    to m.
+  c = bignum_emontredc_8n(num, mulres, m, w);
+  c |= bignum_ge(num, mulres + num, num, m);
+  // Perform step 2 and store the result at dest (which is rp).
+  bignum_optsub(num, dest, mulres + num, c, m);
+
+#else
+
+  // Should not call this function unless s2n-bignum is supported.
+  abort();
+
+#endif
+}
+
+#endif
+
+
 int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                           const BN_MONT_CTX *mont, BN_CTX *ctx) {
   if (a->neg || b->neg) {
@@ -437,11 +547,18 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
     // This bound is implied by |bn_mont_ctx_set_N_and_n0|. |bn_mul_mont|
     // allocates |num| words on the stack, so |num| cannot be too large.
     assert((size_t)num <= BN_MONTGOMERY_MAX_WORDS);
-    if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
-      // The check above ensures this won't happen.
-      assert(0);
-      OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
-      return 0;
+
+    if (montgomery_s2n_bignum_capable() && montgomery_use_s2n_bignum(num)) {
+      // Do montgomery multiplication using s2n-bignum.
+      montgomery_s2n_bignum_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0,
+                                     num);
+    } else {
+      if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
+        // The check above ensures this won't happen.
+        assert(0);
+        OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
+        return 0;
+      }
     }
     r->neg = 0;
     r->width = num;

@@ -130,3 +130,53 @@ extern void curve25519_x25519_byte_alt(uint8_t res[static 32], const uint8_t sca
 // another u-coordinate, is saved in |res|.
 extern void curve25519_x25519base_byte(uint8_t res[static 32], const uint8_t scalar[static 32]);
 extern void curve25519_x25519base_byte_alt(uint8_t res[static 32], const uint8_t scalar[static 32]);
+
+// Evaluate z := x^2 where x is a 2048-bit integer.
+// Input: x[32] (read-only); output: z[64]; temporary buffer: t[>=72]
+extern void bignum_ksqr_32_64(uint64_t z[static 64],
+                              const uint64_t x[static 32],
+                              uint64_t t[static 72]);
+
+// Evaluate z := x^2 where x is a 1024-bit integer.
+// Input: x[16] (read-only); output: z[32]; temporary buffer: t[>=24]
+extern void bignum_ksqr_16_32(uint64_t z[static 32],
+                              const uint64_t x[static 16],
+                              uint64_t t[static 24]);
+
+// Evaluate z := x * y where x and y are 2048-bit integers.
+// Inputs: x[32], y[32] (read-only); output: z[64]; temporary buffer t[>=96]
+extern void bignum_kmul_32_64(uint64_t z[static 64],
+                              const uint64_t x[static 32],
+                              const uint64_t y[static 32],
+                              uint64_t t[static 96]);
+
+// Evaluate z := x * y where x and y are 1024-bit integers.
+// Inputs: x[16], y[16] (read-only); output: z[32]; temporary buffer t[>=32]
+extern void bignum_kmul_16_32(uint64_t z[static 32],
+                              const uint64_t x[static 16],
+                              const uint64_t y[static 16],
+                              uint64_t t[static 32]);
+
+// Extended Montgomery reduce in 8-digit blocks.
+// Assumes that z initially holds a 2k-digit bignum z_0, m is a k-digit odd
+// bignum and m * w == -1 (mod 2^64). This function also uses z for the output
+// as well as returning a carry c of 0 or 1. This encodes two numbers: in the
+// lower half of the z buffer we have q = z[0..k-1], while the upper half
+// together with the carry gives r = 2^{64k}*c + z[k..2k-1]. These values
+// satisfy z_0 + q * m = 2^{64k} * r, i.e. r gives a raw (unreduced) Montgomery
+// reduction while q gives the multiplier that was used.
+// Inputs: z[2*k], m[k] (read-only), w; outputs: function return (extra result
+// bit) and z[2*k]
+extern uint64_t bignum_emontredc_8n(uint64_t k, uint64_t *z, const uint64_t *m,
+                                    uint64_t w);
+
+// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero)
+// Inputs: x[k], p, y[k] (x and y read-only); outputs: function return
+// (carry-out) and z[k]
+extern uint64_t bignum_optsub(uint64_t k, uint64_t *z, const uint64_t *x,
+                              uint64_t p, const uint64_t *y);
+
+// Compare bignums, x >= y.
+// Inputs: x[m], y[n] (both read-only); output: function return (1 if x >= y)
+extern uint64_t bignum_ge(uint64_t m, const uint64_t *x, uint64_t n,
+                          const uint64_t *y);
+
+// General big-integer multiplication (z := x * y).
+// Inputs: x[m], y[n] (both read-only); output: z[k]. If k < m+n, the result is
+// truncated.
+extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
+                       uint64_t n, const uint64_t *y);
+
+// General big-integer squaring (z := x^2).
+// Inputs: x[m] (read-only); output: z[k]. If k < 2m, the result is truncated.
+extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x);