Skip to content

Commit

Permalink
Update montgomery multiplication to use s2n-bignum's verified scalar …
Browse files Browse the repository at this point in the history
…bignum functions (#1135)

This is adopted only for AArch64 CPUs that have narrow multiplication-instruction bandwidth.
To apply s2n-bignum selectively to Graviton 2, this patch calls CRYPTO_is_ARMv8_wide_multiplier_capable() and runs the s2n-bignum functions only when the CPU is not wide-multiplier capable.

The performance numbers of RSA signing are as follows. Graviton 2 is used, and `tool/bssl speed -filter RSA` has been used. (Unit: ops/sec).

Bits | Operation | baseline AWS-LC | s2n-bignum | speedup vs baseline
-- | -- | -- | -- | --
2048 | RSA sign           |   299.3 |   399   | 33.31%
     | verify (fresh key) | 10736.3 | 15491   | 44.29%
3072 | RSA sign           |    95.4 |   113.2 | 18.66%
     | verify (fresh key) |  4917.7 |  6001.7 | 22.04%
4096 | RSA sign           |    41.7 |    63.2 | 51.56%
     | verify (fresh key) |  2781.6 |  3451   | 24.07%
  • Loading branch information
aqjune-aws authored Aug 17, 2023
1 parent 61370c2 commit b706d7e
Show file tree
Hide file tree
Showing 6 changed files with 217 additions and 18 deletions.
10 changes: 7 additions & 3 deletions crypto/curve25519/curve25519.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,13 @@
#include "../internal.h"
#include "../fipsmodule/cpucap/internal.h"

#if (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) && \
!defined(OPENSSL_NO_ASM) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
// If (1) x86_64 (with an assembler that supports AVX) or aarch64, (2) linux or
// apple, and (3) OPENSSL_NO_ASM is not set, the s2n-bignum path is available.
#if ((defined(OPENSSL_X86_64) && \
!defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)) || \
defined(OPENSSL_AARCH64)) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) && \
!defined(OPENSSL_NO_ASM)
#include "../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h"
#define CURVE25519_S2N_BIGNUM_CAPABLE
#endif
Expand Down
19 changes: 17 additions & 2 deletions crypto/fipsmodule/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,9 @@ endif()

# s2n-bignum files can be compiled on Unix platforms only (except Apple),
# and on x86_64 and aarch64 systems only.
if((ARCH STREQUAL "x86_64" OR ARCH STREQUAL "aarch64") AND
UNIX AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) OR
ARCH STREQUAL "aarch64") AND
UNIX)

# Set the source directory for s2n-bignum assembly files
if(ARCH STREQUAL "x86_64")
Expand Down Expand Up @@ -229,6 +230,20 @@ if((ARCH STREQUAL "x86_64" OR ARCH STREQUAL "aarch64") AND
curve25519/curve25519_x25519base_byte.S
curve25519/curve25519_x25519base_byte_alt.S
)

# Big integer arithmetic using s2n-bignum
list(APPEND S2N_BIGNUM_ASM_SOURCES
fastmul/bignum_kmul_16_32.S
fastmul/bignum_kmul_32_64.S
fastmul/bignum_ksqr_16_32.S
fastmul/bignum_ksqr_32_64.S
fastmul/bignum_emontredc_8n.S

generic/bignum_ge.S
generic/bignum_mul.S
generic/bignum_optsub.S
generic/bignum_sqr.S
)
endif()
endif()

Expand Down
120 changes: 115 additions & 5 deletions crypto/fipsmodule/bn/montgomery.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,37 @@
#include <openssl/type_check.h>

#include "internal.h"
#include "../cpucap/internal.h"
#include "../../internal.h"

#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) && \
defined(OPENSSL_AARCH64) && defined(OPENSSL_BN_ASM_MONT)

#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h"

#define BN_MONTGOMERY_S2N_BIGNUM_CAPABLE 1

// Returns 1 when Montgomery multiplication of |num|-word operands should be
// done with s2n-bignum, and 0 otherwise. s2n-bignum is selected only if
//   (1) the ARM CPU is not wide-multiplier capable, and
//   (2) |num| (the number of words) is a multiple of 8, which s2n-bignum's
//       bignum_emontredc_8n requires.
// The word size must also be 64 bits; this is asserted below.
OPENSSL_INLINE int montgomery_use_s2n_bignum(unsigned int num) {
  // A single temporary buffer of S2NBIGNUM_KMUL_32_64_TEMP_NWORDS words is
  // shared by all four Karatsuba routines, so none of the others may need
  // more than that.
  assert(S2NBIGNUM_KSQR_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS &&
         S2NBIGNUM_KSQR_32_64_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS &&
         S2NBIGNUM_KMUL_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS);
  assert(BN_BITS2 == 64);

  if (CRYPTO_is_ARMv8_wide_multiplier_capable()) {
    // Wide multipliers are available: keep using the existing bn_mul_mont.
    return 0;
  }
  return num % 8 == 0;
}

#else

// s2n-bignum Montgomery multiplication is not available in this build
// configuration, so it is never selected regardless of |num|.
OPENSSL_INLINE int montgomery_use_s2n_bignum(unsigned int num) {
  (void)num;
  return 0;
}

#endif

BN_MONT_CTX *BN_MONT_CTX_new(void) {
BN_MONT_CTX *ret = OPENSSL_malloc(sizeof(BN_MONT_CTX));
Expand Down Expand Up @@ -418,6 +447,80 @@ static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a,
return ret;
}


#if defined(OPENSSL_BN_ASM_MONT)

// Montgomery multiplication implemented with s2n-bignum primitives. The
// arguments have the same meaning as those of bn_mul_mont.
//
// |num| must be a multiple of 8; callers must check this beforehand with
// montgomery_use_s2n_bignum(num). Dedicated, faster s2n-bignum primitives are
// used when |num| is 32 or 16.
//
// This function places S2NBIGNUM_KMUL_32_64_TEMP_NWORDS +
// 2 * BN_MONTGOMERY_MAX_WORDS uint64_t words on the stack.
static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
                                           const BN_ULONG *bp,
                                           const BN_ULONG *np,
                                           const BN_ULONG *n0, size_t num) {
#if defined(BN_MONTGOMERY_S2N_BIGNUM_CAPABLE)

  // Scratch space for the Karatsuba routines. It is sized for
  // bignum_kmul_32_64, which needs S2NBIGNUM_KMUL_32_64_TEMP_NWORDS words.
  uint64_t karatsuba_tmp[S2NBIGNUM_KMUL_32_64_TEMP_NWORDS];
  // Receives the double-width product; only the first 2 * num words are used.
  // The caller (BN_mod_mul_montgomery) guarantees
  // num <= BN_MONTGOMERY_MAX_WORDS.
  uint64_t product[2 * BN_MONTGOMERY_MAX_WORDS];

  // With m the modulus stored at np, m * neg_minv = -1 mod 2^64.
  const uint64_t neg_minv = n0[0];

  // Identical operand pointers mean a squaring, which has cheaper primitives.
  const int is_square = (ap == bp);

  switch (num) {
    case 32:
      if (is_square) {
        bignum_ksqr_32_64(product, ap, karatsuba_tmp);
      } else {
        bignum_kmul_32_64(product, ap, bp, karatsuba_tmp);
      }
      break;
    case 16:
      if (is_square) {
        bignum_ksqr_16_32(product, ap, karatsuba_tmp);
      } else {
        bignum_kmul_16_32(product, ap, bp, karatsuba_tmp);
      }
      break;
    default:
      // Generic sizes: no Karatsuba scratch buffer is needed.
      if (is_square) {
        bignum_sqr(num * 2, product, num, ap);
      } else {
        bignum_mul(num * 2, product, num, ap, num, bp);
      }
      break;
  }

  // Montgomery reduction, following its definition with R = 2^(64*num):
  //   1. Compute (product + ((product mod R) * (-m^-1 mod R) mod R) * m) / R
  //      with bignum_emontredc_8n. The value is left in
  //      product[num .. 2*num-1]; if it is >= 2^(64*num), bignum_emontredc_8n
  //      returns 1.
  //   2. Subtract m from that value if it is >= m. This is the case when
  //      either
  //        A. step 1 overflowed 2^(64*num) (bignum_emontredc_8n returned 1);
  //           since m < 2^(64*num), the value is certainly >= m, or
  //        B. the value fits in num words and compares >= m.
  uint64_t *const unreduced = product + num;
  uint64_t needs_sub =
      bignum_emontredc_8n(num, product, np, neg_minv);  // case A
  needs_sub |= bignum_ge(num, unreduced, num, np);      // case B
  // Conditionally subtract m and write the final result to rp.
  bignum_optsub(num, rp, unreduced, needs_sub, np);

#else

  // This function must not be called unless s2n-bignum is supported.
  abort();

#endif
}

#endif


int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BN_MONT_CTX *mont, BN_CTX *ctx) {
if (a->neg || b->neg) {
Expand All @@ -437,11 +540,18 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
// This bound is implied by |bn_mont_ctx_set_N_and_n0|. |bn_mul_mont|
// allocates |num| words on the stack, so |num| cannot be too large.
assert((size_t)num <= BN_MONTGOMERY_MAX_WORDS);
if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
// The check above ensures this won't happen.
assert(0);
OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
return 0;

if (montgomery_use_s2n_bignum(num)) {
// Do montgomery multiplication using s2n-bignum.
montgomery_s2n_bignum_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0,
num);
} else {
if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
// The check above ensures this won't happen.
assert(0);
OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
return 0;
}
}
r->neg = 0;
r->width = num;
Expand Down
10 changes: 6 additions & 4 deletions crypto/fipsmodule/ec/p384.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@
// #define p384_felem_add(out, in0, in1) bignum_add_p384(out, in0, in1)
// when s2n-bignum is used.
//
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) && \
(defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
!defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
// If (1) x86_64 (with an assembler that supports AVX) or aarch64, (2) linux or
// apple, and (3) OPENSSL_NO_ASM is not set, the s2n-bignum path is available.
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) && \
((defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)) || \
defined(OPENSSL_AARCH64))

# include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h"

Expand Down
10 changes: 6 additions & 4 deletions crypto/fipsmodule/ec/p521.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@
// when Fiat-crypto is used, or as:
// #define p521_felem_add(out, in0, in1) bignum_add_p521(out, in0, in1)
// when s2n-bignum is used.
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) && \
(defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
!defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
// If (1) x86_64 (with an assembler that supports AVX) or aarch64, (2) linux or
// apple, and (3) OPENSSL_NO_ASM is not set, the s2n-bignum path is available.
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) && \
((defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)) || \
defined(OPENSSL_AARCH64))

# include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h"
# define P521_USE_S2N_BIGNUM_FIELD_ARITH 1
Expand Down
66 changes: 66 additions & 0 deletions third_party/s2n-bignum/include/s2n-bignum_aws-lc.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,69 @@ extern void curve25519_x25519_byte_alt(uint8_t res[static 32], const uint8_t sca
// another u-coordinate, is saved in |res|.
extern void curve25519_x25519base_byte(uint8_t res[static 32], const uint8_t scalar[static 32]);
extern void curve25519_x25519base_byte_alt(uint8_t res[static 32], const uint8_t scalar[static 32]);

// Evaluate z := x^2 where x is a 2048-bit integer.
// Input: x[32]; output: z[64]; temporary buffer: t[>=72]
#define S2NBIGNUM_KSQR_32_64_TEMP_NWORDS 72
extern void
bignum_ksqr_32_64(uint64_t z[static 64], const uint64_t x[static 32],
uint64_t t[static S2NBIGNUM_KSQR_32_64_TEMP_NWORDS]);

// Evaluate z := x^2 where x is a 1024-bit integer.
// Input: x[16]; output: z[32]; temporary buffer: t[>=24]
#define S2NBIGNUM_KSQR_16_32_TEMP_NWORDS 24
extern void
bignum_ksqr_16_32(uint64_t z[static 32], const uint64_t x[static 16],
uint64_t t[static S2NBIGNUM_KSQR_16_32_TEMP_NWORDS]);

// Evaluate z := x * y where x and y are 2048-bit integers.
// Inputs: x[32], y[32]; output: z[64]; temporary buffer t[>=96]
#define S2NBIGNUM_KMUL_32_64_TEMP_NWORDS 96
extern void
bignum_kmul_32_64(uint64_t z[static 64], const uint64_t x[static 32],
const uint64_t y[static 32],
uint64_t t[static S2NBIGNUM_KMUL_32_64_TEMP_NWORDS]);

// Evaluate z := x * y where x and y are 1024-bit integers.
// Inputs: x[16], y[16]; output: z[32]; temporary buffer t[>=32]
#define S2NBIGNUM_KMUL_16_32_TEMP_NWORDS 32
extern void
bignum_kmul_16_32(uint64_t z[static 32], const uint64_t x[static 16],
const uint64_t y[static 16],
uint64_t t[static S2NBIGNUM_KMUL_16_32_TEMP_NWORDS]);

// Extended Montgomery reduce in 8-digit blocks.
//
// Preconditions: z initially holds a 2k-digit bignum z_0, m is a k-digit odd
// bignum and m * w == -1 (mod 2^64). The caller in montgomery.c only uses
// this function when k is a multiple of 8, because this function requires it.
//
// Outputs: z is overwritten and a carry c of 0 or 1 is returned. Together
// they encode two numbers: the lower half of the z buffer holds
// q = z[0..k-1], while the upper half together with the carry gives
// r = 2^{64k}*c + z[k..2k-1]. These values satisfy
//   z_0 + q * m = 2^{64k} * r,
// i.e. r gives a raw (unreduced) Montgomery reduction while q gives the
// multiplier that was used.
//
// Note that q = (z_0 mod 2^{64k}) * (-m^-1 mod 2^{64k}) mod 2^{64k}, because
// the identity above implies:
//   z_0 + q * m = 0 mod 2^{64k}
//         q * m = -z_0 mod 2^{64k}
//             q = -z_0 * m^-1 mod 2^{64k}
//               = (z_0 mod 2^{64k}) * (-m^-1 mod 2^{64k}) mod 2^{64k}
// q is uniquely determined because q must be in the range of [0, 2^{64k}-1].
//
// Inputs: z[2*k], m[k], w; outputs: function return (extra result bit) and z[2*k]
extern uint64_t bignum_emontredc_8n(uint64_t k, uint64_t *z, const uint64_t *m,
uint64_t w);

// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero)
// Inputs: x[k], p, y[k]; outputs: function return (carry-out) and z[k]
extern uint64_t bignum_optsub(uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p,
const uint64_t *y);

// Compare bignums, x >= y.
// Inputs: x[m], y[n]; output: function return (1 if x >= y)
extern uint64_t bignum_ge(uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y);

// General big-integer multiplication (z := x * y).
// Inputs: x[m], y[n]; output: z[k]. If k < m+n, the result is truncated.
extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
uint64_t n, const uint64_t *y);

// General big-integer squaring (z := x^2).
// Inputs: x[m]; output: z[k]. If k < 2m, the result is truncated.
extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x);

0 comments on commit b706d7e

Please sign in to comment.