Skip to content

Commit

Permalink
Merge pull request aws#81 from aqjune-aws/tablelookup
Browse files Browse the repository at this point in the history
Add bignum_copy_row_from_table and its Neon-variants for AArch64
s2n-bignum original commit: awslabs/s2n-bignum@50aa85b
  • Loading branch information
jargh authored Sep 16, 2023
2 parents c9c7e11 + 882ea11 commit 9130626
Show file tree
Hide file tree
Showing 4 changed files with 490 additions and 0 deletions.
81 changes: 81 additions & 0 deletions arm/generic/bignum_copy_row_from_table.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
// into z[0..width-1].
// This function is constant-time with respect to the value of `idx`. This is
// achieved by reading the whole table and using the bit-masking to get the
// `idx`-th row.
//
// extern void bignum_copy_from_table
// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t width,
// uint64_t idx);
//
// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = width, X4 = idx
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table)
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table)
.text
.balign 4

#define z x0
#define table x1
#define height x2
#define width x3
#define idx x4

#define i x5
#define mask x6
#define j x7

S2N_BN_SYMBOL(bignum_copy_row_from_table):

cbz height, bignum_copy_row_from_table_end
cbz width, bignum_copy_row_from_table_end
mov i, width
mov x6, z

bignum_copy_row_from_table_initzero:
str xzr, [x6]
add x6, x6, #8
subs i, i, #1
bne bignum_copy_row_from_table_initzero

mov i, xzr
mov x8, table

bignum_copy_row_from_table_outerloop:

cmp i, idx
csetm mask, eq

mov j, width
mov x9, z

bignum_copy_row_from_table_innerloop:

ldr x10, [x8]
ldr x11, [x9]
and x10, x10, mask
orr x11, x11, x10
str x11, [x9]

add x8, x8, #8
add x9, x9, #8
subs j, j, #1
bne bignum_copy_row_from_table_innerloop

bignum_copy_row_from_table_innerloop_done:
add i, i, #1
cmp i, height
bne bignum_copy_row_from_table_outerloop

bignum_copy_row_from_table_end:
ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
126 changes: 126 additions & 0 deletions arm/generic/bignum_copy_row_from_table_16_neon.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1]
// into z[0..row-1].
// This function is constant-time with respect to the value of `idx`. This is
// achieved by reading the whole table and using the bit-masking to get the
// `idx`-th row.
//
// extern void bignum_copy_from_table_16_neon
// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx);
//
// Initial version written by Hanno Becker
// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X4 = idx
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_16_neon)
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_16_neon)
.text
.balign 4


// *****************************************************
// Main code
// *****************************************************

#define z x0
#define tbl x1
#define height x2
#define idx x3

#define mask x5
#define cnt x6

#define ventry0 v20
#define qentry0 q20
#define ventry1 v21
#define qentry1 q21
#define ventry2 v22
#define qentry2 q22
#define ventry3 v23
#define qentry3 q23
#define ventry4 v24
#define qentry4 q24
#define ventry5 v25
#define qentry5 q25
#define ventry6 v26
#define qentry6 q26
#define ventry7 v27
#define qentry7 q27
#define ventry8 v28

#define vtmp v16
#define qtmp q16

#define vmask v17

S2N_BN_SYMBOL(bignum_copy_row_from_table_16_neon):

// Clear accumulator
// Zeroing can be done via xor, but xor isn't formalized yet.
dup ventry0.2d, xzr
mov ventry1.16b, ventry0.16b
mov ventry2.16b, ventry0.16b
mov ventry3.16b, ventry0.16b
mov ventry4.16b, ventry0.16b
mov ventry5.16b, ventry0.16b
mov ventry6.16b, ventry0.16b
mov ventry7.16b, ventry0.16b

mov cnt, #0
bignum_copy_row_from_table_16_neon_loop:

// Compute mask: Check if current index matches target index
subs xzr, cnt, idx
cinv mask, xzr, eq
dup vmask.2d, mask

ldr qtmp, [tbl, #16*0]
bit ventry0.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*1]
bit ventry1.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*2]
bit ventry2.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*3]
bit ventry3.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*4]
bit ventry4.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*5]
bit ventry5.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*6]
bit ventry6.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*7]
bit ventry7.16b, vtmp.16b, vmask.16b

add tbl, tbl, #16*8

add cnt, cnt, #1
subs xzr, height, cnt
b.ne bignum_copy_row_from_table_16_neon_loop

bignum_copy_row_from_table_16_neon_end:

str qentry0, [z, #16*0]
str qentry1, [z, #16*1]
str qentry2, [z, #16*2]
str qentry3, [z, #16*3]
str qentry4, [z, #16*4]
str qentry5, [z, #16*5]
str qentry6, [z, #16*6]
str qentry7, [z, #16*7]

ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
181 changes: 181 additions & 0 deletions arm/generic/bignum_copy_row_from_table_32_neon.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1]
// into z[0..row-1].
// This function is constant-time with respect to the value of `idx`. This is
// achieved by reading the whole table and using the bit-masking to get the
// `idx`-th row.
//
// extern void bignum_copy_from_table_32_neon
// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx);
//
// Initial version written by Hanno Becker
// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X4 = idx
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_32_neon)
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_32_neon)
.text
.balign 4


// *****************************************************
// Main code
// *****************************************************

#define z x0
#define tbl x1
#define height x2
#define idx x3

#define mask x5
#define cnt x6

#define ventry0 v20
#define qentry0 q20
#define ventry1 v21
#define qentry1 q21
#define ventry2 v22
#define qentry2 q22
#define ventry3 v23
#define qentry3 q23
#define ventry4 v24
#define qentry4 q24
#define ventry5 v25
#define qentry5 q25
#define ventry6 v26
#define qentry6 q26
#define ventry7 v27
#define qentry7 q27
#define ventry8 v28
#define qentry8 q28
#define ventry9 v29
#define qentry9 q29
#define ventry10 v30
#define qentry10 q30
#define ventry11 v31
#define qentry11 q31
#define ventry12 v0
#define qentry12 q0
#define ventry13 v1
#define qentry13 q1
#define ventry14 v2
#define qentry14 q2
#define ventry15 v3
#define qentry15 q3

#define vtmp v16
#define qtmp q16

#define vmask v17

S2N_BN_SYMBOL(bignum_copy_row_from_table_32_neon):

// Clear accumulator
// Zeroing can be done via xor, but xor isn't formalized yet.
dup ventry0.2d, xzr
mov ventry1.16b, ventry0.16b
mov ventry2.16b, ventry0.16b
mov ventry3.16b, ventry0.16b
mov ventry4.16b, ventry0.16b
mov ventry5.16b, ventry0.16b
mov ventry6.16b, ventry0.16b
mov ventry7.16b, ventry0.16b
mov ventry8.16b, ventry0.16b
mov ventry9.16b, ventry0.16b
mov ventry10.16b, ventry0.16b
mov ventry11.16b, ventry0.16b
mov ventry12.16b, ventry0.16b
mov ventry13.16b, ventry0.16b
mov ventry14.16b, ventry0.16b
mov ventry15.16b, ventry0.16b

mov cnt, #0
bignum_copy_row_from_table_32_neon_loop:

// Compute mask: Check if current index matches target index
subs xzr, cnt, idx
cinv mask, xzr, eq
dup vmask.2d, mask

ldr qtmp, [tbl, #16*0]
bit ventry0.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*1]
bit ventry1.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*2]
bit ventry2.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*3]
bit ventry3.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*4]
bit ventry4.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*5]
bit ventry5.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*6]
bit ventry6.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*7]
bit ventry7.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*8]
bit ventry8.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*9]
bit ventry9.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*10]
bit ventry10.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*11]
bit ventry11.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*12]
bit ventry12.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*13]
bit ventry13.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*14]
bit ventry14.16b, vtmp.16b, vmask.16b

ldr qtmp, [tbl, #16*15]
bit ventry15.16b, vtmp.16b, vmask.16b

add tbl, tbl, #32*8

add cnt, cnt, #1
subs xzr, height, cnt
b.ne bignum_copy_row_from_table_32_neon_loop

bignum_copy_row_from_table_32_neon_end:

str qentry0, [z, #16*0]
str qentry1, [z, #16*1]
str qentry2, [z, #16*2]
str qentry3, [z, #16*3]
str qentry4, [z, #16*4]
str qentry5, [z, #16*5]
str qentry6, [z, #16*6]
str qentry7, [z, #16*7]
str qentry8, [z, #16*8]
str qentry9, [z, #16*9]
str qentry10, [z, #16*10]
str qentry11, [z, #16*11]
str qentry12, [z, #16*12]
str qentry13, [z, #16*13]
str qentry14, [z, #16*14]
str qentry15, [z, #16*15]

ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
Loading

0 comments on commit 9130626

Please sign in to comment.