-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #81 from aqjune-aws/tablelookup
Add bignum_copy_row_from_table and its Neon-variants for AArch64
- Loading branch information
Showing
16 changed files
with
2,964 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
// SPDX-License-Identifier: Apache-2.0 OR ISC | ||
|
||
// ---------------------------------------------------------------------------- | ||
// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] | ||
// into z[0..width-1]. | ||
// This function is constant-time with respect to the value of `idx`. This is | ||
// achieved by reading the whole table and using the bit-masking to get the | ||
// `idx`-th row. | ||
// | ||
// extern void bignum_copy_from_table | ||
// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t width, | ||
// uint64_t idx); | ||
// | ||
// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = width, X4 = idx | ||
// ---------------------------------------------------------------------------- | ||
#include "_internal_s2n_bignum.h" | ||
|
||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table) | ||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table) | ||
.text | ||
.balign 4 | ||
|
||
#define z x0 | ||
#define table x1 | ||
#define height x2 | ||
#define width x3 | ||
#define idx x4 | ||
|
||
#define i x5 | ||
#define mask x6 | ||
#define j x7 | ||
|
||
S2N_BN_SYMBOL(bignum_copy_row_from_table): | ||
|
||
cbz height, bignum_copy_row_from_table_end | ||
cbz width, bignum_copy_row_from_table_end | ||
mov i, width | ||
mov x6, z | ||
|
||
bignum_copy_row_from_table_initzero: | ||
str xzr, [x6] | ||
add x6, x6, #8 | ||
subs i, i, #1 | ||
bne bignum_copy_row_from_table_initzero | ||
|
||
mov i, xzr | ||
mov x8, table | ||
|
||
bignum_copy_row_from_table_outerloop: | ||
|
||
cmp i, idx | ||
csetm mask, eq | ||
|
||
mov j, width | ||
mov x9, z | ||
|
||
bignum_copy_row_from_table_innerloop: | ||
|
||
ldr x10, [x8] | ||
ldr x11, [x9] | ||
and x10, x10, mask | ||
orr x11, x11, x10 | ||
str x11, [x9] | ||
|
||
add x8, x8, #8 | ||
add x9, x9, #8 | ||
subs j, j, #1 | ||
bne bignum_copy_row_from_table_innerloop | ||
|
||
bignum_copy_row_from_table_innerloop_done: | ||
add i, i, #1 | ||
cmp i, height | ||
bne bignum_copy_row_from_table_outerloop | ||
|
||
bignum_copy_row_from_table_end: | ||
ret | ||
|
||
#if defined(__linux__) && defined(__ELF__) | ||
.section .note.GNU-stack,"",%progbits | ||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
// SPDX-License-Identifier: Apache-2.0 OR ISC | ||
|
||
// ---------------------------------------------------------------------------- | ||
// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] | ||
// into z[0..row-1]. | ||
// This function is constant-time with respect to the value of `idx`. This is | ||
// achieved by reading the whole table and using the bit-masking to get the | ||
// `idx`-th row. | ||
// | ||
// extern void bignum_copy_from_table_16_neon | ||
// (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx); | ||
// | ||
// Initial version written by Hanno Becker | ||
// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X4 = idx | ||
// ---------------------------------------------------------------------------- | ||
#include "_internal_s2n_bignum.h" | ||
|
||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_16_neon) | ||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_16_neon) | ||
.text | ||
.balign 4 | ||
|
||
|
||
// ***************************************************** | ||
// Main code | ||
// ***************************************************** | ||
|
||
#define z x0 | ||
#define tbl x1 | ||
#define height x2 | ||
#define idx x3 | ||
|
||
#define mask x5 | ||
#define cnt x6 | ||
|
||
#define ventry0 v20 | ||
#define qentry0 q20 | ||
#define ventry1 v21 | ||
#define qentry1 q21 | ||
#define ventry2 v22 | ||
#define qentry2 q22 | ||
#define ventry3 v23 | ||
#define qentry3 q23 | ||
#define ventry4 v24 | ||
#define qentry4 q24 | ||
#define ventry5 v25 | ||
#define qentry5 q25 | ||
#define ventry6 v26 | ||
#define qentry6 q26 | ||
#define ventry7 v27 | ||
#define qentry7 q27 | ||
#define ventry8 v28 | ||
|
||
#define vtmp v16 | ||
#define qtmp q16 | ||
|
||
#define vmask v17 | ||
|
||
S2N_BN_SYMBOL(bignum_copy_row_from_table_16_neon): | ||
|
||
// Clear accumulator | ||
// Zeroing can be done via xor, but xor isn't formalized yet. | ||
dup ventry0.2d, xzr | ||
mov ventry1.16b, ventry0.16b | ||
mov ventry2.16b, ventry0.16b | ||
mov ventry3.16b, ventry0.16b | ||
mov ventry4.16b, ventry0.16b | ||
mov ventry5.16b, ventry0.16b | ||
mov ventry6.16b, ventry0.16b | ||
mov ventry7.16b, ventry0.16b | ||
|
||
mov cnt, #0 | ||
bignum_copy_row_from_table_16_neon_loop: | ||
|
||
// Compute mask: Check if current index matches target index | ||
subs xzr, cnt, idx | ||
cinv mask, xzr, eq | ||
dup vmask.2d, mask | ||
|
||
ldr qtmp, [tbl, #16*0] | ||
bit ventry0.16b, vtmp.16b, vmask.16b | ||
|
||
ldr qtmp, [tbl, #16*1] | ||
bit ventry1.16b, vtmp.16b, vmask.16b | ||
|
||
ldr qtmp, [tbl, #16*2] | ||
bit ventry2.16b, vtmp.16b, vmask.16b | ||
|
||
ldr qtmp, [tbl, #16*3] | ||
bit ventry3.16b, vtmp.16b, vmask.16b | ||
|
||
ldr qtmp, [tbl, #16*4] | ||
bit ventry4.16b, vtmp.16b, vmask.16b | ||
|
||
ldr qtmp, [tbl, #16*5] | ||
bit ventry5.16b, vtmp.16b, vmask.16b | ||
|
||
ldr qtmp, [tbl, #16*6] | ||
bit ventry6.16b, vtmp.16b, vmask.16b | ||
|
||
ldr qtmp, [tbl, #16*7] | ||
bit ventry7.16b, vtmp.16b, vmask.16b | ||
|
||
add tbl, tbl, #16*8 | ||
|
||
add cnt, cnt, #1 | ||
subs xzr, height, cnt | ||
b.ne bignum_copy_row_from_table_16_neon_loop | ||
|
||
bignum_copy_row_from_table_16_neon_end: | ||
|
||
str qentry0, [z, #16*0] | ||
str qentry1, [z, #16*1] | ||
str qentry2, [z, #16*2] | ||
str qentry3, [z, #16*3] | ||
str qentry4, [z, #16*4] | ||
str qentry5, [z, #16*5] | ||
str qentry6, [z, #16*6] | ||
str qentry7, [z, #16*7] | ||
|
||
ret | ||
|
||
#if defined(__linux__) && defined(__ELF__) | ||
.section .note.GNU-stack,"",%progbits | ||
#endif |
Oops, something went wrong.