diff --git a/pqcrypto-internals/Cargo.toml b/pqcrypto-internals/Cargo.toml index c38d0bc8..3d16bfda 100644 --- a/pqcrypto-internals/Cargo.toml +++ b/pqcrypto-internals/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pqcrypto-internals" -version = "0.2.5" +version = "0.2.6" edition = "2021" description = "bindings to common cryptography" license = "MIT OR Apache-2.0" diff --git a/pqcrypto-internals/build.rs b/pqcrypto-internals/build.rs index 83ecf7fa..3157756c 100644 --- a/pqcrypto-internals/build.rs +++ b/pqcrypto-internals/build.rs @@ -33,6 +33,7 @@ fn main() { println!("cargo:rustc-link-lib=pqclean_common"); let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); + let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default(); if target_arch == "x86" || target_arch == "x86_64" { let mut builder = cc::Build::new(); @@ -42,7 +43,6 @@ fn main() { builder.flag(format!("--sysroot={wasi_sdk_path}").as_str()); } - let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); if target_env == "msvc" { builder.flag("/arch:AVX2"); } else { @@ -56,5 +56,19 @@ fn main() { ) .compile("keccak4x"); println!("cargo:rustc-link-lib=keccak4x") + } else if target_arch == "aarch64" && target_env != "msvc" { + let mut builder = cc::Build::new(); + + if target_os == "wasi" { + let wasi_sdk_path = + &std::env::var("WASI_SDK_DIR").expect("missing environment variable: WASI_SDK_DIR"); + builder.flag(format!("--sysroot={wasi_sdk_path}").as_str()); + } + + builder + .file(cfiledir.join("keccak2x").join("fips202x2.c")) + .file(cfiledir.join("keccak2x").join("feat.S")) + .compile("keccak2x"); + println!("cargo:rustc-link-lib=keccak2x") } } diff --git a/pqcrypto-internals/cfiles/keccak2x/feat.S b/pqcrypto-internals/cfiles/keccak2x/feat.S new file mode 100644 index 00000000..6c8e60be --- /dev/null +++ b/pqcrypto-internals/cfiles/keccak2x/feat.S @@ -0,0 +1,168 @@ + +/* +MIT License + +Copyright (c) 2020 Bas Westerbaan +Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) + +.macro round + ; Execute theta, but without xoring into the state yet. + ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. 
+ eor3.16b v25, v0, v5, v10 + eor3.16b v26, v1, v6, v11 + eor3.16b v27, v2, v7, v12 + eor3.16b v28, v3, v8, v13 + eor3.16b v29, v4, v9, v14 + + eor3.16b v25, v25, v15, v20 + eor3.16b v26, v26, v16, v21 + eor3.16b v27, v27, v17, v22 + eor3.16b v28, v28, v18, v23 + eor3.16b v29, v29, v19, v24 + + rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] + rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] + rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] + rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] + rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] + + ; Xor parities from step theta into the state at the same time + ; as executing rho and pi. + eor.16b v0, v0, v30 + mov.16b v31, v1 + xar.2d v1, v6, v27, 20 + xar.2d v6, v9, v25, 44 + xar.2d v9, v22, v28, 3 + xar.2d v22, v14, v25, 25 + xar.2d v14, v20, v30, 46 + xar.2d v20, v2, v28, 2 + xar.2d v2, v12, v28, 21 + xar.2d v12, v13, v29, 39 + xar.2d v13, v19, v25, 56 + xar.2d v19, v23, v29, 8 + xar.2d v23, v15, v30, 23 + xar.2d v15, v4, v25, 37 + xar.2d v4, v24, v25, 50 + xar.2d v24, v21, v27, 62 + xar.2d v21, v8, v29, 9 + xar.2d v8, v16, v27, 19 + xar.2d v16, v5, v30, 28 + xar.2d v5, v3, v29, 36 + xar.2d v3, v18, v29, 43 + xar.2d v18, v17, v28, 49 + xar.2d v17, v11, v27, 54 + xar.2d v11, v7, v28, 58 + xar.2d v7, v10, v30, 61 + xar.2d v10, v31, v27, 63 + + ; Chi + bcax.16b v25, v0, v2, v1 + bcax.16b v26, v1, v3, v2 + bcax.16b v2, v2, v4, v3 + bcax.16b v3, v3, v0, v4 + bcax.16b v4, v4, v1, v0 + mov.16b v0, v25 + mov.16b v1, v26 + + bcax.16b v25, v5, v7, v6 + bcax.16b v26, v6, v8, v7 + bcax.16b v7, v7, v9, v8 + bcax.16b v8, v8, v5, v9 + bcax.16b v9, v9, v6, v5 + mov.16b v5, v25 + mov.16b v6, v26 + + bcax.16b v25, v10, v12, v11 + bcax.16b v26, v11, v13, v12 + bcax.16b v12, v12, v14, v13 + bcax.16b v13, v13, v10, v14 + bcax.16b v14, v14, v11, v10 + mov.16b v10, v25 + mov.16b v11, v26 + + bcax.16b v25, v15, v17, v16 + bcax.16b v26, v16, v18, v17 + bcax.16b v17, v17, v19, v18 + bcax.16b v18, v18, v15, v19 + bcax.16b v19, v19, v16, v15 + mov.16b v15, v25 + mov.16b v16, v26 + + bcax.16b v25, v20, v22, v21 + bcax.16b v26, v21, v23, v22 + bcax.16b v22, v22, v24, v23 + bcax.16b v23, v23, v20, v24 + bcax.16b v24, v24, v21, v20 + mov.16b v20, v25 + mov.16b v21, v26 + + ; iota + ld1r {v25.2d}, [x1], #8 + eor.16b v0, v0, v25 +.endm + +.align 4 +.global f1600x2 +.global _f1600x2 +f1600x2: +_f1600x2: + stp d8, d9, [sp,#-16]! + stp d10, d11, [sp,#-16]! + stp d12, d13, [sp,#-16]! + stp d14, d15, [sp,#-16]! 
+ + mov x2, x0 + mov x3, #24 + + ld1.2d {v0, v1, v2, v3}, [x0], #64 + ld1.2d {v4, v5, v6, v7}, [x0], #64 + ld1.2d {v8, v9, v10, v11}, [x0], #64 + ld1.2d {v12, v13, v14, v15}, [x0], #64 + ld1.2d {v16, v17, v18, v19}, [x0], #64 + ld1.2d {v20, v21, v22, v23}, [x0], #64 + ld1.2d {v24}, [x0] + +loop: + round + + subs x3, x3, #1 + cbnz x3, loop + + mov x0, x2 + st1.2d {v0, v1, v2, v3}, [x0], #64 + st1.2d {v4, v5, v6, v7}, [x0], #64 + st1.2d {v8, v9, v10, v11}, [x0], #64 + st1.2d {v12, v13, v14, v15}, [x0], #64 + st1.2d {v16, v17, v18, v19}, [x0], #64 + st1.2d {v20, v21, v22, v23}, [x0], #64 + st1.2d {v24}, [x0] + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ret lr + +#endif diff --git a/pqcrypto-internals/cfiles/keccak2x/fips202x2.c b/pqcrypto-internals/cfiles/keccak2x/fips202x2.c new file mode 100644 index 00000000..d0e8efdc --- /dev/null +++ b/pqcrypto-internals/cfiles/keccak2x/fips202x2.c @@ -0,0 +1,684 @@ + +/* + * This file was originally licensed + * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) + * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or + * public domain at https://github.com/cothan/kyber/blob/master/neon + * + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. + * + * MIT License + * + * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "fips202x2.h"
+#include <arm_neon.h>
+
+#define NROUNDS 24
+
+// Define NEON operation
+// c = load(ptr)
+#define vload(ptr) vld1q_u64(ptr);
+// ptr <= c;
+#define vstore(ptr, c) vst1q_u64(ptr, c);
+// c = a ^ b
+#define vxor(c, a, b) c = veorq_u64(a, b);
+// Rotate left by offset bits ((a << offset) ^ (a >> (64-offset)))
+#define vROL(out, a, offset)    \
+    out = vshlq_n_u64(a, offset); \
+    out = vsriq_n_u64(out, a, 64 - offset);
+// Xor chain: out = a ^ b ^ c ^ d ^ e
+#define vXOR4(out, a, b, c, d, e) \
+    out = veorq_u64(a, b);        \
+    out = veorq_u64(out, c);      \
+    out = veorq_u64(out, d);      \
+    out = veorq_u64(out, e);
+// Not And c = ~a & b
+// #define vbic(c, a, b) c = vbicq_u64(b, a);
+// Xor Not And: out = a ^ ( (~b) & c)
+#define vXNA(out, a, b, c) \
+    out = vbicq_u64(c, b); \
+    out = veorq_u64(out, a);
+// Rotate by 1 bit, then XOR: a ^ ROL(b); needs the SHA-3 RAX1 instruction, not used in the portable path
+#define vrxor(c, a, b) c = vrax1q_u64(a, b);
+// End Define
+
+/* Keccak round constants */
+static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = {
+    (uint64_t)0x0000000000000001ULL,
+    (uint64_t)0x0000000000008082ULL,
+    (uint64_t)0x800000000000808aULL,
+    (uint64_t)0x8000000080008000ULL,
+    (uint64_t)0x000000000000808bULL,
+    (uint64_t)0x0000000080000001ULL,
+    (uint64_t)0x8000000080008081ULL,
+    (uint64_t)0x8000000000008009ULL,
+    (uint64_t)0x000000000000008aULL,
+    (uint64_t)0x0000000000000088ULL,
+    (uint64_t)0x0000000080008009ULL,
+    (uint64_t)0x000000008000000aULL,
+    (uint64_t)0x000000008000808bULL,
+    (uint64_t)0x800000000000008bULL,
+    (uint64_t)0x8000000000008089ULL,
+    (uint64_t)0x8000000000008003ULL,
+    (uint64_t)0x8000000000008002ULL,
+    (uint64_t)0x8000000000000080ULL,
+    (uint64_t)0x000000000000800aULL,
+    (uint64_t)0x800000008000000aULL,
+    (uint64_t)0x8000000080008081ULL,
+    (uint64_t)0x8000000000008080ULL,
+    (uint64_t)0x0000000080000001ULL,
+    (uint64_t)0x8000000080008008ULL
+};
+
+/*************************************************
+* Name:        KeccakF1600_StatePermutex2
+*
+* Description: The Keccak F1600 Permutation
+*
+* Arguments:   - v128 *state: pointer to input/output Keccak state
+**************************************************/
+extern void f1600x2(v128 *, const uint64_t *);
+static inline
+void KeccakF1600_StatePermutex2(v128 state[25]) {
+    #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* use the SHA-3 extension assembly (f1600x2) when available */
+    f1600x2(state, neon_KeccakF_RoundConstants);
+    #else
+    v128 Aba, Abe, Abi, Abo, Abu;
+    v128 Aga, Age, Agi, Ago, Agu;
+    v128 Aka, Ake, Aki, Ako, Aku;
+    v128 Ama, Ame, Ami, Amo, Amu;
+    v128 Asa, Ase, Asi, Aso, Asu;
+    v128 BCa, BCe, BCi, BCo, BCu; // tmp
+    v128 Da, De, Di, Do, Du; // D
+    v128 Eba, Ebe, Ebi, Ebo, Ebu;
+    v128 Ega, Ege, Egi, Ego, Egu;
+    v128 Eka, Eke, Eki, Eko, Eku;
+    v128 Ema, Eme, Emi, Emo, Emu;
+    v128 Esa, Ese, Esi, Eso, Esu;
+
+    //copyFromState(A, state)
+    Aba = state[0];
+    Abe = state[1];
+    Abi = state[2];
+    Abo = state[3];
+    Abu = state[4];
+    Aga = state[5];
+    Age = state[6];
+    Agi = state[7];
+    Ago = state[8];
+    Agu = state[9];
+    Aka = state[10];
+    Ake = state[11];
+    Aki = state[12];
+    Ako = state[13];
+    Aku = state[14];
+    Ama = state[15];
+    Ame = state[16];
+    Ami = state[17];
+    Amo = state[18];
+    Amu = state[19];
+    Asa = state[20];
+    Ase = state[21];
+    Asi = state[22];
+    Aso = state[23];
+    Asu = state[24];
+
+    for (int round = 0; round < NROUNDS; round += 2) {
+        // prepareTheta
+        vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
+        vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
+        vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
+        vXOR4(BCo, Abo, Ago, 
Ako, Amo, Aso); + vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Aba, Aba, Da); + vxor(Age, Age, De); + vROL(BCe, Age, 44); + vxor(Aki, Aki, Di); + vROL(BCi, Aki, 43); + vxor(Amo, Amo, Do); + vROL(BCo, Amo, 21); + vxor(Asu, Asu, Du); + vROL(BCu, Asu, 14); + vXNA(Eba, Aba, BCe, BCi); + vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); + vXNA(Ebe, BCe, BCi, BCo); + vXNA(Ebi, BCi, BCo, BCu); + vXNA(Ebo, BCo, BCu, Aba); + vXNA(Ebu, BCu, Aba, BCe); + + vxor(Abo, Abo, Do); + vROL(BCa, Abo, 28); + vxor(Agu, Agu, Du); + vROL(BCe, Agu, 20); + vxor(Aka, Aka, Da); + vROL(BCi, Aka, 3); + vxor(Ame, Ame, De); + vROL(BCo, Ame, 45); + vxor(Asi, Asi, Di); + vROL(BCu, Asi, 61); + vXNA(Ega, BCa, BCe, BCi); + vXNA(Ege, BCe, BCi, BCo); + vXNA(Egi, BCi, BCo, BCu); + vXNA(Ego, BCo, BCu, BCa); + vXNA(Egu, BCu, BCa, BCe); + + vxor(Abe, Abe, De); + vROL(BCa, Abe, 1); + vxor(Agi, Agi, Di); + vROL(BCe, Agi, 6); + vxor(Ako, Ako, Do); + vROL(BCi, Ako, 25); + vxor(Amu, Amu, Du); + vROL(BCo, Amu, 8); + vxor(Asa, Asa, Da); + vROL(BCu, Asa, 18); + vXNA(Eka, BCa, BCe, BCi); + vXNA(Eke, BCe, BCi, BCo); + vXNA(Eki, BCi, BCo, BCu); + vXNA(Eko, BCo, BCu, BCa); + vXNA(Eku, BCu, BCa, BCe); + + vxor(Abu, Abu, Du); + vROL(BCa, Abu, 27); + vxor(Aga, Aga, Da); + vROL(BCe, Aga, 36); + vxor(Ake, Ake, De); + vROL(BCi, Ake, 10); + vxor(Ami, Ami, Di); + vROL(BCo, Ami, 15); + vxor(Aso, Aso, Do); + vROL(BCu, Aso, 56); + vXNA(Ema, BCa, BCe, BCi); + vXNA(Eme, BCe, BCi, BCo); + vXNA(Emi, BCi, BCo, BCu); + vXNA(Emo, BCo, BCu, BCa); + vXNA(Emu, BCu, BCa, BCe); + + vxor(Abi, Abi, Di); + vROL(BCa, Abi, 62); + vxor(Ago, Ago, Do); + vROL(BCe, Ago, 55); + vxor(Aku, Aku, Du); + vROL(BCi, Aku, 39); + vxor(Ama, Ama, Da); + vROL(BCo, Ama, 41); + vxor(Ase, Ase, De); + vROL(BCu, Ase, 2); + vXNA(Esa, BCa, BCe, BCi); + vXNA(Ese, BCe, BCi, BCo); + vXNA(Esi, BCi, BCo, BCu); + vXNA(Eso, BCo, BCu, BCa); + vXNA(Esu, BCu, BCa, BCe); + + // Next Round + + // prepareTheta + vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); + vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); + vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); + vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); + vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Eba, Eba, Da); + vxor(Ege, Ege, De); + vROL(BCe, Ege, 44); + vxor(Eki, Eki, Di); + vROL(BCi, Eki, 43); + vxor(Emo, Emo, Do); + vROL(BCo, Emo, 21); + vxor(Esu, Esu, Du); + vROL(BCu, Esu, 14); + vXNA(Aba, Eba, BCe, BCi); + vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); + vXNA(Abe, BCe, BCi, BCo); + vXNA(Abi, BCi, BCo, BCu); + vXNA(Abo, BCo, BCu, Eba); + vXNA(Abu, BCu, Eba, BCe); + + vxor(Ebo, Ebo, Do); + vROL(BCa, Ebo, 28); + vxor(Egu, Egu, Du); + vROL(BCe, Egu, 20); + vxor(Eka, Eka, Da); + vROL(BCi, Eka, 3); + vxor(Eme, Eme, De); + vROL(BCo, Eme, 45); + vxor(Esi, Esi, Di); + vROL(BCu, Esi, 61); + vXNA(Aga, BCa, BCe, BCi); + vXNA(Age, BCe, BCi, BCo); + vXNA(Agi, BCi, BCo, BCu); + vXNA(Ago, BCo, BCu, BCa); + vXNA(Agu, BCu, BCa, BCe); + + vxor(Ebe, Ebe, De); + vROL(BCa, Ebe, 1); + vxor(Egi, Egi, Di); + vROL(BCe, Egi, 6); + vxor(Eko, Eko, Do); + vROL(BCi, Eko, 25); + 
vxor(Emu, Emu, Du); + vROL(BCo, Emu, 8); + vxor(Esa, Esa, Da); + vROL(BCu, Esa, 18); + vXNA(Aka, BCa, BCe, BCi); + vXNA(Ake, BCe, BCi, BCo); + vXNA(Aki, BCi, BCo, BCu); + vXNA(Ako, BCo, BCu, BCa); + vXNA(Aku, BCu, BCa, BCe); + + vxor(Ebu, Ebu, Du); + vROL(BCa, Ebu, 27); + vxor(Ega, Ega, Da); + vROL(BCe, Ega, 36); + vxor(Eke, Eke, De); + vROL(BCi, Eke, 10); + vxor(Emi, Emi, Di); + vROL(BCo, Emi, 15); + vxor(Eso, Eso, Do); + vROL(BCu, Eso, 56); + vXNA(Ama, BCa, BCe, BCi); + vXNA(Ame, BCe, BCi, BCo); + vXNA(Ami, BCi, BCo, BCu); + vXNA(Amo, BCo, BCu, BCa); + vXNA(Amu, BCu, BCa, BCe); + + vxor(Ebi, Ebi, Di); + vROL(BCa, Ebi, 62); + vxor(Ego, Ego, Do); + vROL(BCe, Ego, 55); + vxor(Eku, Eku, Du); + vROL(BCi, Eku, 39); + vxor(Ema, Ema, Da); + vROL(BCo, Ema, 41); + vxor(Ese, Ese, De); + vROL(BCu, Ese, 2); + vXNA(Asa, BCa, BCe, BCi); + vXNA(Ase, BCe, BCi, BCo); + vXNA(Asi, BCi, BCo, BCu); + vXNA(Aso, BCo, BCu, BCa); + vXNA(Asu, BCu, BCa, BCe); + } + + state[0] = Aba; + state[1] = Abe; + state[2] = Abi; + state[3] = Abo; + state[4] = Abu; + state[5] = Aga; + state[6] = Age; + state[7] = Agi; + state[8] = Ago; + state[9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; + #endif +} + +/************************************************* +* Name: keccakx2_absorb +* +* Description: Absorb step of Keccak; +* non-incremental, starts by zeroeing the state. +* +* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - const uint8_t *m: pointer to input to be absorbed into s +* - size_t mlen: length of input in bytes +* - uint8_t p: domain-separation byte for different +* Keccak-derived functions +**************************************************/ +static +void keccakx2_absorb(v128 s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen, + uint8_t p) { + size_t i, pos = 0; + + // Declare SIMD registers + v128 tmp, mask; + uint64x1_t a, b; + uint64x2_t a1, b1, atmp1, btmp1; + uint64x2x2_t a2, b2, atmp2, btmp2; + // End + + for (i = 0; i < 25; ++i) { + s[i] = vdupq_n_u64(0); + } + + // Load in0[i] to register, then in1[i] to register, exchange them + while (inlen >= r) { + for (i = 0; i < r / 8 - 1; i += 4) { + a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); + b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); + atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); + // AC = zip2(AB and CD) + btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); + btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); + + vxor(s[i + 0], s[i + 0], atmp2.val[0]); + vxor(s[i + 1], s[i + 1], btmp2.val[0]); + vxor(s[i + 2], s[i + 2], atmp2.val[1]); + vxor(s[i + 3], s[i + 3], btmp2.val[1]); + + pos += 8 * 2 * 2; + } + // Last iteration + i = r / 8 - 1; + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + pos += 8; + + KeccakF1600_StatePermutex2(s); + inlen -= r; + } + + i = 0; + while (inlen >= 16) { + a1 = vld1q_u64((uint64_t *)&in0[pos]); + b1 = vld1q_u64((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp1 = vzip1q_u64(a1, b1); + // AC = zip2(AB and CD) + btmp1 = vzip2q_u64(a1, b1); + + vxor(s[i + 0], s[i + 0], atmp1); + vxor(s[i + 1], s[i + 1], btmp1); + + 
i += 2; + pos += 8 * 2; + inlen -= 8 * 2; + } + + if (inlen >= 8) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + + i++; + pos += 8; + inlen -= 8; + } + + if (inlen) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); + tmp = vandq_u64(tmp, mask); + vxor(s[i], s[i], tmp); + } + + tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); + vxor(s[i], s[i], tmp); + + mask = vdupq_n_u64(1ULL << 63); + vxor(s[r / 8 - 1], s[r / 8 - 1], mask); +} + +/************************************************* +* Name: keccak_squeezeblocks +* +* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. +* Modifies the state. Can be called multiple times to keep +* squeezing, i.e., is incremental. +* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed (written to h) +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - uint64_t *s: pointer to input/output Keccak state +**************************************************/ +static +void keccakx2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + unsigned int r, + v128 s[25]) { + unsigned int i; + + uint64x1_t a, b; + uint64x2x2_t a2, b2; + + while (nblocks > 0) { + KeccakF1600_StatePermutex2(s); + + for (i = 0; i < r / 8 - 1; i += 4) { + a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); + b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); + a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); + b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); + vst1q_u64_x2((uint64_t *)out0, a2); + vst1q_u64_x2((uint64_t *)out1, b2); + + out0 += 32; + out1 += 32; + } + + i = r / 8 - 1; + // Last iteration + a = vget_low_u64(s[i]); + b = vget_high_u64(s[i]); + vst1_u64((uint64_t *)out0, a); + vst1_u64((uint64_t *)out1, b); + + out0 += 8; + out1 += 8; + + --nblocks; + } +} + +/************************************************* +* Name: shake128x2_absorb +* +* Description: Absorb step of the SHAKE128 XOF. +* non-incremental, starts by zeroeing the state. +* +* Arguments: - keccakx2_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake128x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake128_squeezeblocks +* +* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of +* SHAKE128_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. +* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed +* (written to output) +* - keccakx2_state *s: pointer to input/output Keccak state +**************************************************/ +void shake128x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state) { + keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); +} + +/************************************************* +* Name: shake256_absorb +* +* Description: Absorb step of the SHAKE256 XOF. +* non-incremental, starts by zeroeing the state. 
+* +* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake256x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake256_squeezeblocks +* +* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of +* SHAKE256_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. +* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed +* (written to output) +* - keccakx2_state *s: pointer to input/output Keccak state +**************************************************/ +void shake256x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state) { + keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); +} + +/************************************************* +* Name: shake128 +* +* Description: SHAKE128 XOF with non-incremental API +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: requested output length in bytes +* - const uint8_t *in: pointer to input +* - size_t inlen: length of input in bytes +**************************************************/ +void shake128x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[2][SHAKE128_RATE]; + keccakx2_state state; + + shake128x2_absorb(&state, in0, in1, inlen); + shake128x2_squeezeblocks(out0, out1, nblocks, &state); + + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + + if (outlen) { + shake128x2_squeezeblocks(t[0], t[1], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + } + } +} + +/************************************************* +* Name: shake256 +* +* Description: SHAKE256 XOF with non-incremental API +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: requested output length in bytes +* - const uint8_t *in: pointer to input +* - size_t inlen: length of input in bytes +**************************************************/ +void shake256x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE256_RATE; + uint8_t t[2][SHAKE256_RATE]; + keccakx2_state state; + + shake256x2_absorb(&state, in0, in1, inlen); + shake256x2_squeezeblocks(out0, out1, nblocks, &state); + + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + shake256x2_squeezeblocks(t[0], t[1], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + } + } +} diff --git a/pqcrypto-internals/cfiles/keccak2x/fips202x2.h b/pqcrypto-internals/cfiles/keccak2x/fips202x2.h new file mode 100644 index 00000000..1274a6d2 --- /dev/null +++ b/pqcrypto-internals/cfiles/keccak2x/fips202x2.h @@ -0,0 +1,60 @@ +#ifndef FIPS202X2_H +#define FIPS202X2_H + +/* + * This file is licensed + * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) + * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber 
or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
+#include <arm_neon.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64x2_t v128;
+
+#define SHAKE128_RATE 168
+#define SHAKE256_RATE 136
+#define SHA3_256_RATE 136
+#define SHA3_512_RATE 72
+
+typedef struct {
+    v128 s[25];
+} keccakx2_state;
+
+void shake128x2_absorb(keccakx2_state *state,
+                       const uint8_t *in0,
+                       const uint8_t *in1,
+                       size_t inlen);
+
+void shake128x2_squeezeblocks(uint8_t *out0,
+                              uint8_t *out1,
+                              size_t nblocks,
+                              keccakx2_state *state);
+
+void shake256x2_absorb(keccakx2_state *state,
+                       const uint8_t *in0,
+                       const uint8_t *in1,
+                       size_t inlen);
+
+void shake256x2_squeezeblocks(uint8_t *out0,
+                              uint8_t *out1,
+                              size_t nblocks,
+                              keccakx2_state *state);
+
+void shake128x2(uint8_t *out0,
+                uint8_t *out1,
+                size_t outlen,
+                const uint8_t *in0,
+                const uint8_t *in1,
+                size_t inlen);
+
+void shake256x2(uint8_t *out0,
+                uint8_t *out1,
+                size_t outlen,
+                const uint8_t *in0,
+                const uint8_t *in1,
+                size_t inlen);
+
+#endif
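
Usage sketch (illustrative only, not part of the diff above): a minimal example, assuming an aarch64 toolchain and linking against the keccak2x objects built by this patch, of how the two-lane SHAKE256 API declared in fips202x2.h can be called. The file name, inputs, and output length are invented for the example; each output lane is expected to match a single-lane SHAKE256 of the corresponding input.

    /* shake256x2_demo.c -- hypothetical standalone example */
    #include <stdio.h>
    #include <stdint.h>
    #include "fips202x2.h"

    int main(void) {
        const uint8_t in0[4] = {0, 1, 2, 3};
        const uint8_t in1[4] = {4, 5, 6, 7};
        uint8_t out0[64], out1[64];

        /* Absorb both inputs (same length) and squeeze 64 bytes per lane;
           the two SHAKE256 instances run interleaved in one vectorized
           Keccak-f[1600] state. */
        shake256x2(out0, out1, sizeof out0, in0, in1, sizeof in0);

        for (size_t i = 0; i < sizeof out0; i++) {
            printf("%02x", out0[i]);
        }
        printf("\n");
        return 0;
    }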