diff --git a/pqcrypto-internals/Cargo.toml b/pqcrypto-internals/Cargo.toml index c38d0bc8..3d16bfda 100644 --- a/pqcrypto-internals/Cargo.toml +++ b/pqcrypto-internals/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pqcrypto-internals" -version = "0.2.5" +version = "0.2.6" edition = "2021" description = "bindings to common cryptography" license = "MIT OR Apache-2.0" diff --git a/pqcrypto-internals/build.rs b/pqcrypto-internals/build.rs index 83ecf7fa..3157756c 100644 --- a/pqcrypto-internals/build.rs +++ b/pqcrypto-internals/build.rs @@ -33,6 +33,7 @@ fn main() { println!("cargo:rustc-link-lib=pqclean_common"); let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); + let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default(); if target_arch == "x86" || target_arch == "x86_64" { let mut builder = cc::Build::new(); @@ -42,7 +43,6 @@ fn main() { builder.flag(format!("--sysroot={wasi_sdk_path}").as_str()); } - let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); if target_env == "msvc" { builder.flag("/arch:AVX2"); } else { @@ -56,5 +56,19 @@ fn main() { ) .compile("keccak4x"); println!("cargo:rustc-link-lib=keccak4x") + } else if target_arch == "aarch64" && target_env != "msvc" { + let mut builder = cc::Build::new(); + + if target_os == "wasi" { + let wasi_sdk_path = + &std::env::var("WASI_SDK_DIR").expect("missing environment variable: WASI_SDK_DIR"); + builder.flag(format!("--sysroot={wasi_sdk_path}").as_str()); + } + + builder + .file(cfiledir.join("keccak2x").join("fips202x2.c")) + .file(cfiledir.join("keccak2x").join("feat.S")) + .compile("keccak2x"); + println!("cargo:rustc-link-lib=keccak2x") } } diff --git a/pqcrypto-internals/cfiles/keccak2x/feat.S b/pqcrypto-internals/cfiles/keccak2x/feat.S new file mode 100644 index 00000000..6c8e60be --- /dev/null +++ b/pqcrypto-internals/cfiles/keccak2x/feat.S @@ -0,0 +1,168 @@ + +/* +MIT License + +Copyright (c) 2020 Bas Westerbaan +Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) + +.macro round + ; Execute theta, but without xoring into the state yet. + ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. 
+ eor3.16b v25, v0, v5, v10 + eor3.16b v26, v1, v6, v11 + eor3.16b v27, v2, v7, v12 + eor3.16b v28, v3, v8, v13 + eor3.16b v29, v4, v9, v14 + + eor3.16b v25, v25, v15, v20 + eor3.16b v26, v26, v16, v21 + eor3.16b v27, v27, v17, v22 + eor3.16b v28, v28, v18, v23 + eor3.16b v29, v29, v19, v24 + + rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] + rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] + rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] + rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] + rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] + + ; Xor parities from step theta into the state at the same time + ; as executing rho and pi. + eor.16b v0, v0, v30 + mov.16b v31, v1 + xar.2d v1, v6, v27, 20 + xar.2d v6, v9, v25, 44 + xar.2d v9, v22, v28, 3 + xar.2d v22, v14, v25, 25 + xar.2d v14, v20, v30, 46 + xar.2d v20, v2, v28, 2 + xar.2d v2, v12, v28, 21 + xar.2d v12, v13, v29, 39 + xar.2d v13, v19, v25, 56 + xar.2d v19, v23, v29, 8 + xar.2d v23, v15, v30, 23 + xar.2d v15, v4, v25, 37 + xar.2d v4, v24, v25, 50 + xar.2d v24, v21, v27, 62 + xar.2d v21, v8, v29, 9 + xar.2d v8, v16, v27, 19 + xar.2d v16, v5, v30, 28 + xar.2d v5, v3, v29, 36 + xar.2d v3, v18, v29, 43 + xar.2d v18, v17, v28, 49 + xar.2d v17, v11, v27, 54 + xar.2d v11, v7, v28, 58 + xar.2d v7, v10, v30, 61 + xar.2d v10, v31, v27, 63 + + ; Chi + bcax.16b v25, v0, v2, v1 + bcax.16b v26, v1, v3, v2 + bcax.16b v2, v2, v4, v3 + bcax.16b v3, v3, v0, v4 + bcax.16b v4, v4, v1, v0 + mov.16b v0, v25 + mov.16b v1, v26 + + bcax.16b v25, v5, v7, v6 + bcax.16b v26, v6, v8, v7 + bcax.16b v7, v7, v9, v8 + bcax.16b v8, v8, v5, v9 + bcax.16b v9, v9, v6, v5 + mov.16b v5, v25 + mov.16b v6, v26 + + bcax.16b v25, v10, v12, v11 + bcax.16b v26, v11, v13, v12 + bcax.16b v12, v12, v14, v13 + bcax.16b v13, v13, v10, v14 + bcax.16b v14, v14, v11, v10 + mov.16b v10, v25 + mov.16b v11, v26 + + bcax.16b v25, v15, v17, v16 + bcax.16b v26, v16, v18, v17 + bcax.16b v17, v17, v19, v18 + bcax.16b v18, v18, v15, v19 + bcax.16b v19, v19, v16, v15 + mov.16b v15, v25 + mov.16b v16, v26 + + bcax.16b v25, v20, v22, v21 + bcax.16b v26, v21, v23, v22 + bcax.16b v22, v22, v24, v23 + bcax.16b v23, v23, v20, v24 + bcax.16b v24, v24, v21, v20 + mov.16b v20, v25 + mov.16b v21, v26 + + ; iota + ld1r {v25.2d}, [x1], #8 + eor.16b v0, v0, v25 +.endm + +.align 4 +.global f1600x2 +.global _f1600x2 +f1600x2: +_f1600x2: + stp d8, d9, [sp,#-16]! + stp d10, d11, [sp,#-16]! + stp d12, d13, [sp,#-16]! + stp d14, d15, [sp,#-16]! 
+ + mov x2, x0 + mov x3, #24 + + ld1.2d {v0, v1, v2, v3}, [x0], #64 + ld1.2d {v4, v5, v6, v7}, [x0], #64 + ld1.2d {v8, v9, v10, v11}, [x0], #64 + ld1.2d {v12, v13, v14, v15}, [x0], #64 + ld1.2d {v16, v17, v18, v19}, [x0], #64 + ld1.2d {v20, v21, v22, v23}, [x0], #64 + ld1.2d {v24}, [x0] + +loop: + round + + subs x3, x3, #1 + cbnz x3, loop + + mov x0, x2 + st1.2d {v0, v1, v2, v3}, [x0], #64 + st1.2d {v4, v5, v6, v7}, [x0], #64 + st1.2d {v8, v9, v10, v11}, [x0], #64 + st1.2d {v12, v13, v14, v15}, [x0], #64 + st1.2d {v16, v17, v18, v19}, [x0], #64 + st1.2d {v20, v21, v22, v23}, [x0], #64 + st1.2d {v24}, [x0] + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ret lr + +#endif diff --git a/pqcrypto-internals/cfiles/keccak2x/fips202x2.c b/pqcrypto-internals/cfiles/keccak2x/fips202x2.c new file mode 100644 index 00000000..d0e8efdc --- /dev/null +++ b/pqcrypto-internals/cfiles/keccak2x/fips202x2.c @@ -0,0 +1,684 @@ + +/* + * This file was originally licensed + * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) + * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or + * public domain at https://github.com/cothan/kyber/blob/master/neon + * + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. + * + * MIT License + * + * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "fips202x2.h"
+#include <arm_neon.h>
+
+#define NROUNDS 24
+
+// Define NEON operation
+// c = load(ptr)
+#define vload(ptr) vld1q_u64(ptr);
+// ptr <= c;
+#define vstore(ptr, c) vst1q_u64(ptr, c);
+// c = a ^ b
+#define vxor(c, a, b) c = veorq_u64(a, b);
+// Rotate left by offset bits ((a << offset) ^ (a >> (64-offset)))
+#define vROL(out, a, offset)    \
+    out = vshlq_n_u64(a, offset); \
+    out = vsriq_n_u64(out, a, 64 - offset);
+// Xor chain: out = a ^ b ^ c ^ d ^ e
+#define vXOR4(out, a, b, c, d, e) \
+    out = veorq_u64(a, b);        \
+    out = veorq_u64(out, c);      \
+    out = veorq_u64(out, d);      \
+    out = veorq_u64(out, e);
+// Not And c = ~a & b
+// #define vbic(c, a, b) c = vbicq_u64(b, a);
+// Xor Not And: out = a ^ ( (~b) & c)
+#define vXNA(out, a, b, c) \
+    out = vbicq_u64(c, b); \
+    out = veorq_u64(out, a);
+// Rotate by 1 bit, then XOR: a ^ ROL(b); needs the SHA-3 RAX1 instruction, not used in the portable path
+#define vrxor(c, a, b) c = vrax1q_u64(a, b);
+// End Define
+
+/* Keccak round constants */
+static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = {
+    (uint64_t)0x0000000000000001ULL,
+    (uint64_t)0x0000000000008082ULL,
+    (uint64_t)0x800000000000808aULL,
+    (uint64_t)0x8000000080008000ULL,
+    (uint64_t)0x000000000000808bULL,
+    (uint64_t)0x0000000080000001ULL,
+    (uint64_t)0x8000000080008081ULL,
+    (uint64_t)0x8000000000008009ULL,
+    (uint64_t)0x000000000000008aULL,
+    (uint64_t)0x0000000000000088ULL,
+    (uint64_t)0x0000000080008009ULL,
+    (uint64_t)0x000000008000000aULL,
+    (uint64_t)0x000000008000808bULL,
+    (uint64_t)0x800000000000008bULL,
+    (uint64_t)0x8000000000008089ULL,
+    (uint64_t)0x8000000000008003ULL,
+    (uint64_t)0x8000000000008002ULL,
+    (uint64_t)0x8000000000000080ULL,
+    (uint64_t)0x000000000000800aULL,
+    (uint64_t)0x800000008000000aULL,
+    (uint64_t)0x8000000080008081ULL,
+    (uint64_t)0x8000000000008080ULL,
+    (uint64_t)0x0000000080000001ULL,
+    (uint64_t)0x8000000080008008ULL
+};
+
+/*************************************************
+* Name:        KeccakF1600_StatePermutex2
+*
+* Description: The Keccak F1600 Permutation
+*
+* Arguments:   - v128 *state: pointer to input/output Keccak state
+**************************************************/
+extern void f1600x2(v128 *, const uint64_t *);
+static inline
+void KeccakF1600_StatePermutex2(v128 state[25]) {
+    #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* use the SHA-3 extension assembly (f1600x2) when available */
+    f1600x2(state, neon_KeccakF_RoundConstants);
+    #else
+    v128 Aba, Abe, Abi, Abo, Abu;
+    v128 Aga, Age, Agi, Ago, Agu;
+    v128 Aka, Ake, Aki, Ako, Aku;
+    v128 Ama, Ame, Ami, Amo, Amu;
+    v128 Asa, Ase, Asi, Aso, Asu;
+    v128 BCa, BCe, BCi, BCo, BCu; // tmp
+    v128 Da, De, Di, Do, Du; // D
+    v128 Eba, Ebe, Ebi, Ebo, Ebu;
+    v128 Ega, Ege, Egi, Ego, Egu;
+    v128 Eka, Eke, Eki, Eko, Eku;
+    v128 Ema, Eme, Emi, Emo, Emu;
+    v128 Esa, Ese, Esi, Eso, Esu;
+
+    //copyFromState(A, state)
+    Aba = state[0];
+    Abe = state[1];
+    Abi = state[2];
+    Abo = state[3];
+    Abu = state[4];
+    Aga = state[5];
+    Age = state[6];
+    Agi = state[7];
+    Ago = state[8];
+    Agu = state[9];
+    Aka = state[10];
+    Ake = state[11];
+    Aki = state[12];
+    Ako = state[13];
+    Aku = state[14];
+    Ama = state[15];
+    Ame = state[16];
+    Ami = state[17];
+    Amo = state[18];
+    Amu = state[19];
+    Asa = state[20];
+    Ase = state[21];
+    Asi = state[22];
+    Aso = state[23];
+    Asu = state[24];
+
+    for (int round = 0; round < NROUNDS; round += 2) {
+        // prepareTheta
+        vXOR4(BCa, Aba, Aga, Aka, Ama, Asa);
+        vXOR4(BCe, Abe, Age, Ake, Ame, Ase);
+        vXOR4(BCi, Abi, Agi, Aki, Ami, Asi);
+        vXOR4(BCo, Abo, Ago, 
Ako, Amo, Aso); + vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Aba, Aba, Da); + vxor(Age, Age, De); + vROL(BCe, Age, 44); + vxor(Aki, Aki, Di); + vROL(BCi, Aki, 43); + vxor(Amo, Amo, Do); + vROL(BCo, Amo, 21); + vxor(Asu, Asu, Du); + vROL(BCu, Asu, 14); + vXNA(Eba, Aba, BCe, BCi); + vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); + vXNA(Ebe, BCe, BCi, BCo); + vXNA(Ebi, BCi, BCo, BCu); + vXNA(Ebo, BCo, BCu, Aba); + vXNA(Ebu, BCu, Aba, BCe); + + vxor(Abo, Abo, Do); + vROL(BCa, Abo, 28); + vxor(Agu, Agu, Du); + vROL(BCe, Agu, 20); + vxor(Aka, Aka, Da); + vROL(BCi, Aka, 3); + vxor(Ame, Ame, De); + vROL(BCo, Ame, 45); + vxor(Asi, Asi, Di); + vROL(BCu, Asi, 61); + vXNA(Ega, BCa, BCe, BCi); + vXNA(Ege, BCe, BCi, BCo); + vXNA(Egi, BCi, BCo, BCu); + vXNA(Ego, BCo, BCu, BCa); + vXNA(Egu, BCu, BCa, BCe); + + vxor(Abe, Abe, De); + vROL(BCa, Abe, 1); + vxor(Agi, Agi, Di); + vROL(BCe, Agi, 6); + vxor(Ako, Ako, Do); + vROL(BCi, Ako, 25); + vxor(Amu, Amu, Du); + vROL(BCo, Amu, 8); + vxor(Asa, Asa, Da); + vROL(BCu, Asa, 18); + vXNA(Eka, BCa, BCe, BCi); + vXNA(Eke, BCe, BCi, BCo); + vXNA(Eki, BCi, BCo, BCu); + vXNA(Eko, BCo, BCu, BCa); + vXNA(Eku, BCu, BCa, BCe); + + vxor(Abu, Abu, Du); + vROL(BCa, Abu, 27); + vxor(Aga, Aga, Da); + vROL(BCe, Aga, 36); + vxor(Ake, Ake, De); + vROL(BCi, Ake, 10); + vxor(Ami, Ami, Di); + vROL(BCo, Ami, 15); + vxor(Aso, Aso, Do); + vROL(BCu, Aso, 56); + vXNA(Ema, BCa, BCe, BCi); + vXNA(Eme, BCe, BCi, BCo); + vXNA(Emi, BCi, BCo, BCu); + vXNA(Emo, BCo, BCu, BCa); + vXNA(Emu, BCu, BCa, BCe); + + vxor(Abi, Abi, Di); + vROL(BCa, Abi, 62); + vxor(Ago, Ago, Do); + vROL(BCe, Ago, 55); + vxor(Aku, Aku, Du); + vROL(BCi, Aku, 39); + vxor(Ama, Ama, Da); + vROL(BCo, Ama, 41); + vxor(Ase, Ase, De); + vROL(BCu, Ase, 2); + vXNA(Esa, BCa, BCe, BCi); + vXNA(Ese, BCe, BCi, BCo); + vXNA(Esi, BCi, BCo, BCu); + vXNA(Eso, BCo, BCu, BCa); + vXNA(Esu, BCu, BCa, BCe); + + // Next Round + + // prepareTheta + vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); + vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); + vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); + vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); + vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Eba, Eba, Da); + vxor(Ege, Ege, De); + vROL(BCe, Ege, 44); + vxor(Eki, Eki, Di); + vROL(BCi, Eki, 43); + vxor(Emo, Emo, Do); + vROL(BCo, Emo, 21); + vxor(Esu, Esu, Du); + vROL(BCu, Esu, 14); + vXNA(Aba, Eba, BCe, BCi); + vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); + vXNA(Abe, BCe, BCi, BCo); + vXNA(Abi, BCi, BCo, BCu); + vXNA(Abo, BCo, BCu, Eba); + vXNA(Abu, BCu, Eba, BCe); + + vxor(Ebo, Ebo, Do); + vROL(BCa, Ebo, 28); + vxor(Egu, Egu, Du); + vROL(BCe, Egu, 20); + vxor(Eka, Eka, Da); + vROL(BCi, Eka, 3); + vxor(Eme, Eme, De); + vROL(BCo, Eme, 45); + vxor(Esi, Esi, Di); + vROL(BCu, Esi, 61); + vXNA(Aga, BCa, BCe, BCi); + vXNA(Age, BCe, BCi, BCo); + vXNA(Agi, BCi, BCo, BCu); + vXNA(Ago, BCo, BCu, BCa); + vXNA(Agu, BCu, BCa, BCe); + + vxor(Ebe, Ebe, De); + vROL(BCa, Ebe, 1); + vxor(Egi, Egi, Di); + vROL(BCe, Egi, 6); + vxor(Eko, Eko, Do); + vROL(BCi, Eko, 25); + 
vxor(Emu, Emu, Du); + vROL(BCo, Emu, 8); + vxor(Esa, Esa, Da); + vROL(BCu, Esa, 18); + vXNA(Aka, BCa, BCe, BCi); + vXNA(Ake, BCe, BCi, BCo); + vXNA(Aki, BCi, BCo, BCu); + vXNA(Ako, BCo, BCu, BCa); + vXNA(Aku, BCu, BCa, BCe); + + vxor(Ebu, Ebu, Du); + vROL(BCa, Ebu, 27); + vxor(Ega, Ega, Da); + vROL(BCe, Ega, 36); + vxor(Eke, Eke, De); + vROL(BCi, Eke, 10); + vxor(Emi, Emi, Di); + vROL(BCo, Emi, 15); + vxor(Eso, Eso, Do); + vROL(BCu, Eso, 56); + vXNA(Ama, BCa, BCe, BCi); + vXNA(Ame, BCe, BCi, BCo); + vXNA(Ami, BCi, BCo, BCu); + vXNA(Amo, BCo, BCu, BCa); + vXNA(Amu, BCu, BCa, BCe); + + vxor(Ebi, Ebi, Di); + vROL(BCa, Ebi, 62); + vxor(Ego, Ego, Do); + vROL(BCe, Ego, 55); + vxor(Eku, Eku, Du); + vROL(BCi, Eku, 39); + vxor(Ema, Ema, Da); + vROL(BCo, Ema, 41); + vxor(Ese, Ese, De); + vROL(BCu, Ese, 2); + vXNA(Asa, BCa, BCe, BCi); + vXNA(Ase, BCe, BCi, BCo); + vXNA(Asi, BCi, BCo, BCu); + vXNA(Aso, BCo, BCu, BCa); + vXNA(Asu, BCu, BCa, BCe); + } + + state[0] = Aba; + state[1] = Abe; + state[2] = Abi; + state[3] = Abo; + state[4] = Abu; + state[5] = Aga; + state[6] = Age; + state[7] = Agi; + state[8] = Ago; + state[9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; + #endif +} + +/************************************************* +* Name: keccakx2_absorb +* +* Description: Absorb step of Keccak; +* non-incremental, starts by zeroeing the state. +* +* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - const uint8_t *m: pointer to input to be absorbed into s +* - size_t mlen: length of input in bytes +* - uint8_t p: domain-separation byte for different +* Keccak-derived functions +**************************************************/ +static +void keccakx2_absorb(v128 s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen, + uint8_t p) { + size_t i, pos = 0; + + // Declare SIMD registers + v128 tmp, mask; + uint64x1_t a, b; + uint64x2_t a1, b1, atmp1, btmp1; + uint64x2x2_t a2, b2, atmp2, btmp2; + // End + + for (i = 0; i < 25; ++i) { + s[i] = vdupq_n_u64(0); + } + + // Load in0[i] to register, then in1[i] to register, exchange them + while (inlen >= r) { + for (i = 0; i < r / 8 - 1; i += 4) { + a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); + b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); + atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); + // AC = zip2(AB and CD) + btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); + btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); + + vxor(s[i + 0], s[i + 0], atmp2.val[0]); + vxor(s[i + 1], s[i + 1], btmp2.val[0]); + vxor(s[i + 2], s[i + 2], atmp2.val[1]); + vxor(s[i + 3], s[i + 3], btmp2.val[1]); + + pos += 8 * 2 * 2; + } + // Last iteration + i = r / 8 - 1; + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + pos += 8; + + KeccakF1600_StatePermutex2(s); + inlen -= r; + } + + i = 0; + while (inlen >= 16) { + a1 = vld1q_u64((uint64_t *)&in0[pos]); + b1 = vld1q_u64((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp1 = vzip1q_u64(a1, b1); + // AC = zip2(AB and CD) + btmp1 = vzip2q_u64(a1, b1); + + vxor(s[i + 0], s[i + 0], atmp1); + vxor(s[i + 1], s[i + 1], btmp1); + + 
i += 2; + pos += 8 * 2; + inlen -= 8 * 2; + } + + if (inlen >= 8) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + + i++; + pos += 8; + inlen -= 8; + } + + if (inlen) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); + tmp = vandq_u64(tmp, mask); + vxor(s[i], s[i], tmp); + } + + tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); + vxor(s[i], s[i], tmp); + + mask = vdupq_n_u64(1ULL << 63); + vxor(s[r / 8 - 1], s[r / 8 - 1], mask); +} + +/************************************************* +* Name: keccak_squeezeblocks +* +* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. +* Modifies the state. Can be called multiple times to keep +* squeezing, i.e., is incremental. +* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed (written to h) +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - uint64_t *s: pointer to input/output Keccak state +**************************************************/ +static +void keccakx2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + unsigned int r, + v128 s[25]) { + unsigned int i; + + uint64x1_t a, b; + uint64x2x2_t a2, b2; + + while (nblocks > 0) { + KeccakF1600_StatePermutex2(s); + + for (i = 0; i < r / 8 - 1; i += 4) { + a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); + b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); + a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); + b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); + vst1q_u64_x2((uint64_t *)out0, a2); + vst1q_u64_x2((uint64_t *)out1, b2); + + out0 += 32; + out1 += 32; + } + + i = r / 8 - 1; + // Last iteration + a = vget_low_u64(s[i]); + b = vget_high_u64(s[i]); + vst1_u64((uint64_t *)out0, a); + vst1_u64((uint64_t *)out1, b); + + out0 += 8; + out1 += 8; + + --nblocks; + } +} + +/************************************************* +* Name: shake128x2_absorb +* +* Description: Absorb step of the SHAKE128 XOF. +* non-incremental, starts by zeroeing the state. +* +* Arguments: - keccakx2_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake128x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake128_squeezeblocks +* +* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of +* SHAKE128_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. +* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed +* (written to output) +* - keccakx2_state *s: pointer to input/output Keccak state +**************************************************/ +void shake128x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state) { + keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); +} + +/************************************************* +* Name: shake256_absorb +* +* Description: Absorb step of the SHAKE256 XOF. +* non-incremental, starts by zeroeing the state. 
+* +* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake256x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake256_squeezeblocks +* +* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of +* SHAKE256_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. +* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed +* (written to output) +* - keccakx2_state *s: pointer to input/output Keccak state +**************************************************/ +void shake256x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state) { + keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); +} + +/************************************************* +* Name: shake128 +* +* Description: SHAKE128 XOF with non-incremental API +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: requested output length in bytes +* - const uint8_t *in: pointer to input +* - size_t inlen: length of input in bytes +**************************************************/ +void shake128x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[2][SHAKE128_RATE]; + keccakx2_state state; + + shake128x2_absorb(&state, in0, in1, inlen); + shake128x2_squeezeblocks(out0, out1, nblocks, &state); + + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + + if (outlen) { + shake128x2_squeezeblocks(t[0], t[1], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + } + } +} + +/************************************************* +* Name: shake256 +* +* Description: SHAKE256 XOF with non-incremental API +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: requested output length in bytes +* - const uint8_t *in: pointer to input +* - size_t inlen: length of input in bytes +**************************************************/ +void shake256x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE256_RATE; + uint8_t t[2][SHAKE256_RATE]; + keccakx2_state state; + + shake256x2_absorb(&state, in0, in1, inlen); + shake256x2_squeezeblocks(out0, out1, nblocks, &state); + + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + shake256x2_squeezeblocks(t[0], t[1], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + } + } +} diff --git a/pqcrypto-internals/cfiles/keccak2x/fips202x2.h b/pqcrypto-internals/cfiles/keccak2x/fips202x2.h new file mode 100644 index 00000000..1274a6d2 --- /dev/null +++ b/pqcrypto-internals/cfiles/keccak2x/fips202x2.h @@ -0,0 +1,60 @@ +#ifndef FIPS202X2_H +#define FIPS202X2_H + +/* + * This file is licensed + * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) + * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber 
or
+ * public domain at https://github.com/cothan/kyber/blob/master/neon
+ */
+
+#include <arm_neon.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64x2_t v128;
+
+#define SHAKE128_RATE 168
+#define SHAKE256_RATE 136
+#define SHA3_256_RATE 136
+#define SHA3_512_RATE 72
+
+typedef struct {
+    v128 s[25];
+} keccakx2_state;
+
+void shake128x2_absorb(keccakx2_state *state,
+                       const uint8_t *in0,
+                       const uint8_t *in1,
+                       size_t inlen);
+
+void shake128x2_squeezeblocks(uint8_t *out0,
+                              uint8_t *out1,
+                              size_t nblocks,
+                              keccakx2_state *state);
+
+void shake256x2_absorb(keccakx2_state *state,
+                       const uint8_t *in0,
+                       const uint8_t *in1,
+                       size_t inlen);
+
+void shake256x2_squeezeblocks(uint8_t *out0,
+                              uint8_t *out1,
+                              size_t nblocks,
+                              keccakx2_state *state);
+
+void shake128x2(uint8_t *out0,
+                uint8_t *out1,
+                size_t outlen,
+                const uint8_t *in0,
+                const uint8_t *in1,
+                size_t inlen);
+
+void shake256x2(uint8_t *out0,
+                uint8_t *out1,
+                size_t outlen,
+                const uint8_t *in0,
+                const uint8_t *in1,
+                size_t inlen);
+
+#endif
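
Usage sketch (illustrative only, not part of the diff above): a minimal example, assuming an aarch64 toolchain and linking against the keccak2x objects built by this patch, of how the two-lane SHAKE256 API declared in fips202x2.h can be called. The file name, inputs, and output length are invented for the example; each output lane is expected to match a single-lane SHAKE256 of the corresponding input.

    /* shake256x2_demo.c -- hypothetical standalone example */
    #include <stdio.h>
    #include <stdint.h>
    #include "fips202x2.h"

    int main(void) {
        const uint8_t in0[4] = {0, 1, 2, 3};
        const uint8_t in1[4] = {4, 5, 6, 7};
        uint8_t out0[64], out1[64];

        /* Absorb both inputs (same length) and squeeze 64 bytes per lane;
           the two SHAKE256 instances run interleaved in one vectorized
           Keccak-f[1600] state. */
        shake256x2(out0, out1, sizeof out0, in0, in1, sizeof in0);

        for (size_t i = 0; i < sizeof out0; i++) {
            printf("%02x", out0[i]);
        }
        printf("\n");
        return 0;
    }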