diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
index 1c4a474a256..4bf9fee32e1 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -1291,6 +1291,24 @@ class MacroAssembler: public Assembler {
                        Register zlen, Register tmp1, Register tmp2, Register tmp3,
                        Register tmp4, Register tmp5, Register tmp6, Register tmp7);
   void mul_add(Register out, Register in, Register offs, Register len, Register k);
+  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
+                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
+                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3);
+  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
+                    FloatRegister p, FloatRegister z, FloatRegister t1);
+  void ghash_processBlocks_wide(address p, Register state, Register subkeyH,
+                                Register data, Register blocks, int unrolls);
+  void ghash_modmul (FloatRegister result,
+                     FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
+                     FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
+                     FloatRegister t1, FloatRegister t2, FloatRegister t3);
+
+  void aesenc_loadkeys(Register key, Register keylen);
+  void aesecb_encrypt(Register from, Register to, Register keylen,
+                      FloatRegister data = v0, int unrolls = 1);
+  void aesecb_decrypt(Register from, Register to, Register key, Register keylen);
+  void aes_round(FloatRegister input, FloatRegister subkey);
+
   // ISB may be needed because of a safepoint
   void maybe_isb() { isb(); }
 
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64_aes.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_aes.cpp
new file mode 100644
index 00000000000..588ef67d7ad
--- /dev/null
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_aes.cpp
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "macroAssembler_aarch64.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/stubRoutines.hpp"
+
+void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) {
+  Label L_doLast;
+
+  ld1(v0, T16B, from); // get 16 bytes of input
+
+  ld1(v5, T16B, post(key, 16));
+  rev32(v5, T16B, v5);
+
+  ld1(v1, v2, v3, v4, T16B, post(key, 64));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+  rev32(v3, T16B, v3);
+  rev32(v4, T16B, v4);
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+  aesimc(v0, v0);
+  aesd(v0, v3);
+  aesimc(v0, v0);
+  aesd(v0, v4);
+  aesimc(v0, v0);
+
+  ld1(v1, v2, v3, v4, T16B, post(key, 64));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+  rev32(v3, T16B, v3);
+  rev32(v4, T16B, v4);
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+  aesimc(v0, v0);
+  aesd(v0, v3);
+  aesimc(v0, v0);
+  aesd(v0, v4);
+  aesimc(v0, v0);
+
+  ld1(v1, v2, T16B, post(key, 32));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+
+  cmpw(keylen, 44);
+  br(Assembler::EQ, L_doLast);
+
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+  aesimc(v0, v0);
+
+  ld1(v1, v2, T16B, post(key, 32));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+
+  cmpw(keylen, 52);
+  br(Assembler::EQ, L_doLast);
+
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+  aesimc(v0, v0);
+
+  ld1(v1, v2, T16B, post(key, 32));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+
+  bind(L_doLast);
+
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+
+  eor(v0, T16B, v0, v5);
+
+  st1(v0, T16B, to);
+
+  // Preserve the address of the start of the key
+  sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
+}
+
+// Load expanded key into v17..v31
+void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) {
+  Label L_loadkeys_44, L_loadkeys_52;
+  cmpw(keylen, 52);
+  br(Assembler::LO, L_loadkeys_44);
+  br(Assembler::EQ, L_loadkeys_52);
+
+  ld1(v17, v18,  T16B,  post(key, 32));
+  rev32(v17,  T16B, v17);
+  rev32(v18,  T16B, v18);
+  bind(L_loadkeys_52);
+  ld1(v19, v20,  T16B,  post(key, 32));
+  rev32(v19,  T16B, v19);
+  rev32(v20,  T16B, v20);
+  bind(L_loadkeys_44);
+  ld1(v21, v22, v23, v24,  T16B,  post(key, 64));
+  rev32(v21,  T16B, v21);
+  rev32(v22,  T16B, v22);
+  rev32(v23,  T16B, v23);
+  rev32(v24,  T16B, v24);
+  ld1(v25, v26, v27, v28,  T16B,  post(key, 64));
+  rev32(v25,  T16B, v25);
+  rev32(v26,  T16B, v26);
+  rev32(v27,  T16B, v27);
+  rev32(v28,  T16B, v28);
+  ld1(v29, v30, v31,  T16B, post(key, 48));
+  rev32(v29,  T16B, v29);
+  rev32(v30,  T16B, v30);
+  rev32(v31,  T16B, v31);
+
+  // Preserve the address of the start of the key
+  sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
+}
+
+// NeoverseTM N1Software Optimization Guide:
+// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC
+// instruction pairs will exhibit the performance characteristics
+// described in Section 4.6.
+void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) {
+  aese(input, subkey); aesmc(input, input);
+}
+
+// KernelGenerator
+//
+// The abstract base class of an unrolled function generator.
+// Subclasses override generate(), length(), and next() to generate
+// unrolled and interleaved functions.
+//
+// The core idea is that a subclass defines a method which generates
+// the base case of a function and a method to generate a clone of it,
+// shifted to a different set of registers. KernelGenerator will then
+// generate several interleaved copies of the function, with each one
+// using a different set of registers.
+
+// The subclass must implement three methods: length(), which is the
+// number of instruction bundles in the intrinsic, generate(int n)
+// which emits the nth instruction bundle in the intrinsic, and next()
+// which takes an instance of the generator and returns a version of it,
+// shifted to a new set of registers.
+
+class KernelGenerator: public MacroAssembler {
+protected:
+  const int _unrolls;
+public:
+  KernelGenerator(Assembler *as, int unrolls)
+    : MacroAssembler(as->code()), _unrolls(unrolls) { }
+  virtual void generate(int index) = 0;
+  virtual int length() = 0;
+  virtual KernelGenerator *next() = 0;
+  int unrolls() { return _unrolls; }
+  void unroll();
+};
+
+void KernelGenerator::unroll() {
+  ResourceMark rm;
+  KernelGenerator **generators
+    = NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls());
+
+  generators[0] = this;
+  for (int i = 1; i < unrolls(); i++) {
+    generators[i] = generators[i-1]->next();
+  }
+
+  for (int j = 0; j < length(); j++) {
+    for (int i = 0; i < unrolls(); i++) {
+      generators[i]->generate(j);
+    }
+  }
+}
+
+// An unrolled and interleaved generator for AES encryption.
+class AESKernelGenerator: public KernelGenerator {
+  Register _from, _to;
+  const Register _keylen;
+  FloatRegister _data;
+  const FloatRegister _subkeys;
+  bool _once;
+  Label _rounds_44, _rounds_52;
+
+public:
+  AESKernelGenerator(Assembler *as, int unrolls,
+                     Register from, Register to, Register keylen, FloatRegister data,
+                     FloatRegister subkeys, bool once = true)
+    : KernelGenerator(as, unrolls),
+      _from(from), _to(to), _keylen(keylen), _data(data),
+      _subkeys(subkeys), _once(once) {
+  }
+
+  virtual void generate(int index) {
+    switch (index) {
+    case  0:
+      if (_from != noreg) {
+        ld1(_data, T16B, _from); // get 16 bytes of input
+      }
+      break;
+    case  1:
+      if (_once) {
+        cmpw(_keylen, 52);
+        br(Assembler::LO, _rounds_44);
+        br(Assembler::EQ, _rounds_52);
+      }
+      break;
+    case  2:  aes_round(_data, _subkeys +  0);  break;
+    case  3:  aes_round(_data, _subkeys +  1);  break;
+    case  4:
+      if (_once)  bind(_rounds_52);
+      break;
+    case  5:  aes_round(_data, _subkeys +  2);  break;
+    case  6:  aes_round(_data, _subkeys +  3);  break;
+    case  7:
+      if (_once)  bind(_rounds_44);
+      break;
+    case  8:  aes_round(_data, _subkeys +  4);  break;
+    case  9:  aes_round(_data, _subkeys +  5);  break;
+    case 10:  aes_round(_data, _subkeys +  6);  break;
+    case 11:  aes_round(_data, _subkeys +  7);  break;
+    case 12:  aes_round(_data, _subkeys +  8);  break;
+    case 13:  aes_round(_data, _subkeys +  9);  break;
+    case 14:  aes_round(_data, _subkeys + 10);  break;
+    case 15:  aes_round(_data, _subkeys + 11);  break;
+    case 16:  aes_round(_data, _subkeys + 12);  break;
+    case 17:  aese(_data, _subkeys + 13);  break;
+    case 18:  eor(_data, T16B, _data, _subkeys + 14);  break;
+    case 19:
+      if (_to != noreg) {
+        st1(_data, T16B, _to);
+      }
+      break;
+    default: ShouldNotReachHere();
+    }
+  }
+
+  virtual KernelGenerator *next() {
+    return new AESKernelGenerator(this, _unrolls,
+                                  _from, _to, _keylen,
+                                  _data + 1, _subkeys, /*once*/false);
+  }
+
+  virtual int length() { return 20; }
+};
+
+// Uses expanded key in v17..v31
+// Returns encrypted values in inputs.
+// If to != noreg, store value at to; likewise from
+// Preserves key, keylen
+// Increments from, to
+// Input data in v0, v1, ...
+// unrolls controls the number of times to unroll the generated function
+void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
+                                    FloatRegister data, int unrolls) {
+  AESKernelGenerator(this, unrolls, from, to, keylen, data, v17) .unroll();
+}
+
+// ghash_multiply and ghash_reduce are the non-unrolled versions of
+// the GHASH function generators.
+void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
+                                     FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
+                                     FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
+  // Karatsuba multiplication performs a 128*128 -> 256-bit
+  // multiplication in three 128-bit multiplications and a few
+  // additions.
+  //
+  // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
+  // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
+  //
+  // Inputs:
+  //
+  // A0 in a.d[0]     (subkey)
+  // A1 in a.d[1]
+  // (A1+A0) in a1_xor_a0.d[0]
+  //
+  // B0 in b.d[0]     (state)
+  // B1 in b.d[1]
+
+  ext(tmp1, T16B, b, b, 0x08);
+  pmull2(result_hi, T1Q, b, a, T2D);  // A1*B1
+  eor(tmp1, T16B, tmp1, b);           // (B1+B0)
+  pmull(result_lo,  T1Q, b, a, T1D);  // A0*B0
+  pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0)
+
+  ext(tmp1, T16B, result_lo, result_hi, 0x08);
+  eor(tmp3, T16B, result_hi, result_lo); // A1*B1+A0*B0
+  eor(tmp2, T16B, tmp2, tmp1);
+  eor(tmp2, T16B, tmp2, tmp3);
+
+  // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
+  ins(result_hi, D, tmp2, 0, 1);
+  ins(result_lo, D, tmp2, 1, 0);
+}
+
+void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
+                  FloatRegister p, FloatRegister vzr, FloatRegister t1) {
+  const FloatRegister t0 = result;
+
+  // The GCM field polynomial f is z^128 + p(z), where p =
+  // z^7+z^2+z+1.
+  //
+  //    z^128 === -p(z)  (mod (z^128 + p(z)))
+  //
+  // so, given that the product we're reducing is
+  //    a == lo + hi * z^128
+  // substituting,
+  //      === lo - hi * p(z)  (mod (z^128 + p(z)))
+  //
+  // we reduce by multiplying hi by p(z) and subtracting the result
+  // from (i.e. XORing it with) lo.  Because p has no nonzero high
+  // bits we can do this with two 64-bit multiplications, lo*p and
+  // hi*p.
+
+  pmull2(t0, T1Q, hi, p, T2D);
+  ext(t1, T16B, t0, vzr, 8);
+  eor(hi, T16B, hi, t1);
+  ext(t1, T16B, vzr, t0, 8);
+  eor(lo, T16B, lo, t1);
+  pmull(t0, T1Q, hi, p, T1D);
+  eor(result, T16B, lo, t0);
+}
+
+class GHASHMultiplyGenerator: public KernelGenerator {
+  FloatRegister _result_lo, _result_hi, _b,
+    _a, _vzr, _a1_xor_a0, _p,
+    _tmp1, _tmp2, _tmp3;
+
+public:
+  GHASHMultiplyGenerator(Assembler *as, int unrolls,
+                         FloatRegister result_lo, FloatRegister result_hi,
+                         /* offsetted registers */
+                         FloatRegister b,
+                         /* non-offsetted (shared) registers */
+                         FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
+                         /* offseted (temp) registers */
+                         FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
+    : KernelGenerator(as, unrolls),
+      _result_lo(result_lo), _result_hi(result_hi), _b(b),
+      _a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
+      _tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }
+
+  static const int register_stride = 7;
+
+  virtual void generate(int index) {
+    // Karatsuba multiplication performs a 128*128 -> 256-bit
+    // multiplication in three 128-bit multiplications and a few
+    // additions.
+    //
+    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
+    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
+    //
+    // Inputs:
+    //
+    // A0 in a.d[0]     (subkey)
+    // A1 in a.d[1]
+    // (A1+A0) in a1_xor_a0.d[0]
+    //
+    // B0 in b.d[0]     (state)
+    // B1 in b.d[1]
+
+    switch (index) {
+      case  0:  ext(_tmp1, T16B, _b, _b, 0x08);  break;
+      case  1:  pmull2(_result_hi, T1Q, _b, _a, T2D);  // A1*B1
+        break;
+      case  2:  eor(_tmp1, T16B, _tmp1, _b);           // (B1+B0)
+        break;
+      case  3:  pmull(_result_lo,  T1Q, _b, _a, T1D);  // A0*B0
+        break;
+      case  4:  pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0)
+        break;
+
+      case  5:  ext(_tmp1, T16B, _result_lo, _result_hi, 0x08);  break;
+      case  6:  eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0
+        break;
+      case  7:  eor(_tmp2, T16B, _tmp2, _tmp1);  break;
+      case  8:  eor(_tmp2, T16B, _tmp2, _tmp3);  break;
+
+        // Register pair <_result_hi:_result_lo> holds the _result of carry-less multiplication
+      case  9:  ins(_result_hi, D, _tmp2, 0, 1);  break;
+      case 10:  ins(_result_lo, D, _tmp2, 1, 0);  break;
+      default: ShouldNotReachHere();
+    }
+  }
+
+  virtual KernelGenerator *next() {
+    GHASHMultiplyGenerator *result
+      = new GHASHMultiplyGenerator(this, _unrolls, _result_lo, _result_hi,
+                                   _b, _a, _a1_xor_a0, _p, _vzr,
+                                   _tmp1, _tmp2, _tmp3);
+    result->_result_lo += register_stride;
+    result->_result_hi += register_stride;
+    result->_b += register_stride;
+    result->_tmp1 += register_stride;
+    result->_tmp2 += register_stride;
+    result->_tmp3 += register_stride;
+    return result;
+  }
+
+  virtual int length() { return 11; }
+};
+
+// Reduce the 128-bit product in hi:lo by the GCM field polynomial.
+// The FloatRegister argument called data is optional: if it is a
+// valid register, we interleave LD1 instructions with the
+// reduction. This is to reduce latency next time around the loop.
+class GHASHReduceGenerator: public KernelGenerator {
+  FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
+  int _once;
+public:
+  GHASHReduceGenerator(Assembler *as, int unrolls,
+                       /* offsetted registers */
+                       FloatRegister result, FloatRegister lo, FloatRegister hi,
+                       /* non-offsetted (shared) registers */
+                       FloatRegister p, FloatRegister vzr, FloatRegister data,
+                       /* offseted (temp) registers */
+                       FloatRegister t1)
+    : KernelGenerator(as, unrolls),
+      _result(result), _lo(lo), _hi(hi),
+      _p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
+
+  static const int register_stride = 7;
+
+  virtual void generate(int index) {
+    const FloatRegister t0 = _result;
+
+    switch (index) {
+      // The GCM field polynomial f is z^128 + p(z), where p =
+      // z^7+z^2+z+1.
+      //
+      //    z^128 === -p(z)  (mod (z^128 + p(z)))
+      //
+      // so, given that the product we're reducing is
+      //    a == lo + hi * z^128
+      // substituting,
+      //      === lo - hi * p(z)  (mod (z^128 + p(z)))
+      //
+      // we reduce by multiplying hi by p(z) and subtracting the _result
+      // from (i.e. XORing it with) lo.  Because p has no nonzero high
+      // bits we can do this with two 64-bit multiplications, lo*p and
+      // hi*p.
+
+      case  0:  pmull2(t0, T1Q, _hi, _p, T2D);  break;
+      case  1:  ext(_t1, T16B, t0, _vzr, 8);  break;
+      case  2:  eor(_hi, T16B, _hi, _t1);  break;
+      case  3:  ext(_t1, T16B, _vzr, t0, 8);  break;
+      case  4:  eor(_lo, T16B, _lo, _t1);  break;
+      case  5:  pmull(t0, T1Q, _hi, _p, T1D);  break;
+      case  6:  eor(_result, T16B, _lo, t0);  break;
+      default: ShouldNotReachHere();
+    }
+
+    // Sprinkle load instructions into the generated instructions
+    if (_data->is_valid() && _once) {
+      assert(length() >= unrolls(), "not enough room for inteleaved loads");
+      if (index < unrolls()) {
+        ld1((_data + index*register_stride), T16B, post(r2, 0x10));
+      }
+    }
+  }
+
+  virtual KernelGenerator *next() {
+    GHASHReduceGenerator *result
+      = new GHASHReduceGenerator(this, _unrolls,
+                                 _result, _lo, _hi, _p, _vzr, _data, _t1);
+    result->_result += register_stride;
+    result->_hi += register_stride;
+    result->_lo += register_stride;
+    result->_t1 += register_stride;
+    result->_once = false;
+    return result;
+  }
+
+ int length() { return 7; }
+};
+
+// Perform a GHASH multiply/reduce on a single FloatRegister.
+void MacroAssembler::ghash_modmul(FloatRegister result,
+                                  FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
+                                  FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
+                                  FloatRegister t1, FloatRegister t2, FloatRegister t3) {
+  ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);
+  ghash_reduce(result, result_lo, result_hi, p, vzr, t1);
+}
+
+// Interleaved GHASH processing.
+//
+// Clobbers all vector registers.
+//
+void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
+                                              Register subkeyH,
+                                              Register data, Register blocks, int unrolls) {
+  int register_stride = 7;
+
+  // Bafflingly, GCM uses little-endian for the byte order, but
+  // big-endian for the bit order.  For example, the polynomial 1 is
+  // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
+  //
+  // So, we must either reverse the bytes in each word and do
+  // everything big-endian or reverse the bits in each byte and do
+  // it little-endian.  On AArch64 it's more idiomatic to reverse
+  // the bits in each byte (we have an instruction, RBIT, to do
+  // that) and keep the data in little-endian bit order throught the
+  // calculation, bit-reversing the inputs and outputs.
+
+  assert(unrolls * register_stride < 32, "out of registers");
+
+  FloatRegister a1_xor_a0 = v28;
+  FloatRegister Hprime = v29;
+  FloatRegister vzr = v30;
+  FloatRegister p = v31;
+  eor(vzr, T16B, vzr, vzr); // zero register
+
+  ldrq(p, field_polynomial);    // The field polynomial
+
+  ldrq(v0, Address(state));
+  ldrq(Hprime, Address(subkeyH));
+
+  rev64(v0, T16B, v0);          // Bit-reverse words in state and subkeyH
+  rbit(v0, T16B, v0);
+  rev64(Hprime, T16B, Hprime);
+  rbit(Hprime, T16B, Hprime);
+
+  // Powers of H -> Hprime
+
+  Label already_calculated, done;
+  {
+    // The first time around we'll have to calculate H**2, H**3, etc.
+    // Look at the largest power of H in the subkeyH array to see if
+    // it's already been calculated.
+    ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
+    orr(rscratch1, rscratch1, rscratch2);
+    cbnz(rscratch1, already_calculated);
+
+    orr(v6, T16B, Hprime, Hprime);  // Start with H in v6 and Hprime
+    for (int i = 1; i < unrolls; i++) {
+      ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
+      eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+      ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6,
+                   Hprime, vzr, a1_xor_a0, p,
+                   /*temps*/v1, v3, v2);
+      rev64(v1, T16B, v6);
+      rbit(v1, T16B, v1);
+      strq(v1, Address(subkeyH, 16 * i));
+    }
+    b(done);
+  }
+  {
+    bind(already_calculated);
+
+    // Load the largest power of H we need into v6.
+    ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
+    rev64(v6, T16B, v6);
+    rbit(v6, T16B, v6);
+  }
+  bind(done);
+
+  orr(Hprime, T16B, v6, v6);     // Move H ** unrolls into Hprime
+
+  // Hprime contains (H ** 1, H ** 2, ... H ** unrolls)
+  // v0 contains the initial state. Clear the others.
+  for (int i = 1; i < unrolls; i++) {
+    int ofs = register_stride * i;
+    eor(ofs+v0, T16B, ofs+v0, ofs+v0); // zero each state register
+  }
+
+  ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
+  eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+
+  // Load #unrolls blocks of data
+  for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
+    ld1(v2+ofs, T16B, post(data, 0x10));
+  }
+
+  // Register assignments, replicated across 4 clones, v0 ... v23
+  //
+  // v0: input / output: current state, result of multiply/reduce
+  // v1: temp
+  // v2: input: one block of data (the ciphertext)
+  //     also used as a temp once the data has been consumed
+  // v3: temp
+  // v4: output: high part of product
+  // v5: output: low part ...
+  // v6: unused
+  //
+  // Not replicated:
+  //
+  // v28: High part of H xor low part of H'
+  // v29: H' (hash subkey)
+  // v30: zero
+  // v31: Reduction polynomial of the Galois field
+
+  // Inner loop.
+  // Do the whole load/add/multiply/reduce over all our data except
+  // the last few rows.
+  {
+    Label L_ghash_loop;
+    bind(L_ghash_loop);
+
+    // Prefetching doesn't help here. In fact, on Neoverse N1 it's worse.
+    // prfm(Address(data, 128), PLDL1KEEP);
+
+    // Xor data into current state
+    for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
+      rbit((v2+ofs), T16B, (v2+ofs));
+      eor((v2+ofs), T16B, v0+ofs, (v2+ofs));   // bit-swapped data ^ bit-swapped state
+    }
+
+    // Generate fully-unrolled multiply-reduce in two stages.
+
+    (new GHASHMultiplyGenerator(this, unrolls,
+                                /*result_lo*/v5, /*result_hi*/v4, /*data*/v2,
+                                Hprime, a1_xor_a0, p, vzr,
+                                /*temps*/v1, v3, /* reuse b*/v2))->unroll();
+
+    // NB: GHASHReduceGenerator also loads the next #unrolls blocks of
+    // data into v0, v0+ofs, the current state.
+    (new GHASHReduceGenerator (this, unrolls,
+                               /*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr,
+                               /*data*/v2, /*temp*/v3))->unroll();
+
+    sub(blocks, blocks, unrolls);
+    cmp(blocks, (unsigned char)(unrolls * 2));
+    br(GE, L_ghash_loop);
+  }
+
+  // Merge the #unrolls states.  Note that the data for the next
+  // iteration has already been loaded into v4, v4+ofs, etc...
+
+  // First, we multiply/reduce each clone by the appropriate power of H.
+  for (int i = 0; i < unrolls; i++) {
+    int ofs = register_stride * i;
+    ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1)));
+
+    rbit(v2+ofs, T16B, v2+ofs);
+    eor(v2+ofs, T16B, ofs+v0, v2+ofs);   // bit-swapped data ^ bit-swapped state
+
+    rev64(Hprime, T16B, Hprime);
+    rbit(Hprime, T16B, Hprime);
+    ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
+    eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+    ghash_modmul(/*result*/v0+ofs, /*result_lo*/v5+ofs, /*result_hi*/v4+ofs, /*b*/v2+ofs,
+                 Hprime, vzr, a1_xor_a0, p,
+                 /*temps*/v1+ofs, v3+ofs, /* reuse b*/v2+ofs);
+  }
+
+  // Then we sum the results.
+  for (int i = 0; i < unrolls - 1; i++) {
+    int ofs = register_stride * i;
+    eor(v0, T16B, v0, v0 + register_stride + ofs);
+  }
+
+  sub(blocks, blocks, (unsigned char)unrolls);
+
+  // And finally bit-reverse the state back to big endian.
+  rev64(v0, T16B, v0);
+  rbit(v0, T16B, v0);
+  st1(v0, T16B, state);
+}
diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
index 1a41f00bb91..e828464acdf 100644
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@@ -2882,6 +2882,265 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  // CTR AES crypt.
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - counter vector byte array address
+  //   c_rarg4   - input length
+  //   c_rarg5   - saved encryptedCounter start
+  //   c_rarg6   - saved used length
+  //
+  // Output:
+  //   r0       - input length
+  //
+  address generate_counterMode_AESCrypt() {
+    const Register in = c_rarg0;
+    const Register out = c_rarg1;
+    const Register key = c_rarg2;
+    const Register counter = c_rarg3;
+    const Register saved_len = c_rarg4, len = r10;
+    const Register saved_encrypted_ctr = c_rarg5;
+    const Register used_ptr = c_rarg6, used = r12;
+
+    const Register offset = r7;
+    const Register keylen = r11;
+
+    const unsigned char block_size = 16;
+    const int bulk_width = 4;
+    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
+    // performance with larger data sizes, but it also means that the
+    // fast path isn't used until you have at least 8 blocks, and up
+    // to 127 bytes of data will be executed on the slow path. For
+    // that reason, and also so as not to blow away too much icache, 4
+    // blocks seems like a sensible compromise.
+
+    // Algorithm:
+    //
+    //    if (len == 0) {
+    //        goto DONE;
+    //    }
+    //    int result = len;
+    //    do {
+    //        if (used >= blockSize) {
+    //            if (len >= bulk_width * blockSize) {
+    //                CTR_large_block();
+    //                if (len == 0)
+    //                    goto DONE;
+    //            }
+    //            for (;;) {
+    //                16ByteVector v0 = counter;
+    //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
+    //                used = 0;
+    //                if (len < blockSize)
+    //                    break;    /* goto NEXT */
+    //                16ByteVector v1 = load16Bytes(in, offset);
+    //                v1 = v1 ^ encryptedCounter;
+    //                store16Bytes(out, offset);
+    //                used = blockSize;
+    //                offset += blockSize;
+    //                len -= blockSize;
+    //                if (len == 0)
+    //                    goto DONE;
+    //            }
+    //        }
+    //      NEXT:
+    //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
+    //        len--;
+    //    } while (len != 0);
+    //  DONE:
+    //    return result;
+    //
+    // CTR_large_block()
+    //    Wide bulk encryption of whole blocks.
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+    const address start = __ pc();
+    __ enter();
+
+    Label DONE, CTR_large_block, large_block_return;
+    __ ldrw(used, Address(used_ptr));
+    __ cbzw(saved_len, DONE);
+
+    __ mov(len, saved_len);
+    __ mov(offset, 0);
+
+    // Compute #rounds for AES based on the length of the key array
+    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+    __ aesenc_loadkeys(key, keylen);
+
+    {
+      Label L_CTR_loop, NEXT;
+
+      __ bind(L_CTR_loop);
+
+      __ cmp(used, block_size);
+      __ br(__ LO, NEXT);
+
+      // Maybe we have a lot of data
+      __ subsw(rscratch1, len, bulk_width * block_size);
+      __ br(__ HS, CTR_large_block);
+      __ BIND(large_block_return);
+      __ cbzw(len, DONE);
+
+      // Setup the counter
+      __ movi(v4, __ T4S, 0);
+      __ movi(v5, __ T4S, 1);
+      __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
+
+      __ ld1(v0, __ T16B, counter); // Load the counter into v0
+      __ rev32(v16, __ T16B, v0);
+      __ addv(v16, __ T4S, v16, v4);
+      __ rev32(v16, __ T16B, v16);
+      __ st1(v16, __ T16B, counter); // Save the incremented counter back
+
+      {
+        // We have fewer than bulk_width blocks of data left. Encrypt
+        // them one by one until there is less than a full block
+        // remaining, being careful to save both the encrypted counter
+        // and the counter.
+
+        Label inner_loop;
+        __ bind(inner_loop);
+        // Counter to encrypt is in v0
+        __ aesecb_encrypt(noreg, noreg, keylen);
+        __ st1(v0, __ T16B, saved_encrypted_ctr);
+
+        // Do we have a remaining full block?
+
+        __ mov(used, 0);
+        __ cmp(len, block_size);
+        __ br(__ LO, NEXT);
+
+        // Yes, we have a full block
+        __ ldrq(v1, Address(in, offset));
+        __ eor(v1, __ T16B, v1, v0);
+        __ strq(v1, Address(out, offset));
+        __ mov(used, block_size);
+        __ add(offset, offset, block_size);
+
+        __ subw(len, len, block_size);
+        __ cbzw(len, DONE);
+
+        // Increment the counter, store it back
+        __ orr(v0, __ T16B, v16, v16);
+        __ rev32(v16, __ T16B, v16);
+        __ addv(v16, __ T4S, v16, v4);
+        __ rev32(v16, __ T16B, v16);
+        __ st1(v16, __ T16B, counter); // Save the incremented counter back
+
+        __ b(inner_loop);
+      }
+
+      __ BIND(NEXT);
+
+      // Encrypt a single byte, and loop.
+      // We expect this to be a rare event.
+      __ ldrb(rscratch1, Address(in, offset));
+      __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
+      __ eor(rscratch1, rscratch1, rscratch2);
+      __ strb(rscratch1, Address(out, offset));
+      __ add(offset, offset, 1);
+      __ add(used, used, 1);
+      __ subw(len, len,1);
+      __ cbnzw(len, L_CTR_loop);
+    }
+
+    __ bind(DONE);
+    __ strw(used, Address(used_ptr));
+    __ mov(r0, saved_len);
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(lr);
+
+    // Bulk encryption
+
+    __ BIND (CTR_large_block);
+    assert(bulk_width == 4 || bulk_width == 8, "must be");
+
+    if (bulk_width == 8) {
+      __ sub(sp, sp, 4 * 16);
+      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
+    }
+    __ sub(sp, sp, 4 * 16);
+    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
+    RegSet saved_regs = (RegSet::of(in, out, offset)
+                         + RegSet::of(saved_encrypted_ctr, used_ptr, len));
+    __ push(saved_regs, sp);
+    __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
+    __ add(in, in, offset);
+    __ add(out, out, offset);
+
+    // Keys should already be loaded into the correct registers
+
+    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
+    __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
+
+    // AES/CTR loop
+    {
+      Label L_CTR_loop;
+      __ BIND(L_CTR_loop);
+
+      // Setup the counters
+      __ movi(v8, __ T4S, 0);
+      __ movi(v9, __ T4S, 1);
+      __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
+
+      for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
+        __ rev32(f, __ T16B, v16);
+        __ addv(v16, __ T4S, v16, v8);
+      }
+
+      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
+
+      // Encrypt the counters
+      __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
+
+      if (bulk_width == 8) {
+        __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
+      }
+
+      // XOR the encrypted counters with the inputs
+      for (int i = 0; i < bulk_width; i++) {
+        __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
+      }
+
+      // Write the encrypted data
+      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
+      if (bulk_width == 8) {
+        __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
+      }
+
+      __ subw(len, len, 16 * bulk_width);
+      __ cbnzw(len, L_CTR_loop);
+    }
+
+    // Save the counter back where it goes
+    __ rev32(v16, __ T16B, v16);
+    __ st1(v16, __ T16B, counter);
+
+    __ pop(saved_regs, sp);
+
+    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
+    if (bulk_width == 8) {
+      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
+    }
+
+    __ andr(rscratch1, len, -16 * bulk_width);
+    __ sub(len, len, rscratch1);
+    __ add(offset, offset, rscratch1);
+    __ mov(used, 16);
+    __ strw(used, Address(used_ptr));
+    __ b(large_block_return);
+
+    return start;
+  }
+
   // Arguments:
   //
   // Inputs:
@@ -4688,6 +4947,55 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  address generate_ghash_processBlocks_wide() {
+    address small = generate_ghash_processBlocks();
+
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
+    __ align(wordSize * 2);
+    address p = __ pc();
+    __ emit_int64(0x87);  // The low-order bits of the field
+                          // polynomial (i.e. p = z^7+z^2+z+1)
+                          // repeated in the low and high parts of a
+                          // 128-bit vector
+    __ emit_int64(0x87);
+
+    __ align(CodeEntryAlignment);
+    address start = __ pc();
+
+    Register state   = c_rarg0;
+    Register subkeyH = c_rarg1;
+    Register data    = c_rarg2;
+    Register blocks  = c_rarg3;
+
+    const int unroll = 4;
+
+    __ cmp(blocks, (unsigned char)(unroll * 2));
+    __ br(__ LT, small);
+
+    if (unroll > 1) {
+    // Save state before entering routine
+      __ sub(sp, sp, 4 * 16);
+      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
+      __ sub(sp, sp, 4 * 16);
+      __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
+    }
+
+    __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
+
+    if (unroll > 1) {
+      // And restore state
+      __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
+      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
+    }
+
+    __ cmp(blocks, 0u);
+    __ br(__ GT, small);
+
+    __ ret(lr);
+
+    return start;
+  }
+
 #ifdef LINUX
 
   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
@@ -6050,7 +6358,11 @@ class StubGenerator: public StubCodeGenerator {
 
     // generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
-      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+      if (UseAESCTRIntrinsics) {
+        StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
+      } else {
+        StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+      }
     }
 
     if (UseBASE64Intrinsics) {
@@ -6064,6 +6376,10 @@ class StubGenerator: public StubCodeGenerator {
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
     }
 
+    if (UseAESCTRIntrinsics) {
+      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
+    }
+
     if (UseSHA1Intrinsics) {
       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
diff --git a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp
index 4e5930b5f94..fe350d87824 100644
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp
@@ -36,7 +36,7 @@ static bool    returns_to_call_stub(address return_pc)   {
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 28000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 32000           // simply increase if too small (assembler will crash if too small)
 };
 
 class aarch64 {
diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
index ae440c47c1b..0136f24f4da 100644
--- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
@@ -216,6 +216,9 @@ void VM_Version::initialize() {
       warning("UseAESIntrinsics enabled, but UseAES not, enabling");
       UseAES = true;
     }
+    if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+      FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+    }
   } else {
     if (UseAES) {
       warning("AES instructions are not available on this CPU");
@@ -225,11 +228,10 @@ void VM_Version::initialize() {
       warning("AES intrinsics are not available on this CPU");
       FLAG_SET_DEFAULT(UseAESIntrinsics, false);
     }
-  }
-
-  if (UseAESCTRIntrinsics) {
-    warning("AES/CTR intrinsics are not available on this CPU");
-    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+    if (UseAESCTRIntrinsics) {
+      warning("AES/CTR intrinsics are not available on this CPU");
+      FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+    }
   }
 
   if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {