diff --git a/build.json b/build.json index 4d3fb4ecef..71489c9dfa 100644 --- a/build.json +++ b/build.json @@ -141,6 +141,7 @@ "perlasm_x86_64": [ {"src": "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"}, {"src": "crypto/fipsmodule/modes/asm/aes-gcm-avx10-x86_64.pl"}, + {"src": "crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl"}, {"src": "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"}, {"src": "crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl"}, {"src": "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"}, diff --git a/crypto/crypto.cc b/crypto/crypto.cc index 912a993c26..ac0928f0eb 100644 --- a/crypto/crypto.cc +++ b/crypto/crypto.cc @@ -54,7 +54,7 @@ static_assert(sizeof(ossl_ssize_t) == sizeof(size_t), // archive, linking on OS X will fail to resolve common symbols. By // initialising it to zero, it becomes a "data symbol", which isn't so // affected. -HIDDEN uint8_t BORINGSSL_function_hit[8] = {0}; +HIDDEN uint8_t BORINGSSL_function_hit[9] = {0}; #endif #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) diff --git a/crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl b/crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl new file mode 100644 index 0000000000..6ea956bc8e --- /dev/null +++ b/crypto/fipsmodule/modes/asm/aes-gcm-avx2-x86_64.pl @@ -0,0 +1,1027 @@ +#!/usr/bin/env perl +# Copyright 2024 The BoringSSL Authors +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# +#------------------------------------------------------------------------------ +# +# VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version) +# +# This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512 +# / AVX10. This means it can only use 16 vector registers instead of 32, the +# maximum vector length is 32 bytes, and some instructions such as vpternlogd +# and masked loads/stores are unavailable. However, it is able to run on CPUs +# that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan" +# server processors) and some Intel client CPUs such as Alder Lake. +# +# This implementation also uses Karatsuba multiplication instead of schoolbook +# multiplication for GHASH in its main loop. This does not help much on Intel, +# but it improves performance by ~5% on AMD Zen 3 which is the main target for +# this implementation. Other factors weighing slightly in favor of Karatsuba +# multiplication in this implementation are the lower maximum vector length +# (which means there is space left in the Htable array to cache the halves of +# the key powers XOR'd together) and the unavailability of the vpternlogd +# instruction (which helped schoolbook a bit more than Karatsuba). 
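
[Editorial note, not part of the patch: the header comment above motivates Karatsuba multiplication for the GHASH main loop. As a rough, standalone illustration of that structure, the following Perl sketch contrasts schoolbook and Karatsuba carry-less multiplication using 32-bit halves (so products fit in native 64-bit integers); the operand values are arbitrary. Schoolbook needs four half-width carry-less multiplies per product, while Karatsuba trades one multiply for extra XORs, which is the trade-off the vpclmulqdq main loop in this file exploits.]

    use strict;

    # Carry-less (GF(2)[x]) multiplication of two 32-bit values.
    sub clmul32 {
        my ( $a, $b ) = @_;
        my $r = 0;
        for my $i ( 0 .. 31 ) {
            $r ^= $a << $i if ( $b >> $i ) & 1;
        }
        return $r;
    }

    # Arbitrary high/low 32-bit halves of two 64-bit operands.
    my ( $a1, $a0 ) = ( 0xdeadbeef, 0x01234567 );
    my ( $b1, $b0 ) = ( 0x0badf00d, 0x89abcdef );

    # Schoolbook: LO, HI, plus a middle term built from two cross products
    # (four half-width multiplies in total).
    my $lo            = clmul32( $a0, $b0 );
    my $hi            = clmul32( $a1, $b1 );
    my $mi_schoolbook = clmul32( $a0, $b1 ) ^ clmul32( $a1, $b0 );

    # Karatsuba: reuse LO and HI, and derive the middle term from a single
    # multiply of the XOR'd halves (three half-width multiplies in total).
    my $mi_karatsuba = clmul32( $a0 ^ $a1, $b0 ^ $b1 ) ^ $lo ^ $hi;

    die "Karatsuba middle term mismatch" unless $mi_schoolbook == $mi_karatsuba;
    printf "middle term: %016x (both methods agree)\n", $mi_karatsuba;

[End of editorial note; the patch continues below.]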
+ +use strict; + +my $flavour = shift; +my $output = shift; +if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; } + +my $win64; +my @argregs; +if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) { + $win64 = 1; + @argregs = ( "%rcx", "%rdx", "%r8", "%r9" ); +} +else { + $win64 = 0; + @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" ); +} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; +my $dir = $1; +my $xlate; +( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate ) + or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate ) + or die "can't locate x86_64-xlate.pl"; + +open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT = *OUT; + +my $g_cur_func_name; +my $g_cur_func_uses_seh; +my @g_cur_func_saved_gpregs; +my @g_cur_func_saved_xmmregs; + +sub _begin_func { + my ( $funcname, $uses_seh ) = @_; + $g_cur_func_name = $funcname; + $g_cur_func_uses_seh = $uses_seh; + @g_cur_func_saved_gpregs = (); + @g_cur_func_saved_xmmregs = (); + return <<___; +.globl $funcname +.type $funcname,\@abi-omnipotent +.align 32 +$funcname: + .cfi_startproc + @{[ $uses_seh ? ".seh_startproc" : "" ]} + _CET_ENDBR +___ +} + +# Push a list of general purpose registers onto the stack. +sub _save_gpregs { + my @gpregs = @_; + my $code = ""; + die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh; + die "_save_gpregs can only be called once per function" + if @g_cur_func_saved_gpregs; + die "Order must be _save_gpregs, then _save_xmmregs" + if @g_cur_func_saved_xmmregs; + @g_cur_func_saved_gpregs = @gpregs; + for my $reg (@gpregs) { + $code .= "push $reg\n"; + if ($win64) { + $code .= ".seh_pushreg $reg\n"; + } + else { + $code .= ".cfi_push $reg\n"; + } + } + return $code; +} + +# Push a list of xmm registers onto the stack if the target is Windows. +sub _save_xmmregs { + my @xmmregs = @_; + my $num_xmmregs = scalar @xmmregs; + my $code = ""; + die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh; + die "_save_xmmregs can only be called once per function" + if @g_cur_func_saved_xmmregs; + if ( $win64 and $num_xmmregs > 0 ) { + @g_cur_func_saved_xmmregs = @xmmregs; + my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; + my $alloc_size = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 ); + $code .= "sub \$$alloc_size, %rsp\n"; + $code .= ".seh_stackalloc $alloc_size\n"; + for my $i ( 0 .. $num_xmmregs - 1 ) { + my $reg_num = $xmmregs[$i]; + my $pos = 16 * $i; + $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n"; + $code .= ".seh_savexmm %xmm$reg_num, $pos\n"; + } + } + return $code; +} + +sub _end_func { + my $code = ""; + + # Restore any xmm registers that were saved earlier. + my $num_xmmregs = scalar @g_cur_func_saved_xmmregs; + if ( $win64 and $num_xmmregs > 0 ) { + my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0; + my $alloc_size = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 ); + for my $i ( 0 .. $num_xmmregs - 1 ) { + my $reg_num = $g_cur_func_saved_xmmregs[$i]; + my $pos = 16 * $i; + $code .= "movdqa $pos(%rsp), %xmm$reg_num\n"; + } + $code .= "add \$$alloc_size, %rsp\n"; + } + + # Restore any general purpose registers that were saved earlier. + for my $reg ( reverse @g_cur_func_saved_gpregs ) { + $code .= "pop $reg\n"; + if ( !$win64 ) { + $code .= ".cfi_pop $reg\n"; + } + } + + $code .= <<___; + ret + @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]} + .cfi_endproc + .size $g_cur_func_name, . 
- $g_cur_func_name +___ + return $code; +} + +my $code = <<___; +.section .rodata +.align 16 + + # A shuffle mask that reflects the bytes of 16-byte blocks +.Lbswap_mask: + .quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + # This is the GHASH reducing polynomial without its constant term, i.e. + # x^128 + x^7 + x^2 + x, represented using the backwards mapping + # between bits and polynomial coefficients. + # + # Alternatively, it can be interpreted as the naturally-ordered + # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the + # "reversed" GHASH reducing polynomial without its x^128 term. +.Lgfpoly: + .quad 1, 0xc200000000000000 + + # Same as above, but with the (1 << 64) bit set. +.Lgfpoly_and_internal_carrybit: + .quad 1, 0xc200000000000001 + +.align 32 + # The below constants are used for incrementing the counter blocks. +.Lctr_pattern: + .quad 0, 0 + .quad 1, 0 +.Linc_2blocks: + .quad 2, 0 + .quad 2, 0 + +.text +___ + +# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the +# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) +# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. +my $NUM_H_POWERS = 8; +my $OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16; +my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS; + +# Offset to 'rounds' in AES_KEY struct +my $OFFSETOF_AES_ROUNDS = 240; + +# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store +# the reduced products in \dst. Uses schoolbook multiplication. +sub _ghash_mul { + my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_; + return <<___; + vpclmulqdq \$0x00, $a, $b, $t0 # LO = a_L * b_L + vpclmulqdq \$0x01, $a, $b, $t1 # MI_0 = a_L * b_H + vpclmulqdq \$0x10, $a, $b, $t2 # MI_1 = a_H * b_L + vpxor $t2, $t1, $t1 # MI = MI_0 + MI_1 + vpclmulqdq \$0x01, $t0, $gfpoly, $t2 # LO_L*(x^63 + x^62 + x^57) + vpshufd \$0x4e, $t0, $t0 # Swap halves of LO + vpxor $t0, $t1, $t1 # Fold LO into MI (part 1) + vpxor $t2, $t1, $t1 # Fold LO into MI (part 2) + vpclmulqdq \$0x11, $a, $b, $dst # HI = a_H * b_H + vpclmulqdq \$0x01, $t1, $gfpoly, $t0 # MI_L*(x^63 + x^62 + x^57) + vpshufd \$0x4e, $t1, $t1 # Swap halves of MI + vpxor $t1, $dst, $dst # Fold MI into HI (part 1) + vpxor $t0, $dst, $dst # Fold MI into HI (part 2) +___ +} + +# void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]); +# +# Initialize |Htable| with powers of the GHASH subkey |H|. +# +# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the +# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication) +# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15]. +$code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1; +{ + my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ]; + my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); + my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); + my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); + my ( $H_CUR, $H_CUR_XMM ) = ( "%ymm3", "%xmm3" ); + my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" ); + my ( $H_INC, $H_INC_XMM ) = ( "%ymm5", "%xmm5" ); + my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" ); + + $code .= <<___; + @{[ _save_xmmregs (6) ]} + .seh_endprologue + + # Load the byte-reflected hash subkey. BoringSSL provides it in + # byte-reflected form except the two halves are in the wrong order. 
+ vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM + + # Finish preprocessing the byte-reflected hash subkey by multiplying it by + # x^-1 ("standard" interpretation of polynomial coefficients) or + # equivalently x^1 (natural interpretation). This gets the key into a + # format that avoids having to bit-reflect the data blocks later. + vpshufd \$0xd3, $H_CUR_XMM, $TMP0_XMM + vpsrad \$31, $TMP0_XMM, $TMP0_XMM + vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM + vpand .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM + vpxor $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM + + vbroadcasti128 .Lgfpoly(%rip), $GFPOLY + + # Square H^1 to get H^2. + @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM, + $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]} + + # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. + vinserti128 \$1, $H_CUR_XMM, $H_INC, $H_CUR + vinserti128 \$1, $H_INC_XMM, $H_INC, $H_INC + + # Compute H_CUR2 = [H^4, H^3]. + @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} + + # Store [H^2, H^1] and [H^4, H^3]. + vmovdqu $H_CUR, 3*32($HTABLE) + vmovdqu $H_CUR2, 2*32($HTABLE) + + # For Karatsuba multiplication: compute and store the two 64-bit halves of + # each key power XOR'd together. Order is 4,2,3,1. + vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 + vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 + vpxor $TMP1, $TMP0, $TMP0 + vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE) + + # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. + @{[ _ghash_mul $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} + @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]} + vmovdqu $H_CUR, 1*32($HTABLE) + vmovdqu $H_CUR2, 0*32($HTABLE) + + # Again, compute and store the two 64-bit halves of each key power XOR'd + # together. Order is 8,6,7,5. + vpunpcklqdq $H_CUR, $H_CUR2, $TMP0 + vpunpckhqdq $H_CUR, $H_CUR2, $TMP1 + vpxor $TMP1, $TMP0, $TMP0 + vmovdqu $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE) + + vzeroupper +___ +} +$code .= _end_func; + +# Do one step of the GHASH update of four vectors of data blocks. +# $i: the step to do, 0 through 9 +# $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) +# $htable: pointer to the Htable for the key +# $bswap_mask: mask for reflecting the bytes of blocks +# $h_pow[2-1]_xored: XOR'd key powers cached from Htable +# $tmp[0-2]: temporary registers. $tmp[1-2] must be preserved across steps. 
+# $lo, $mi: working state for this macro that must be preserved across steps +# $ghash_acc: the GHASH accumulator (input/output) +sub _ghash_step_4x { + my ( + $i, $ghashdata_ptr, $htable, $bswap_mask, + $h_pow2_xored, $h_pow1_xored, $tmp0, $tmp0_xmm, + $tmp1, $tmp2, $lo, $mi, + $ghash_acc, $ghash_acc_xmm + ) = @_; + my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm ); # alias + if ( $i == 0 ) { + return <<___; + # First vector + vmovdqu 0*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 + vmovdqu 0*32($htable), $tmp2 + vpxor $ghash_acc, $tmp1, $tmp1 + vpclmulqdq \$0x00, $tmp2, $tmp1, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $hi + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x00, $h_pow2_xored, $tmp0, $mi +___ + } + elsif ( $i == 1 ) { + return <<___; +___ + } + elsif ( $i == 2 ) { + return <<___; + # Second vector + vmovdqu 1*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 + vmovdqu 1*32($htable), $tmp2 + vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $lo, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $hi, $hi + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x10, $h_pow2_xored, $tmp0, $tmp0 + vpxor $tmp0, $mi, $mi +___ + } + elsif ( $i == 3 ) { + return <<___; + # Third vector + vmovdqu 2*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 + vmovdqu 2*32($htable), $tmp2 +___ + } + elsif ( $i == 4 ) { + return <<___; + vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $lo, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $hi, $hi +___ + } + elsif ( $i == 5 ) { + return <<___; + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x00, $h_pow1_xored, $tmp0, $tmp0 + vpxor $tmp0, $mi, $mi + + # Fourth vector + vmovdqu 3*32($ghashdata_ptr), $tmp1 + vpshufb $bswap_mask, $tmp1, $tmp1 +___ + } + elsif ( $i == 6 ) { + return <<___; + vmovdqu 3*32($htable), $tmp2 + vpclmulqdq \$0x00, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $lo, $lo + vpclmulqdq \$0x11, $tmp2, $tmp1, $tmp0 + vpxor $tmp0, $hi, $hi + vpunpckhqdq $tmp1, $tmp1, $tmp0 + vpxor $tmp1, $tmp0, $tmp0 + vpclmulqdq \$0x10, $h_pow1_xored, $tmp0, $tmp0 + vpxor $tmp0, $mi, $mi +___ + } + elsif ( $i == 7 ) { + return <<___; + # Finalize 'mi' following Karatsuba multiplication. + vpxor $lo, $mi, $mi + vpxor $hi, $mi, $mi + + # Fold lo into mi. + vbroadcasti128 .Lgfpoly(%rip), $tmp2 + vpclmulqdq \$0x01, $lo, $tmp2, $tmp0 + vpshufd \$0x4e, $lo, $lo + vpxor $lo, $mi, $mi + vpxor $tmp0, $mi, $mi +___ + } + elsif ( $i == 8 ) { + return <<___; + # Fold mi into hi. + vpclmulqdq \$0x01, $mi, $tmp2, $tmp0 + vpshufd \$0x4e, $mi, $mi + vpxor $mi, $hi, $hi + vpxor $tmp0, $hi, $hi +___ + } + elsif ( $i == 9 ) { + return <<___; + vextracti128 \$1, $hi, $tmp0_xmm + vpxor $tmp0_xmm, $hi_xmm, $ghash_acc_xmm +___ + } +} + +sub _ghash_4x { + my $code = ""; + for my $i ( 0 .. 9 ) { + $code .= _ghash_step_4x $i, @_; + } + return $code; +} + +# void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]); +$code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1; +{ + my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ]; + my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) = + map( "%xmm$_", ( 0 .. 
6 ) ); + + $code .= <<___; + @{[ _save_xmmregs (6) ]} + .seh_endprologue + + vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC + vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK + vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1 + vmovdqu .Lgfpoly(%rip), $GFPOLY + vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC + + @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]} + + vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC + vmovdqu $GHASH_ACC, ($GHASH_ACC_PTR) +___ +} +$code .= _end_func; + +# void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16], +# const uint8_t *in, size_t len); +# +# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given +# by |in| and |len|. |len| must be a multiple of 16. +# +# This function handles large amounts of AAD efficiently, while also keeping the +# overhead low for small amounts of AAD which is the common case. TLS uses less +# than one block of AAD, but (uncommonly) other use cases may use much more. +$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1; +{ + # Function arguments + my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ]; + + # Additional local variables + my ( $TMP0, $TMP0_XMM ) = ( "%ymm0", "%xmm0" ); + my ( $TMP1, $TMP1_XMM ) = ( "%ymm1", "%xmm1" ); + my ( $TMP2, $TMP2_XMM ) = ( "%ymm2", "%xmm2" ); + my ( $LO, $LO_XMM ) = ( "%ymm3", "%xmm3" ); + my ( $MI, $MI_XMM ) = ( "%ymm4", "%xmm4" ); + my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm5", "%xmm5" ); + my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" ); + my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm7", "%xmm7" ); + my $H_POW2_XORED = "%ymm8"; + my $H_POW1_XORED = "%ymm9"; + + $code .= <<___; + @{[ _save_xmmregs (6 .. 9) ]} + .seh_endprologue + + vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK + vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vbroadcasti128 .Lgfpoly(%rip), $GFPOLY + + # Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128. + cmp \$32, $AADLEN + jb .Lghash_lastblock + + cmp \$127, $AADLEN + jbe .Lghash_loop_1x + + # Update GHASH with 128 bytes of AAD at a time. + vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED + vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED +.Lghash_loop_4x: + @{[ _ghash_4x $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED, + $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC, + $GHASH_ACC_XMM ]} + sub \$-128, $AAD # 128 is 4 bytes, -128 is 1 byte + add \$-128, $AADLEN + cmp \$127, $AADLEN + ja .Lghash_loop_4x + + # Update GHASH with 32 bytes of AAD at a time. + cmp \$32, $AADLEN + jb .Lghash_loop_1x_done +.Lghash_loop_1x: + vmovdqu ($AAD), $TMP0 + vpshufb $BSWAP_MASK, $TMP0, $TMP0 + vpxor $TMP0, $GHASH_ACC, $GHASH_ACC + vmovdqu $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0 + @{[ _ghash_mul $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]} + vextracti128 \$1, $GHASH_ACC, $TMP0_XMM + vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + add \$32, $AAD + sub \$32, $AADLEN + cmp \$32, $AADLEN + jae .Lghash_loop_1x +.Lghash_loop_1x_done: + # Issue the vzeroupper that is needed after using ymm registers. Do it here + # instead of at the end, to minimize overhead for small AADLEN. + vzeroupper + + # Update GHASH with the remaining 16-byte block if any. 
+.Lghash_lastblock: + test $AADLEN, $AADLEN + jz .Lghash_done + vmovdqu ($AAD), $TMP0_XMM + vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM + vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM + @{[ _ghash_mul $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM, + $TMP1_XMM, $TMP2_XMM, $LO_XMM ]} + +.Lghash_done: + # Store the updated GHASH accumulator back to memory. + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) +___ +} +$code .= _end_func; + +sub _vaesenc_4x { + my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_; + return <<___; + vaesenc $round_key, $aesdata0, $aesdata0 + vaesenc $round_key, $aesdata1, $aesdata1 + vaesenc $round_key, $aesdata2, $aesdata2 + vaesenc $round_key, $aesdata3, $aesdata3 +___ +} + +sub _ctr_begin_4x { + my ( + $le_ctr, $bswap_mask, $rndkey0, $aesdata0, + $aesdata1, $aesdata2, $aesdata3, $tmp + ) = @_; + return <<___; + # Increment le_ctr four times to generate four vectors of little-endian + # counter blocks, swap each to big-endian, and store them in aesdata[0-3]. + vmovdqu .Linc_2blocks(%rip), $tmp + vpshufb $bswap_mask, $le_ctr, $aesdata0 + vpaddd $tmp, $le_ctr, $le_ctr + vpshufb $bswap_mask, $le_ctr, $aesdata1 + vpaddd $tmp, $le_ctr, $le_ctr + vpshufb $bswap_mask, $le_ctr, $aesdata2 + vpaddd $tmp, $le_ctr, $le_ctr + vpshufb $bswap_mask, $le_ctr, $aesdata3 + vpaddd $tmp, $le_ctr, $le_ctr + + # AES "round zero": XOR in the zero-th round key. + vpxor $rndkey0, $aesdata0, $aesdata0 + vpxor $rndkey0, $aesdata1, $aesdata1 + vpxor $rndkey0, $aesdata2, $aesdata2 + vpxor $rndkey0, $aesdata3, $aesdata3 +___ +} + +# Do the last AES round for four vectors of counter blocks, XOR four vectors of +# source data with the resulting keystream blocks, and write the result to the +# destination buffer. The implementation differs slightly as it takes advantage +# of the property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) to reduce +# latency, but it has the same effect. +sub _aesenclast_and_xor_4x { + my ( + $src, $dst, $rndkeylast, $aesdata0, + $aesdata1, $aesdata2, $aesdata3, $t0, + $t1, $t2, $t3 + ) = @_; + return <<___; + vpxor 0*32($src), $rndkeylast, $t0 + vpxor 1*32($src), $rndkeylast, $t1 + vpxor 2*32($src), $rndkeylast, $t2 + vpxor 3*32($src), $rndkeylast, $t3 + vaesenclast $t0, $aesdata0, $aesdata0 + vaesenclast $t1, $aesdata1, $aesdata1 + vaesenclast $t2, $aesdata2, $aesdata2 + vaesenclast $t3, $aesdata3, $aesdata3 + vmovdqu $aesdata0, 0*32($dst) + vmovdqu $aesdata1, 1*32($dst) + vmovdqu $aesdata2, 2*32($dst) + vmovdqu $aesdata3, 3*32($dst) +___ +} + +my $g_update_macro_expansion_count = 0; + +# void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out, +# size_t len, const AES_KEY *key, +# const uint8_t ivec[16], +# const u128 Htable[16], +# uint8_t Xi[16]); +# +# This macro generates a GCM encryption or decryption update function with the +# above prototype (with \enc selecting which one). The function computes the +# next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and +# writes the resulting encrypted or decrypted data to |out|. It also updates +# the GHASH accumulator |Xi| using the next |len| ciphertext bytes. +# +# |len| must be a multiple of 16. The caller must do any buffering needed to +# ensure this. Both in-place and out-of-place en/decryption are supported. +# +# |ivec| must give the current counter in big-endian format. 
This function +# loads the counter from |ivec| and increments the loaded counter as needed, but +# it does *not* store the updated counter back to |ivec|. The caller must +# update |ivec| if any more data segments follow. Internally, only the low +# 32-bit word of the counter is incremented, following the GCM standard. +sub _aes_gcm_update { + my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count; + my ($enc) = @_; + my $code = ""; + + # Function arguments + my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ) + = $win64 + ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" ) + : ( @argregs[ 0 .. 5 ], "%r12" ); + + # Additional local variables. + # %rax is used as a temporary register. BE_CTR_PTR is also available as a + # temporary register after the counter is loaded. + + # AES key length in bytes + my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" ); + + # Pointer to the last AES round key for the chosen AES variant + my $RNDKEYLAST_PTR = "%r11"; + + # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + # using vpshufb, copied to all 128-bit lanes. + my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" ); + + # GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + # only the lowest 128-bit lane can be nonzero. When not fully reduced, + # more than one lane may be used, and they need to be XOR'd together. + my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" ); + + # TMP[0-2] are temporary registers. + my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" ); + my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" ); + my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" ); + + # LO and MI are used to accumulate unreduced GHASH products. + my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" ); + my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" ); + + # Cached key powers from Htable + my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" ); + my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" ); + + # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. + my $RNDKEY0 = "%ymm9"; + my $RNDKEYLAST = "%ymm10"; + + # LE_CTR contains the next set of little-endian counter blocks. + my $LE_CTR = "%ymm11"; + + # AESDATA[0-3] hold the counter blocks that are being encrypted by AES. + my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" ); + my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" ); + my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" ); + my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" ); + my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 ); + + my @ghash_4x_args = ( + $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, + $H_POW1_XORED, $TMP0, $TMP0_XMM, $TMP1, + $TMP2, $LO, $MI, $GHASH_ACC, + $GHASH_ACC_XMM + ); + + if ($win64) { + $code .= <<___; + @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]} + mov 64(%rsp), $BE_CTR_PTR # arg5 + mov 72(%rsp), $HTABLE # arg6 + mov 80(%rsp), $GHASH_ACC_PTR # arg7 + @{[ _save_xmmregs (6 .. 15) ]} + .seh_endprologue +___ + } + else { + $code .= <<___; + @{[ _save_gpregs $GHASH_ACC_PTR ]} + mov 16(%rsp), $GHASH_ACC_PTR # arg7 +___ + } + + if ($enc) { + $code .= <<___; +#ifdef BORINGSSL_DISPATCH_TEST + .extern BORINGSSL_function_hit + movb \$1,BORINGSSL_function_hit+8(%rip) +#endif +___ + } + $code .= <<___; + vbroadcasti128 .Lbswap_mask(%rip), $BSWAP_MASK + + # Load the GHASH accumulator and the starting counter. + # BoringSSL passes these values in big endian format. 
+ vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM + vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vbroadcasti128 ($BE_CTR_PTR), $LE_CTR + vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR + + # Load the AES key length in bytes. BoringSSL stores number of rounds + # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20. + movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN + lea -20(,$AESKEYLEN,4), $AESKEYLEN + + # Make RNDKEYLAST_PTR point to the last AES round key. This is the + # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 + # respectively. Then load the zero-th and last round keys. + lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR + vbroadcasti128 ($AESKEY), $RNDKEY0 + vbroadcasti128 ($RNDKEYLAST_PTR), $RNDKEYLAST + + # Finish initializing LE_CTR by adding 1 to the second block. + vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR + + # If there are at least 128 bytes of data, then continue into the loop that + # processes 128 bytes of data at a time. Otherwise skip it. + cmp \$127, $DATALEN + jbe .Lcrypt_loop_4x_done$local_label_suffix + + vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED + vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED +___ + + # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. + + if ($enc) { + $code .= <<___; + # Encrypt the first 4 vectors of plaintext blocks. + @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} + lea 16($AESKEY), %rax +.Lvaesenc_loop_first_4_vecs$local_label_suffix: + vbroadcasti128 (%rax), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} + add \$16, %rax + cmp %rax, $RNDKEYLAST_PTR + jne .Lvaesenc_loop_first_4_vecs$local_label_suffix + @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, + $TMP0, $TMP1, $LO, $MI ]} + sub \$-128, $SRC # 128 is 4 bytes, -128 is 1 byte + add \$-128, $DATALEN + cmp \$127, $DATALEN + jbe .Lghash_last_ciphertext_4x$local_label_suffix +___ + } + + $code .= <<___; +.align 16 +.Lcrypt_loop_4x$local_label_suffix: + + # Start the AES encryption of the counter blocks. + @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]} + cmp \$24, $AESKEYLEN + jl .Laes128$local_label_suffix + je .Laes192$local_label_suffix + # AES-256 + vbroadcasti128 -13*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} + vbroadcasti128 -12*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} +.Laes192$local_label_suffix: + vbroadcasti128 -11*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} + vbroadcasti128 -10*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} +.Laes128$local_label_suffix: +___ + + # Finish the AES encryption of the counter blocks in AESDATA[0-3], + # interleaved with the GHASH update of the ciphertext blocks. + for my $i ( reverse 1 .. 9 ) { + $code .= <<___; + @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]} + vbroadcasti128 -$i*16($RNDKEYLAST_PTR), $TMP0 + @{[ _vaesenc_4x $TMP0, @AESDATA ]} +___ + } + $code .= <<___; + @{[ _ghash_step_4x 9, @ghash_4x_args ]} + + @{[ $enc ? "sub \$-128, $DST" : "" ]} # 128 is 4 bytes, -128 is 1 byte + @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA, + $TMP0, $TMP1, $LO, $MI ]} + sub \$-128, $SRC + @{[ !$enc ? "sub \$-128, $DST" : "" ]} + add \$-128, $DATALEN + cmp \$127, $DATALEN + ja .Lcrypt_loop_4x$local_label_suffix +___ + + if ($enc) { + + # Update GHASH with the last set of ciphertext blocks. 
+ $code .= <<___; +.Lghash_last_ciphertext_4x$local_label_suffix: + @{[ _ghash_4x @ghash_4x_args ]} + sub \$-128, $DST +___ + } + + my $POWERS_PTR = $BE_CTR_PTR; # BE_CTR_PTR is free to be reused. + my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM ); # reuse + + $code .= <<___; +.Lcrypt_loop_4x_done$local_label_suffix: + # Check whether any data remains. + test $DATALEN, $DATALEN + jz .Ldone$local_label_suffix + + # DATALEN is in [16, 32, 48, 64, 80, 96, 112]. + + # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N + # is the number of blocks that remain. + lea $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR + sub $DATALEN, $POWERS_PTR + + # Start collecting the unreduced GHASH intermediate value LO, MI, HI. + vpxor $LO_XMM, $LO_XMM, $LO_XMM + vpxor $MI_XMM, $MI_XMM, $MI_XMM + vpxor $HI_XMM, $HI_XMM, $HI_XMM + + cmp \$64, $DATALEN + jb .Llessthan64bytes$local_label_suffix + + # DATALEN is in [64, 80, 96, 112]. Encrypt two vectors of counter blocks. + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 + vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 + vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR + vpxor $RNDKEY0, $AESDATA0, $AESDATA0 + vpxor $RNDKEY0, $AESDATA1, $AESDATA1 + lea 16($AESKEY), %rax +.Lvaesenc_loop_tail_1$local_label_suffix: + vbroadcasti128 (%rax), $TMP0 + vaesenc $TMP0, $AESDATA0, $AESDATA0 + vaesenc $TMP0, $AESDATA1, $AESDATA1 + add \$16, %rax + cmp %rax, $RNDKEYLAST_PTR + jne .Lvaesenc_loop_tail_1$local_label_suffix + vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 + vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 + + # XOR the data with the two vectors of keystream blocks. + vmovdqu 0($SRC), $TMP0 + vmovdqu 32($SRC), $TMP1 + vpxor $TMP0, $AESDATA0, $AESDATA0 + vpxor $TMP1, $AESDATA1, $AESDATA1 + vmovdqu $AESDATA0, 0($DST) + vmovdqu $AESDATA1, 32($DST) + + # Update GHASH with two vectors of ciphertext blocks, without reducing. + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA1 : $TMP1 ]}, $AESDATA1 + vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 + vmovdqu ($POWERS_PTR), $TMP0 + vmovdqu 32($POWERS_PTR), $TMP1 + vpclmulqdq \$0x00, $TMP0, $AESDATA0, $LO + vpclmulqdq \$0x01, $TMP0, $AESDATA0, $MI + vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP0, $AESDATA0, $HI + vpclmulqdq \$0x00, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $LO, $LO + vpclmulqdq \$0x01, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x10, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP1, $AESDATA1, $TMP2 + vpxor $TMP2, $HI, $HI + + add \$64, $POWERS_PTR + add \$64, $SRC + add \$64, $DST + sub \$64, $DATALEN + jz .Lreduce$local_label_suffix + + vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + + # DATALEN is in [16, 32, 48]. Encrypt two last vectors of counter blocks. 
+.Llessthan64bytes$local_label_suffix: + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA0 + vpaddd .Linc_2blocks(%rip), $LE_CTR, $LE_CTR + vpshufb $BSWAP_MASK, $LE_CTR, $AESDATA1 + vpxor $RNDKEY0, $AESDATA0, $AESDATA0 + vpxor $RNDKEY0, $AESDATA1, $AESDATA1 + lea 16($AESKEY), %rax +.Lvaesenc_loop_tail_2$local_label_suffix: + vbroadcasti128 (%rax), $TMP0 + vaesenc $TMP0, $AESDATA0, $AESDATA0 + vaesenc $TMP0, $AESDATA1, $AESDATA1 + add \$16, %rax + cmp %rax, $RNDKEYLAST_PTR + jne .Lvaesenc_loop_tail_2$local_label_suffix + vaesenclast $RNDKEYLAST, $AESDATA0, $AESDATA0 + vaesenclast $RNDKEYLAST, $AESDATA1, $AESDATA1 + + # XOR the remaining data with the keystream blocks, and update GHASH with + # the remaining ciphertext blocks without reducing. + + cmp \$32, $DATALEN + jb .Lxor_one_block$local_label_suffix + je .Lxor_two_blocks$local_label_suffix + +.Lxor_three_blocks$local_label_suffix: + vmovdqu 0($SRC), $TMP0 + vmovdqu 32($SRC), $TMP1_XMM + vpxor $TMP0, $AESDATA0, $AESDATA0 + vpxor $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM + vmovdqu $AESDATA0, 0($DST) + vmovdqu $AESDATA1_XMM, 32($DST) + + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 + vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM + vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 + vmovdqu ($POWERS_PTR), $TMP0 + vmovdqu 32($POWERS_PTR), $TMP1_XMM + vpclmulqdq \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $LO, $LO + vpclmulqdq \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM + vpxor $TMP2, $HI, $HI + jmp .Lghash_mul_one_vec_unreduced$local_label_suffix + +.Lxor_two_blocks$local_label_suffix: + vmovdqu ($SRC), $TMP0 + vpxor $TMP0, $AESDATA0, $AESDATA0 + vmovdqu $AESDATA0, ($DST) + vpshufb $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0 + vpxor $GHASH_ACC, $AESDATA0, $AESDATA0 + vmovdqu ($POWERS_PTR), $TMP0 + jmp .Lghash_mul_one_vec_unreduced$local_label_suffix + +.Lxor_one_block$local_label_suffix: + vmovdqu ($SRC), $TMP0_XMM + vpxor $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM + vmovdqu $AESDATA0_XMM, ($DST) + vpshufb $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM + vpxor $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM + vmovdqu ($POWERS_PTR), $TMP0_XMM + +.Lghash_mul_one_vec_unreduced$local_label_suffix: + vpclmulqdq \$0x00, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $LO, $LO + vpclmulqdq \$0x01, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x10, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $MI, $MI + vpclmulqdq \$0x11, $TMP0, $AESDATA0, $TMP2 + vpxor $TMP2, $HI, $HI + +.Lreduce$local_label_suffix: + # Finally, do the GHASH reduction. + vbroadcasti128 .Lgfpoly(%rip), $TMP0 + vpclmulqdq \$0x01, $LO, $TMP0, $TMP1 + vpshufd \$0x4e, $LO, $LO + vpxor $LO, $MI, $MI + vpxor $TMP1, $MI, $MI + vpclmulqdq \$0x01, $MI, $TMP0, $TMP1 + vpshufd \$0x4e, $MI, $MI + vpxor $MI, $HI, $HI + vpxor $TMP1, $HI, $HI + vextracti128 \$1, $HI, $GHASH_ACC_XMM + vpxor $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + +.Ldone$local_label_suffix: + # Store the updated GHASH accumulator back to memory. 
+ vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM + vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR) + + vzeroupper +___ + return $code; +} + +$code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1; +$code .= _aes_gcm_update 1; +$code .= _end_func; + +$code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1; +$code .= _aes_gcm_update 0; +$code .= _end_func; + +print $code; +close STDOUT or die "error closing STDOUT: $!"; +exit 0; diff --git a/crypto/fipsmodule/modes/gcm.cc.inc b/crypto/fipsmodule/modes/gcm.cc.inc index d3c829a9dc..e77c52589a 100644 --- a/crypto/fipsmodule/modes/gcm.cc.inc +++ b/crypto/fipsmodule/modes/gcm.cc.inc @@ -99,6 +99,11 @@ static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len, uint8_t Xi[16], const u128 Htable[16], enum gcm_impl_t impl) { switch (impl) { + case gcm_x86_vaes_avx2: + len &= kSizeTWithoutLower4Bits; + aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi); + CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return len; case gcm_x86_vaes_avx10_256: len &= kSizeTWithoutLower4Bits; aes_gcm_enc_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi); @@ -119,6 +124,11 @@ static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len, uint8_t Xi[16], const u128 Htable[16], enum gcm_impl_t impl) { switch (impl) { + case gcm_x86_vaes_avx2: + len &= kSizeTWithoutLower4Bits; + aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi); + CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return len; case gcm_x86_vaes_avx10_256: len &= kSizeTWithoutLower4Bits; aes_gcm_dec_update_vaes_avx10_256(in, out, len, key, ivec, Htable, Xi); @@ -171,15 +181,21 @@ void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash, #if defined(GHASH_ASM_X86_64) if (crypto_gcm_clmul_enabled()) { - if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() && - CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_BMI2_capable()) { - gcm_init_vpclmulqdq_avx10(out_table, H); - *out_mult = gcm_gmult_vpclmulqdq_avx10; - if (CRYPTO_cpu_avoid_zmm_registers()) { - *out_hash = gcm_ghash_vpclmulqdq_avx10_256; - } else { - *out_hash = gcm_ghash_vpclmulqdq_avx10_512; + if (CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_AVX2_capable()) { + if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() && + CRYPTO_is_BMI2_capable()) { + gcm_init_vpclmulqdq_avx10(out_table, H); + *out_mult = gcm_gmult_vpclmulqdq_avx10; + if (CRYPTO_cpu_avoid_zmm_registers()) { + *out_hash = gcm_ghash_vpclmulqdq_avx10_256; + } else { + *out_hash = gcm_ghash_vpclmulqdq_avx10_512; + } + return; } + gcm_init_vpclmulqdq_avx2(out_table, H); + *out_mult = gcm_gmult_vpclmulqdq_avx2; + *out_hash = gcm_ghash_vpclmulqdq_avx2; return; } if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) { @@ -265,6 +281,9 @@ void CRYPTO_gcm128_init_aes_key(GCM128_KEY *gcm_key, const uint8_t *key, } else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx10_512 && CRYPTO_is_VAES_capable()) { gcm_key->impl = gcm_x86_vaes_avx10_512; + } else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx2 && + CRYPTO_is_VAES_capable()) { + gcm_key->impl = gcm_x86_vaes_avx2; } else if (gcm_key->ghash == gcm_ghash_avx && is_hwaes) { gcm_key->impl = gcm_x86_aesni; } diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc index fafde9c7f2..d195526269 100644 --- a/crypto/fipsmodule/modes/gcm_test.cc +++ b/crypto/fipsmodule/modes/gcm_test.cc @@ -81,6 +81,29 @@ TEST(GCMTest, ABI) { } } } + if (CRYPTO_is_VAES_capable() && 
CRYPTO_is_VPCLMULQDQ_capable() && + CRYPTO_is_AVX2_capable()) { + AES_KEY aes_key; + static const uint8_t kKey[16] = {0}; + uint8_t iv[16] = {0}; + + CHECK_ABI_SEH(gcm_init_vpclmulqdq_avx2, Htable, kH); + CHECK_ABI_SEH(gcm_gmult_vpclmulqdq_avx2, X, Htable); + for (size_t blocks : kBlockCounts) { + CHECK_ABI_SEH(gcm_ghash_vpclmulqdq_avx2, X, Htable, buf, 16 * blocks); + } + + aes_hw_set_encrypt_key(kKey, 128, &aes_key); + for (size_t blocks : kBlockCounts) { + CHECK_ABI_SEH(aes_gcm_enc_update_vaes_avx2, buf, buf, blocks * 16, + &aes_key, iv, Htable, X); + } + aes_hw_set_decrypt_key(kKey, 128, &aes_key); + for (size_t blocks : kBlockCounts) { + CHECK_ABI_SEH(aes_gcm_dec_update_vaes_avx2, buf, buf, blocks * 16, + &aes_key, iv, Htable, X); + } + } if (CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() && CRYPTO_is_BMI2_capable()) { diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h index a1f7bf5777..f041bf8edd 100644 --- a/crypto/fipsmodule/modes/internal.h +++ b/crypto/fipsmodule/modes/internal.h @@ -69,6 +69,7 @@ void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len, enum gcm_impl_t { gcm_separate = 0, // No combined AES-GCM, but may have AES-CTR and GHASH. gcm_x86_aesni, + gcm_x86_vaes_avx2, gcm_x86_vaes_avx10_256, gcm_x86_vaes_avx10_512, gcm_arm64_aes, @@ -200,6 +201,17 @@ size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len, const AES_KEY *key, uint8_t ivec[16], const u128 Htable[16], uint8_t Xi[16]); +void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]); +void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]); +void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16], + const uint8_t *in, size_t len); +void aes_gcm_enc_update_vaes_avx2(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, const uint8_t ivec[16], + const u128 Htable[16], uint8_t Xi[16]); +void aes_gcm_dec_update_vaes_avx2(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, const uint8_t ivec[16], + const u128 Htable[16], uint8_t Xi[16]); + void gcm_init_vpclmulqdq_avx10(u128 Htable[16], const uint64_t H[2]); void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]); void gcm_ghash_vpclmulqdq_avx10_256(uint8_t Xi[16], const u128 Htable[16], diff --git a/crypto/impl_dispatch_test.cc b/crypto/impl_dispatch_test.cc index 8c8d1d10d2..bfd004521b 100644 --- a/crypto/impl_dispatch_test.cc +++ b/crypto/impl_dispatch_test.cc @@ -37,8 +37,9 @@ class ImplDispatchTest : public ::testing::Test { avx_movbe_ = CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable(); ssse3_ = CRYPTO_is_SSSE3_capable(); vaes_ = CRYPTO_is_VAES_capable() && CRYPTO_is_VPCLMULQDQ_capable() && - CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() && - CRYPTO_is_BMI2_capable(); + CRYPTO_is_AVX2_capable(); + avx10_ = CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() && + CRYPTO_is_BMI2_capable(); avoid_zmm_ = CRYPTO_cpu_avoid_zmm_registers(); is_x86_64_ = #if defined(OPENSSL_X86_64) @@ -80,6 +81,7 @@ class ImplDispatchTest : public ::testing::Test { bool ssse3_ = false; bool is_x86_64_ = false; bool vaes_ = false; + bool avx10_ = false; bool avoid_zmm_ = false; #endif }; @@ -95,6 +97,7 @@ constexpr size_t kFlag_vpaes_encrypt = 4; constexpr size_t kFlag_vpaes_set_encrypt_key = 5; constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_256 = 6; constexpr size_t kFlag_aes_gcm_enc_update_vaes_avx10_512 = 7; +constexpr size_t 
kFlag_aes_gcm_enc_update_vaes_avx2 = 8; TEST_F(ImplDispatchTest, AEAD_AES_GCM) { AssertFunctionsHit( @@ -107,9 +110,10 @@ TEST_F(ImplDispatchTest, AEAD_AES_GCM) { {kFlag_vpaes_encrypt, ssse3_ && !aesni_}, {kFlag_vpaes_set_encrypt_key, ssse3_ && !aesni_}, {kFlag_aes_gcm_enc_update_vaes_avx10_256, - is_x86_64_ && vaes_ && avoid_zmm_}, + is_x86_64_ && vaes_ && avx10_ && avoid_zmm_}, {kFlag_aes_gcm_enc_update_vaes_avx10_512, - is_x86_64_ && vaes_ && !avoid_zmm_}, + is_x86_64_ && vaes_ && avx10_ && !avoid_zmm_}, + {kFlag_aes_gcm_enc_update_vaes_avx2, is_x86_64_ && vaes_ && !avx10_}, }, [] { const uint8_t kZeros[16] = {0}; diff --git a/crypto/internal.h b/crypto/internal.h index d50e755bae..62273c6b26 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -1410,7 +1410,8 @@ inline int CRYPTO_is_ARMv8_SHA512_capable(void) { // 5: vpaes_set_encrypt_key // 6: aes_gcm_enc_update_vaes_avx10_256 // 7: aes_gcm_enc_update_vaes_avx10_512 -extern uint8_t BORINGSSL_function_hit[8]; +// 8: aes_gcm_enc_update_vaes_avx2 +extern uint8_t BORINGSSL_function_hit[9]; #endif // BORINGSSL_DISPATCH_TEST // OPENSSL_vasprintf_internal is just like |vasprintf(3)|. If |system_malloc| is diff --git a/gen/bcm/aes-gcm-avx2-x86_64-apple.S b/gen/bcm/aes-gcm-avx2-x86_64-apple.S new file mode 100644 index 0000000000..e401e66042 --- /dev/null +++ b/gen/bcm/aes-gcm-avx2-x86_64-apple.S @@ -0,0 +1,1309 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.section __DATA,__const +.p2align 4 + + +L$bswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +L$gfpoly: +.quad 1, 0xc200000000000000 + + +L$gfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.p2align 5 + +L$ctr_pattern: +.quad 0, 0 +.quad 1, 0 +L$inc_2blocks: +.quad 2, 0 +.quad 2, 0 + +.text +.globl _gcm_init_vpclmulqdq_avx2 +.private_extern _gcm_init_vpclmulqdq_avx2 + +.p2align 5 +_gcm_init_vpclmulqdq_avx2: + + +_CET_ENDBR + + + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 L$gfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1 + vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + + vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq 
$0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + ret + + + +.globl _gcm_gmult_vpclmulqdq_avx2 +.private_extern _gcm_gmult_vpclmulqdq_avx2 + +.p2align 5 +_gcm_gmult_vpclmulqdq_avx2: + + +_CET_ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu L$bswap_mask(%rip),%xmm1 + vmovdqu 128-16(%rsi),%xmm2 + vmovdqu L$gfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpxor %xmm4,%xmm5,%xmm5 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpxor %xmm5,%xmm0,%xmm0 + vpxor %xmm4,%xmm0,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + ret + + + +.globl _gcm_ghash_vpclmulqdq_avx2 +.private_extern _gcm_ghash_vpclmulqdq_avx2 + +.p2align 5 +_gcm_ghash_vpclmulqdq_avx2: + + +_CET_ENDBR + + + + vbroadcasti128 L$bswap_mask(%rip),%ymm6 + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vbroadcasti128 L$gfpoly(%rip),%ymm7 + + + cmpq $32,%rcx + jb L$ghash_lastblock + + cmpq $127,%rcx + jbe L$ghash_loop_1x + + + vmovdqu 128(%rsi),%ymm8 + vmovdqu 128+32(%rsi),%ymm9 +L$ghash_loop_4x: + + vmovdqu 0(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 0(%rsi),%ymm2 + vpxor %ymm5,%ymm1,%ymm1 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4 + + vmovdqu 32(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 32(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu 64(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + + vmovdqu 96(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 96(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm5,%ymm4,%ymm4 + + + vbroadcasti128 L$gfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0 + vpshufd $0x4e,%ymm3,%ymm3 + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vpclmulqdq 
$0x01,%ymm4,%ymm2,%ymm0 + vpshufd $0x4e,%ymm4,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm0,%ymm5,%ymm5 + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + + subq $-128,%rdx + addq $-128,%rcx + cmpq $127,%rcx + ja L$ghash_loop_4x + + + cmpq $32,%rcx + jb L$ghash_loop_1x_done +L$ghash_loop_1x: + vmovdqu (%rdx),%ymm0 + vpshufb %ymm6,%ymm0,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vmovdqu 128-32(%rsi),%ymm0 + vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm2,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1 + vpshufd $0x4e,%ymm2,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm1,%ymm5,%ymm5 + + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + addq $32,%rdx + subq $32,%rcx + cmpq $32,%rcx + jae L$ghash_loop_1x +L$ghash_loop_1x_done: + + + vzeroupper + + +L$ghash_lastblock: + testq %rcx,%rcx + jz L$ghash_done + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +L$ghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + ret + + + +.globl _aes_gcm_enc_update_vaes_avx2 +.private_extern _aes_gcm_enc_update_vaes_avx2 + +.p2align 5 +_aes_gcm_enc_update_vaes_avx2: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+8(%rip) +#endif + vbroadcasti128 L$bswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe L$crypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe L$ghash_last_ciphertext_4x__func1 +.p2align 4 +L$crypt_loop_4x__func1: + + + 
+ + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl L$aes128__func1 + je L$aes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +L$aes192__func1: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +L$aes128__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 
L$gfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja L$crypt_loop_4x__func1 +L$ghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +L$crypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz L$done__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb L$lessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_1__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + 
vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz L$reduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +L$lessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_2__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb L$xor_one_block__func1 + je L$xor_two_blocks__func1 + +L$xor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp L$ghash_mul_one_vec_unreduced__func1 + +L$xor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp L$ghash_mul_one_vec_unreduced__func1 + +L$xor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +L$ghash_mul_one_vec_unreduced__func1: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +L$reduce__func1: + + vbroadcasti128 L$gfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +L$done__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 + + ret + + + +.globl _aes_gcm_dec_update_vaes_avx2 +.private_extern _aes_gcm_dec_update_vaes_avx2 + +.p2align 5 +_aes_gcm_dec_update_vaes_avx2: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 + vbroadcasti128 L$bswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d 
+ + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe L$crypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.p2align 4 +L$crypt_loop_4x__func2: + + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl L$aes128__func2 + je L$aes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +L$aes192__func2: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +L$aes128__func2: + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + 
vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja L$crypt_loop_4x__func2 +L$crypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz L$done__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb L$lessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_1__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz L$reduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +L$lessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_2__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb L$xor_one_block__func2 + je L$xor_two_blocks__func2 + +L$xor_three_blocks__func2: + vmovdqu 
0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp L$ghash_mul_one_vec_unreduced__func2 + +L$xor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp L$ghash_mul_one_vec_unreduced__func2 + +L$xor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +L$ghash_mul_one_vec_unreduced__func2: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +L$reduce__func2: + + vbroadcasti128 L$gfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +L$done__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 + + ret + + + +#endif diff --git a/gen/bcm/aes-gcm-avx2-x86_64-linux.S b/gen/bcm/aes-gcm-avx2-x86_64-linux.S new file mode 100644 index 0000000000..b7816cfc5a --- /dev/null +++ b/gen/bcm/aes-gcm-avx2-x86_64-linux.S @@ -0,0 +1,1314 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata +.align 16 + + +.Lbswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +.Lgfpoly: +.quad 1, 0xc200000000000000 + + +.Lgfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.align 32 + +.Lctr_pattern: +.quad 0, 0 +.quad 1, 0 +.Linc_2blocks: +.quad 2, 0 +.quad 2, 0 + +.text +.globl gcm_init_vpclmulqdq_avx2 +.hidden gcm_init_vpclmulqdq_avx2 +.type gcm_init_vpclmulqdq_avx2,@function +.align 32 +gcm_init_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 .Lgfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1 + vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + + vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_init_vpclmulqdq_avx2, . 
- gcm_init_vpclmulqdq_avx2 +.globl gcm_gmult_vpclmulqdq_avx2 +.hidden gcm_gmult_vpclmulqdq_avx2 +.type gcm_gmult_vpclmulqdq_avx2,@function +.align 32 +gcm_gmult_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu .Lbswap_mask(%rip),%xmm1 + vmovdqu 128-16(%rsi),%xmm2 + vmovdqu .Lgfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpxor %xmm4,%xmm5,%xmm5 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpxor %xmm5,%xmm0,%xmm0 + vpxor %xmm4,%xmm0,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + ret + +.cfi_endproc +.size gcm_gmult_vpclmulqdq_avx2, . - gcm_gmult_vpclmulqdq_avx2 +.globl gcm_ghash_vpclmulqdq_avx2 +.hidden gcm_ghash_vpclmulqdq_avx2 +.type gcm_ghash_vpclmulqdq_avx2,@function +.align 32 +gcm_ghash_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + vbroadcasti128 .Lbswap_mask(%rip),%ymm6 + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vbroadcasti128 .Lgfpoly(%rip),%ymm7 + + + cmpq $32,%rcx + jb .Lghash_lastblock + + cmpq $127,%rcx + jbe .Lghash_loop_1x + + + vmovdqu 128(%rsi),%ymm8 + vmovdqu 128+32(%rsi),%ymm9 +.Lghash_loop_4x: + + vmovdqu 0(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 0(%rsi),%ymm2 + vpxor %ymm5,%ymm1,%ymm1 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4 + + vmovdqu 32(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 32(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu 64(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + + vmovdqu 96(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 96(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm5,%ymm4,%ymm4 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0 + vpshufd $0x4e,%ymm3,%ymm3 + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0 + vpshufd $0x4e,%ymm4,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm0,%ymm5,%ymm5 + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + + subq $-128,%rdx + addq $-128,%rcx + cmpq $127,%rcx + ja .Lghash_loop_4x + + + cmpq $32,%rcx + jb .Lghash_loop_1x_done +.Lghash_loop_1x: + vmovdqu (%rdx),%ymm0 + vpshufb %ymm6,%ymm0,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vmovdqu 128-32(%rsi),%ymm0 + vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm2,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq 
$0x11,%ymm0,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1 + vpshufd $0x4e,%ymm2,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm1,%ymm5,%ymm5 + + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + addq $32,%rdx + subq $32,%rcx + cmpq $32,%rcx + jae .Lghash_loop_1x +.Lghash_loop_1x_done: + + + vzeroupper + + +.Lghash_lastblock: + testq %rcx,%rcx + jz .Lghash_done + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +.Lghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + ret + +.cfi_endproc +.size gcm_ghash_vpclmulqdq_avx2, . - gcm_ghash_vpclmulqdq_avx2 +.globl aes_gcm_enc_update_vaes_avx2 +.hidden aes_gcm_enc_update_vaes_avx2 +.type aes_gcm_enc_update_vaes_avx2,@function +.align 32 +aes_gcm_enc_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+8(%rip) +#endif + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +.Lvaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe .Lghash_last_ciphertext_4x__func1 +.align 16 +.Lcrypt_loop_4x__func1: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func1 + je 
.Laes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func1: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 
-16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func1 +.Lghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +.Lcrypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz .Ldone__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq 
$0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func1 + je .Lxor_two_blocks__func1 + +.Lxor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func1: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func1: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_enc_update_vaes_avx2, . 
- aes_gcm_enc_update_vaes_avx2 +.globl aes_gcm_dec_update_vaes_avx2 +.hidden aes_gcm_dec_update_vaes_avx2 +.type aes_gcm_dec_update_vaes_avx2,@function +.align 32 +aes_gcm_dec_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.align 16 +.Lcrypt_loop_4x__func2: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func2 + je .Laes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func2: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func2: + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq 
%ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func2 +.Lcrypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz .Ldone__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + 
vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func2 + je .Lxor_two_blocks__func2 + +.Lxor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func2: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func2: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_dec_update_vaes_avx2, . - aes_gcm_dec_update_vaes_avx2 +#endif diff --git a/gen/bcm/aes-gcm-avx2-x86_64-win.asm b/gen/bcm/aes-gcm-avx2-x86_64-win.asm new file mode 100644 index 0000000000..92015534ba --- /dev/null +++ b/gen/bcm/aes-gcm-avx2-x86_64-win.asm @@ -0,0 +1,1588 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +section .rdata rdata align=8 +ALIGN 16 + + +$L$bswap_mask: + DQ 0x08090a0b0c0d0e0f,0x0001020304050607 + + + + + + + + +$L$gfpoly: + DQ 1,0xc200000000000000 + + +$L$gfpoly_and_internal_carrybit: + DQ 1,0xc200000000000001 + +ALIGN 32 + +$L$ctr_pattern: + DQ 0,0 + DQ 1,0 +$L$inc_2blocks: + DQ 2,0 + DQ 2,0 + +section .text code align=64 + +global gcm_init_vpclmulqdq_avx2 + +ALIGN 32 +gcm_init_vpclmulqdq_avx2: + +$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1: +_CET_ENDBR + sub rsp,24 +$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3: + +$L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4: + + + + vpshufd xmm3,XMMWORD[rdx],0x4e + + + + + + vpshufd xmm0,xmm3,0xd3 + vpsrad xmm0,xmm0,31 + vpaddq xmm3,xmm3,xmm3 + vpand xmm0,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit] + vpxor xmm3,xmm3,xmm0 + + vbroadcasti128 ymm6,XMMWORD[$L$gfpoly] + + + vpclmulqdq xmm0,xmm3,xmm3,0x00 + vpclmulqdq xmm1,xmm3,xmm3,0x01 + vpclmulqdq xmm2,xmm3,xmm3,0x10 + vpxor xmm1,xmm1,xmm2 + vpclmulqdq xmm2,xmm6,xmm0,0x01 + vpshufd xmm0,xmm0,0x4e + vpxor xmm1,xmm1,xmm0 + vpxor xmm1,xmm1,xmm2 + vpclmulqdq xmm5,xmm3,xmm3,0x11 + vpclmulqdq xmm0,xmm6,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpxor xmm5,xmm5,xmm1 + vpxor xmm5,xmm5,xmm0 + + + + vinserti128 ymm3,ymm5,xmm3,1 + vinserti128 ymm5,ymm5,xmm5,1 + + + vpclmulqdq ymm0,ymm3,ymm5,0x00 + vpclmulqdq ymm1,ymm3,ymm5,0x01 + vpclmulqdq ymm2,ymm3,ymm5,0x10 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm2,ymm6,ymm0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm4,ymm3,ymm5,0x11 + vpclmulqdq ymm0,ymm6,ymm1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm4,ymm4,ymm1 + vpxor ymm4,ymm4,ymm0 + + + + vmovdqu YMMWORD[96+rcx],ymm3 + vmovdqu YMMWORD[64+rcx],ymm4 + + + + vpunpcklqdq ymm0,ymm4,ymm3 + vpunpckhqdq ymm1,ymm4,ymm3 + vpxor ymm0,ymm0,ymm1 + vmovdqu YMMWORD[(128+32)+rcx],ymm0 + + + vpclmulqdq ymm0,ymm4,ymm5,0x00 + vpclmulqdq ymm1,ymm4,ymm5,0x01 + vpclmulqdq ymm2,ymm4,ymm5,0x10 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm2,ymm6,ymm0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm3,ymm4,ymm5,0x11 + vpclmulqdq ymm0,ymm6,ymm1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm3,ymm3,ymm1 + vpxor ymm3,ymm3,ymm0 + + vpclmulqdq ymm0,ymm3,ymm5,0x00 + vpclmulqdq ymm1,ymm3,ymm5,0x01 + vpclmulqdq ymm2,ymm3,ymm5,0x10 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm2,ymm6,ymm0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm4,ymm3,ymm5,0x11 + vpclmulqdq ymm0,ymm6,ymm1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm4,ymm4,ymm1 + vpxor ymm4,ymm4,ymm0 + + vmovdqu YMMWORD[32+rcx],ymm3 + vmovdqu YMMWORD[rcx],ymm4 + + + + vpunpcklqdq ymm0,ymm4,ymm3 + vpunpckhqdq ymm1,ymm4,ymm3 + vpxor ymm0,ymm0,ymm1 + vmovdqu YMMWORD[128+rcx],ymm0 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + add rsp,24 + ret +$L$SEH_end_gcm_init_vpclmulqdq_avx2_5: + + +global gcm_gmult_vpclmulqdq_avx2 + +ALIGN 32 +gcm_gmult_vpclmulqdq_avx2: + +$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1: +_CET_ENDBR + sub rsp,24 +$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_2: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_3: + +$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx2_4: + + vmovdqu xmm0,XMMWORD[rcx] + vmovdqu xmm1,XMMWORD[$L$bswap_mask] + vmovdqu xmm2,XMMWORD[((128-16))+rdx] + vmovdqu 
xmm3,XMMWORD[$L$gfpoly] + vpshufb xmm0,xmm0,xmm1 + + vpclmulqdq xmm4,xmm0,xmm2,0x00 + vpclmulqdq xmm5,xmm0,xmm2,0x01 + vpclmulqdq xmm6,xmm0,xmm2,0x10 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm3,xmm4,0x01 + vpshufd xmm4,xmm4,0x4e + vpxor xmm5,xmm5,xmm4 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm0,xmm0,xmm2,0x11 + vpclmulqdq xmm4,xmm3,xmm5,0x01 + vpshufd xmm5,xmm5,0x4e + vpxor xmm0,xmm0,xmm5 + vpxor xmm0,xmm0,xmm4 + + + vpshufb xmm0,xmm0,xmm1 + vmovdqu XMMWORD[rcx],xmm0 + movdqa xmm6,XMMWORD[rsp] + add rsp,24 + ret +$L$SEH_end_gcm_gmult_vpclmulqdq_avx2_5: + + +global gcm_ghash_vpclmulqdq_avx2 + +ALIGN 32 +gcm_ghash_vpclmulqdq_avx2: + +$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1: +_CET_ENDBR + sub rsp,72 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_2: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_3: + movdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_4: + movdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_5: + movdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_6: + +$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7: + + vbroadcasti128 ymm6,XMMWORD[$L$bswap_mask] + vmovdqu xmm5,XMMWORD[rcx] + vpshufb xmm5,xmm5,xmm6 + vbroadcasti128 ymm7,XMMWORD[$L$gfpoly] + + + cmp r9,32 + jb NEAR $L$ghash_lastblock + + cmp r9,127 + jbe NEAR $L$ghash_loop_1x + + + vmovdqu ymm8,YMMWORD[128+rdx] + vmovdqu ymm9,YMMWORD[((128+32))+rdx] +$L$ghash_loop_4x: + + vmovdqu ymm1,YMMWORD[r8] + vpshufb ymm1,ymm1,ymm6 + vmovdqu ymm2,YMMWORD[rdx] + vpxor ymm1,ymm1,ymm5 + vpclmulqdq ymm3,ymm1,ymm2,0x00 + vpclmulqdq ymm5,ymm1,ymm2,0x11 + vpunpckhqdq ymm0,ymm1,ymm1 + vpxor ymm0,ymm0,ymm1 + vpclmulqdq ymm4,ymm0,ymm8,0x00 + + vmovdqu ymm1,YMMWORD[32+r8] + vpshufb ymm1,ymm1,ymm6 + vmovdqu ymm2,YMMWORD[32+rdx] + vpclmulqdq ymm0,ymm1,ymm2,0x00 + vpxor ymm3,ymm3,ymm0 + vpclmulqdq ymm0,ymm1,ymm2,0x11 + vpxor ymm5,ymm5,ymm0 + vpunpckhqdq ymm0,ymm1,ymm1 + vpxor ymm0,ymm0,ymm1 + vpclmulqdq ymm0,ymm0,ymm8,0x10 + vpxor ymm4,ymm4,ymm0 + + vmovdqu ymm1,YMMWORD[64+r8] + vpshufb ymm1,ymm1,ymm6 + vmovdqu ymm2,YMMWORD[64+rdx] + vpclmulqdq ymm0,ymm1,ymm2,0x00 + vpxor ymm3,ymm3,ymm0 + vpclmulqdq ymm0,ymm1,ymm2,0x11 + vpxor ymm5,ymm5,ymm0 + vpunpckhqdq ymm0,ymm1,ymm1 + vpxor ymm0,ymm0,ymm1 + vpclmulqdq ymm0,ymm0,ymm9,0x00 + vpxor ymm4,ymm4,ymm0 + + + vmovdqu ymm1,YMMWORD[96+r8] + vpshufb ymm1,ymm1,ymm6 + vmovdqu ymm2,YMMWORD[96+rdx] + vpclmulqdq ymm0,ymm1,ymm2,0x00 + vpxor ymm3,ymm3,ymm0 + vpclmulqdq ymm0,ymm1,ymm2,0x11 + vpxor ymm5,ymm5,ymm0 + vpunpckhqdq ymm0,ymm1,ymm1 + vpxor ymm0,ymm0,ymm1 + vpclmulqdq ymm0,ymm0,ymm9,0x10 + vpxor ymm4,ymm4,ymm0 + + vpxor ymm4,ymm4,ymm3 + vpxor ymm4,ymm4,ymm5 + + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + vpclmulqdq ymm0,ymm2,ymm3,0x01 + vpshufd ymm3,ymm3,0x4e + vpxor ymm4,ymm4,ymm3 + vpxor ymm4,ymm4,ymm0 + + vpclmulqdq ymm0,ymm2,ymm4,0x01 + vpshufd ymm4,ymm4,0x4e + vpxor ymm5,ymm5,ymm4 + vpxor ymm5,ymm5,ymm0 + vextracti128 xmm0,ymm5,1 + vpxor xmm5,xmm5,xmm0 + + sub r8,-128 + add r9,-128 + cmp r9,127 + ja NEAR $L$ghash_loop_4x + + + cmp r9,32 + jb NEAR $L$ghash_loop_1x_done +$L$ghash_loop_1x: + vmovdqu ymm0,YMMWORD[r8] + vpshufb ymm0,ymm0,ymm6 + vpxor ymm5,ymm5,ymm0 + vmovdqu ymm0,YMMWORD[((128-32))+rdx] + vpclmulqdq ymm1,ymm5,ymm0,0x00 + vpclmulqdq ymm2,ymm5,ymm0,0x01 + vpclmulqdq ymm3,ymm5,ymm0,0x10 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm3,ymm7,ymm1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm2,ymm2,ymm1 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm5,ymm5,ymm0,0x11 + vpclmulqdq ymm1,ymm7,ymm2,0x01 + vpshufd ymm2,ymm2,0x4e + vpxor ymm5,ymm5,ymm2 + 
vpxor ymm5,ymm5,ymm1 + + vextracti128 xmm0,ymm5,1 + vpxor xmm5,xmm5,xmm0 + add r8,32 + sub r9,32 + cmp r9,32 + jae NEAR $L$ghash_loop_1x +$L$ghash_loop_1x_done: + + + vzeroupper + + +$L$ghash_lastblock: + test r9,r9 + jz NEAR $L$ghash_done + vmovdqu xmm0,XMMWORD[r8] + vpshufb xmm0,xmm0,xmm6 + vpxor xmm5,xmm5,xmm0 + vmovdqu xmm0,XMMWORD[((128-16))+rdx] + vpclmulqdq xmm1,xmm5,xmm0,0x00 + vpclmulqdq xmm2,xmm5,xmm0,0x01 + vpclmulqdq xmm3,xmm5,xmm0,0x10 + vpxor xmm2,xmm2,xmm3 + vpclmulqdq xmm3,xmm7,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpxor xmm2,xmm2,xmm1 + vpxor xmm2,xmm2,xmm3 + vpclmulqdq xmm5,xmm5,xmm0,0x11 + vpclmulqdq xmm1,xmm7,xmm2,0x01 + vpshufd xmm2,xmm2,0x4e + vpxor xmm5,xmm5,xmm2 + vpxor xmm5,xmm5,xmm1 + + +$L$ghash_done: + + vpshufb xmm5,xmm5,xmm6 + vmovdqu XMMWORD[rcx],xmm5 + movdqa xmm6,XMMWORD[rsp] + movdqa xmm7,XMMWORD[16+rsp] + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + add rsp,72 + ret +$L$SEH_end_gcm_ghash_vpclmulqdq_avx2_8: + + +global aes_gcm_enc_update_vaes_avx2 + +ALIGN 32 +aes_gcm_enc_update_vaes_avx2: + +$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2: + push rdi +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3: + push r12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4: + + mov rsi,QWORD[64+rsp] + mov rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6: + movdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7: + movdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8: + movdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9: + movdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10: + movdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11: + movdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12: + movdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13: + movdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14: + movdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15: + +$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16: +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+8))],1 +%endif + vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask] + + + + vmovdqu xmm1,XMMWORD[r12] + vpshufb xmm1,xmm1,xmm0 + vbroadcasti128 ymm11,XMMWORD[rsi] + vpshufb ymm11,ymm11,ymm0 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti128 ymm9,XMMWORD[r9] + vbroadcasti128 ymm10,XMMWORD[r11] + + + vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] + + + + cmp r8,127 + jbe NEAR $L$crypt_loop_4x_done__func1 + + vmovdqu ymm7,YMMWORD[128+rdi] + vmovdqu ymm8,YMMWORD[((128+32))+rdi] + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + lea rax,[16+r9] +$L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_first_4_vecs__func1 + vpxor 
ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + vaesenclast ymm12,ymm12,ymm2 + vaesenclast ymm13,ymm13,ymm3 + vaesenclast ymm14,ymm14,ymm5 + vaesenclast ymm15,ymm15,ymm6 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + add r8,-128 + cmp r8,127 + jbe NEAR $L$ghash_last_ciphertext_4x__func1 +ALIGN 16 +$L$crypt_loop_4x__func1: + + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + cmp r10d,24 + jl NEAR $L$aes128__func1 + je NEAR $L$aes192__func1 + + vbroadcasti128 ymm2,XMMWORD[((-208))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-192))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + +$L$aes192__func1: + vbroadcasti128 ymm2,XMMWORD[((-176))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-160))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + +$L$aes128__func1: + + vmovdqu ymm3,YMMWORD[rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + vpclmulqdq ymm5,ymm3,ymm4,0x00 + vpclmulqdq ymm1,ymm3,ymm4,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm6,ymm2,ymm7,0x00 + + vbroadcasti128 ymm2,XMMWORD[((-144))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vbroadcasti128 ymm2,XMMWORD[((-128))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vmovdqu ymm3,YMMWORD[32+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm7,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-112))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vmovdqu ymm3,YMMWORD[64+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + + vbroadcasti128 ymm2,XMMWORD[((-96))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-80))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rdx] + vpshufb ymm3,ymm3,ymm0 + + vbroadcasti128 ymm2,XMMWORD[((-64))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vmovdqu ymm4,YMMWORD[96+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + 
vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-48))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + vpclmulqdq ymm2,ymm4,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-32))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vpclmulqdq ymm2,ymm4,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-16))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + + sub rdx,-128 + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + vaesenclast ymm12,ymm12,ymm2 + vaesenclast ymm13,ymm13,ymm3 + vaesenclast ymm14,ymm14,ymm5 + vaesenclast ymm15,ymm15,ymm6 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + + add r8,-128 + cmp r8,127 + ja NEAR $L$crypt_loop_4x__func1 +$L$ghash_last_ciphertext_4x__func1: + + vmovdqu ymm3,YMMWORD[rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + vpclmulqdq ymm5,ymm3,ymm4,0x00 + vpclmulqdq ymm1,ymm3,ymm4,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm6,ymm2,ymm7,0x00 + + vmovdqu ymm3,YMMWORD[32+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm7,0x10 + vpxor ymm6,ymm6,ymm2 + + vmovdqu ymm3,YMMWORD[64+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[96+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x10 + vpxor ymm6,ymm6,ymm2 + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + vpclmulqdq ymm2,ymm4,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vpclmulqdq ymm2,ymm4,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + sub rdx,-128 +$L$crypt_loop_4x_done__func1: + + test r8,r8 + jz NEAR $L$done__func1 + + + + + + lea rsi,[128+rdi] + sub rsi,r8 + + + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + vpxor xmm7,xmm7,xmm7 + + cmp r8,64 + jb NEAR $L$lessthan64bytes__func1 + + + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_1__func1: + 
vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_1__func1 + vaesenclast ymm12,ymm12,ymm10 + vaesenclast ymm13,ymm13,ymm10 + + + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu ymm3,YMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor ymm13,ymm13,ymm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + + + vpshufb ymm12,ymm12,ymm0 + vpshufb ymm13,ymm13,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu ymm3,YMMWORD[32+rsi] + vpclmulqdq ymm5,ymm12,ymm2,0x00 + vpclmulqdq ymm6,ymm12,ymm2,0x01 + vpclmulqdq ymm4,ymm12,ymm2,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm7,ymm12,ymm2,0x11 + vpclmulqdq ymm4,ymm13,ymm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x11 + vpxor ymm7,ymm7,ymm4 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + jz NEAR $L$reduce__func1 + + vpxor xmm1,xmm1,xmm1 + + +$L$lessthan64bytes__func1: + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_2__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_2__func1 + vaesenclast ymm12,ymm12,ymm10 + vaesenclast ymm13,ymm13,ymm10 + + + + + cmp r8,32 + jb NEAR $L$xor_one_block__func1 + je NEAR $L$xor_two_blocks__func1 + +$L$xor_three_blocks__func1: + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu xmm3,XMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor xmm13,xmm13,xmm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu XMMWORD[32+rdx],xmm13 + + vpshufb ymm12,ymm12,ymm0 + vpshufb xmm13,xmm13,xmm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu xmm3,XMMWORD[32+rsi] + vpclmulqdq xmm4,xmm13,xmm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x11 + vpxor ymm7,ymm7,ymm4 + jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 + +$L$xor_two_blocks__func1: + vmovdqu ymm2,YMMWORD[rcx] + vpxor ymm12,ymm12,ymm2 + vmovdqu YMMWORD[rdx],ymm12 + vpshufb ymm12,ymm12,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 + +$L$xor_one_block__func1: + vmovdqu xmm2,XMMWORD[rcx] + vpxor xmm12,xmm12,xmm2 + vmovdqu XMMWORD[rdx],xmm12 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm12,xmm12,xmm1 + vmovdqu xmm2,XMMWORD[rsi] + +$L$ghash_mul_one_vec_unreduced__func1: + vpclmulqdq ymm4,ymm12,ymm2,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x11 + vpxor ymm7,ymm7,ymm4 + +$L$reduce__func1: + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + vpclmulqdq ymm3,ymm2,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm3 + vpclmulqdq ymm3,ymm2,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm7,ymm7,ymm6 + vpxor ymm7,ymm7,ymm3 + vextracti128 xmm1,ymm7,1 + vpxor xmm1,xmm1,xmm7 + +$L$done__func1: + + vpshufb xmm1,xmm1,xmm0 + vmovdqu XMMWORD[r12],xmm1 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + movdqa xmm7,XMMWORD[16+rsp] + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + movdqa xmm10,XMMWORD[64+rsp] + movdqa xmm11,XMMWORD[80+rsp] + movdqa xmm12,XMMWORD[96+rsp] + movdqa xmm13,XMMWORD[112+rsp] + movdqa 
xmm14,XMMWORD[128+rsp] + movdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + pop rdi + pop rsi + ret +$L$SEH_end_aes_gcm_enc_update_vaes_avx2_17: + + +global aes_gcm_dec_update_vaes_avx2 + +ALIGN 32 +aes_gcm_dec_update_vaes_avx2: + +$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2: + push rdi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3: + push r12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4: + + mov rsi,QWORD[64+rsp] + mov rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6: + movdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7: + movdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8: + movdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9: + movdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10: + movdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11: + movdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12: + movdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13: + movdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14: + movdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15: + +$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16: + vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask] + + + + vmovdqu xmm1,XMMWORD[r12] + vpshufb xmm1,xmm1,xmm0 + vbroadcasti128 ymm11,XMMWORD[rsi] + vpshufb ymm11,ymm11,ymm0 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti128 ymm9,XMMWORD[r9] + vbroadcasti128 ymm10,XMMWORD[r11] + + + vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] + + + + cmp r8,127 + jbe NEAR $L$crypt_loop_4x_done__func2 + + vmovdqu ymm7,YMMWORD[128+rdi] + vmovdqu ymm8,YMMWORD[((128+32))+rdi] +ALIGN 16 +$L$crypt_loop_4x__func2: + + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + cmp r10d,24 + jl NEAR $L$aes128__func2 + je NEAR $L$aes192__func2 + + vbroadcasti128 ymm2,XMMWORD[((-208))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-192))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + +$L$aes192__func2: + vbroadcasti128 ymm2,XMMWORD[((-176))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-160))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + +$L$aes128__func2: + + vmovdqu ymm3,YMMWORD[rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + vpclmulqdq ymm5,ymm3,ymm4,0x00 + vpclmulqdq ymm1,ymm3,ymm4,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm6,ymm2,ymm7,0x00 + + vbroadcasti128 ymm2,XMMWORD[((-144))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vbroadcasti128 
ymm2,XMMWORD[((-128))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vmovdqu ymm3,YMMWORD[32+rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm7,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-112))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vmovdqu ymm3,YMMWORD[64+rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + + vbroadcasti128 ymm2,XMMWORD[((-96))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-80))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rcx] + vpshufb ymm3,ymm3,ymm0 + + vbroadcasti128 ymm2,XMMWORD[((-64))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vmovdqu ymm4,YMMWORD[96+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-48))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + vpclmulqdq ymm2,ymm4,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-32))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vpclmulqdq ymm2,ymm4,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-16))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + + + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + vaesenclast ymm12,ymm12,ymm2 + vaesenclast ymm13,ymm13,ymm3 + vaesenclast ymm14,ymm14,ymm5 + vaesenclast ymm15,ymm15,ymm6 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + sub rdx,-128 + add r8,-128 + cmp r8,127 + ja NEAR $L$crypt_loop_4x__func2 +$L$crypt_loop_4x_done__func2: + + test r8,r8 + jz NEAR $L$done__func2 + + + + + + lea rsi,[128+rdi] + sub rsi,r8 + + + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + vpxor xmm7,xmm7,xmm7 + + cmp r8,64 + jb NEAR $L$lessthan64bytes__func2 + + + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_1__func2: + vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc 
ymm13,ymm13,ymm2 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_1__func2 + vaesenclast ymm12,ymm12,ymm10 + vaesenclast ymm13,ymm13,ymm10 + + + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu ymm3,YMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor ymm13,ymm13,ymm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + + + vpshufb ymm12,ymm2,ymm0 + vpshufb ymm13,ymm3,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu ymm3,YMMWORD[32+rsi] + vpclmulqdq ymm5,ymm12,ymm2,0x00 + vpclmulqdq ymm6,ymm12,ymm2,0x01 + vpclmulqdq ymm4,ymm12,ymm2,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm7,ymm12,ymm2,0x11 + vpclmulqdq ymm4,ymm13,ymm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x11 + vpxor ymm7,ymm7,ymm4 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + jz NEAR $L$reduce__func2 + + vpxor xmm1,xmm1,xmm1 + + +$L$lessthan64bytes__func2: + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_2__func2: + vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_2__func2 + vaesenclast ymm12,ymm12,ymm10 + vaesenclast ymm13,ymm13,ymm10 + + + + + cmp r8,32 + jb NEAR $L$xor_one_block__func2 + je NEAR $L$xor_two_blocks__func2 + +$L$xor_three_blocks__func2: + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu xmm3,XMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor xmm13,xmm13,xmm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu XMMWORD[32+rdx],xmm13 + + vpshufb ymm12,ymm2,ymm0 + vpshufb xmm13,xmm3,xmm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu xmm3,XMMWORD[32+rsi] + vpclmulqdq xmm4,xmm13,xmm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x11 + vpxor ymm7,ymm7,ymm4 + jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 + +$L$xor_two_blocks__func2: + vmovdqu ymm2,YMMWORD[rcx] + vpxor ymm12,ymm12,ymm2 + vmovdqu YMMWORD[rdx],ymm12 + vpshufb ymm12,ymm2,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 + +$L$xor_one_block__func2: + vmovdqu xmm2,XMMWORD[rcx] + vpxor xmm12,xmm12,xmm2 + vmovdqu XMMWORD[rdx],xmm12 + vpshufb xmm12,xmm2,xmm0 + vpxor xmm12,xmm12,xmm1 + vmovdqu xmm2,XMMWORD[rsi] + +$L$ghash_mul_one_vec_unreduced__func2: + vpclmulqdq ymm4,ymm12,ymm2,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x11 + vpxor ymm7,ymm7,ymm4 + +$L$reduce__func2: + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + vpclmulqdq ymm3,ymm2,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm3 + vpclmulqdq ymm3,ymm2,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm7,ymm7,ymm6 + vpxor ymm7,ymm7,ymm3 + vextracti128 xmm1,ymm7,1 + vpxor xmm1,xmm1,xmm7 + +$L$done__func2: + + vpshufb xmm1,xmm1,xmm0 + vmovdqu XMMWORD[r12],xmm1 + + vzeroupper + movdqa xmm6,XMMWORD[rsp] + movdqa xmm7,XMMWORD[16+rsp] + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + movdqa xmm10,XMMWORD[64+rsp] + movdqa xmm11,XMMWORD[80+rsp] + movdqa xmm12,XMMWORD[96+rsp] + movdqa xmm13,XMMWORD[112+rsp] + movdqa xmm14,XMMWORD[128+rsp] + movdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + 
pop rdi + pop rsi + ret +$L$SEH_end_aes_gcm_dec_update_vaes_avx2_17: + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_vpclmulqdq_avx2_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_vpclmulqdq_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 wrt ..imagebase + DD $L$SEH_end_gcm_gmult_vpclmulqdq_avx2_5 wrt ..imagebase + DD $L$SEH_info_gcm_gmult_vpclmulqdq_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx2_8 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_enc_update_vaes_avx2_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_enc_update_vaes_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_dec_update_vaes_avx2_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_dec_update_vaes_avx2_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_gcm_init_vpclmulqdq_avx2_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_gmult_vpclmulqdq_avx2_0: + DB 1 + DB $L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_vpclmulqdq_avx2_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 9 + DB 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 130 + + DW 0 +$L$SEH_info_aes_gcm_enc_update_vaes_avx2_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 152 + DW 3 + DB 
$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 112 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 96 + + DW 0 +$L$SEH_info_aes_gcm_dec_update_vaes_avx2_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 112 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 96 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/gen/sources.bzl b/gen/sources.bzl index f91b49e00d..5af0dd23f5 100644 --- a/gen/sources.bzl +++ b/gen/sources.bzl @@ -104,6 +104,8 @@ bcm_internal_headers = [ bcm_sources_asm = [ "gen/bcm/aes-gcm-avx10-x86_64-apple.S", "gen/bcm/aes-gcm-avx10-x86_64-linux.S", + "gen/bcm/aes-gcm-avx2-x86_64-apple.S", + "gen/bcm/aes-gcm-avx2-x86_64-linux.S", "gen/bcm/aesni-gcm-x86_64-apple.S", "gen/bcm/aesni-gcm-x86_64-linux.S", "gen/bcm/aesni-x86-apple.S", @@ -203,6 +205,7 @@ bcm_sources_asm = [ bcm_sources_nasm = [ "gen/bcm/aes-gcm-avx10-x86_64-win.asm", + "gen/bcm/aes-gcm-avx2-x86_64-win.asm", "gen/bcm/aesni-gcm-x86_64-win.asm", "gen/bcm/aesni-x86-win.asm", "gen/bcm/aesni-x86_64-win.asm", diff --git a/gen/sources.cmake b/gen/sources.cmake index 369a9e658e..bbbb9c2449 100644 --- a/gen/sources.cmake +++ b/gen/sources.cmake @@ -110,6 +110,8 @@ set( gen/bcm/aes-gcm-avx10-x86_64-apple.S gen/bcm/aes-gcm-avx10-x86_64-linux.S + gen/bcm/aes-gcm-avx2-x86_64-apple.S + 
gen/bcm/aes-gcm-avx2-x86_64-linux.S gen/bcm/aesni-gcm-x86_64-apple.S gen/bcm/aesni-gcm-x86_64-linux.S gen/bcm/aesni-x86-apple.S @@ -211,6 +213,7 @@ set( BCM_SOURCES_NASM gen/bcm/aes-gcm-avx10-x86_64-win.asm + gen/bcm/aes-gcm-avx2-x86_64-win.asm gen/bcm/aesni-gcm-x86_64-win.asm gen/bcm/aesni-x86-win.asm gen/bcm/aesni-x86_64-win.asm diff --git a/gen/sources.gni b/gen/sources.gni index d9862d97a9..b5c3d54223 100644 --- a/gen/sources.gni +++ b/gen/sources.gni @@ -104,6 +104,8 @@ bcm_internal_headers = [ bcm_sources_asm = [ "gen/bcm/aes-gcm-avx10-x86_64-apple.S", "gen/bcm/aes-gcm-avx10-x86_64-linux.S", + "gen/bcm/aes-gcm-avx2-x86_64-apple.S", + "gen/bcm/aes-gcm-avx2-x86_64-linux.S", "gen/bcm/aesni-gcm-x86_64-apple.S", "gen/bcm/aesni-gcm-x86_64-linux.S", "gen/bcm/aesni-x86-apple.S", @@ -203,6 +205,7 @@ bcm_sources_asm = [ bcm_sources_nasm = [ "gen/bcm/aes-gcm-avx10-x86_64-win.asm", + "gen/bcm/aes-gcm-avx2-x86_64-win.asm", "gen/bcm/aesni-gcm-x86_64-win.asm", "gen/bcm/aesni-x86-win.asm", "gen/bcm/aesni-x86_64-win.asm", diff --git a/gen/sources.json b/gen/sources.json index 1b482e1bd4..c4604c8365 100644 --- a/gen/sources.json +++ b/gen/sources.json @@ -88,6 +88,8 @@ "asm": [ "gen/bcm/aes-gcm-avx10-x86_64-apple.S", "gen/bcm/aes-gcm-avx10-x86_64-linux.S", + "gen/bcm/aes-gcm-avx2-x86_64-apple.S", + "gen/bcm/aes-gcm-avx2-x86_64-linux.S", "gen/bcm/aesni-gcm-x86_64-apple.S", "gen/bcm/aesni-gcm-x86_64-linux.S", "gen/bcm/aesni-x86-apple.S", @@ -186,6 +188,7 @@ ], "nasm": [ "gen/bcm/aes-gcm-avx10-x86_64-win.asm", + "gen/bcm/aes-gcm-avx2-x86_64-win.asm", "gen/bcm/aesni-gcm-x86_64-win.asm", "gen/bcm/aesni-x86-win.asm", "gen/bcm/aesni-x86_64-win.asm",
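
For reference, the vpclmulqdq sequences in the generated functions above implement multiplication in GF(2^128) as specified for GHASH in NIST SP 800-38D. The following is a minimal, illustrative sketch only, not part of this change: it assumes 16-byte blocks are read as big-endian integers, and the names gf128_mul and ghash are placeholders. The assembly computes the same products with carryless multiplies (immediates 0x00 and 0x11 for the low and high halves, plus the XOR-of-halves middle term) and folds the double-width result back to 128 bits using the polynomial broadcast from $L$gfpoly, rather than working bit by bit.

    # Bit-by-bit reference for GF(2^128) multiplication in GCM's bit order,
    # following the right-shift formulation of NIST SP 800-38D.
    R = 0xE1 << 120  # x^128 + x^7 + x^2 + x + 1, in GCM's reflected bit order

    def gf128_mul(x, y):
        """Multiply two 128-bit field elements (blocks read as big-endian ints)."""
        z = 0
        v = x
        for i in range(128):
            if (y >> (127 - i)) & 1:   # walk the bits of y from GCM's "first" bit
                z ^= v
            v = (v >> 1) ^ R if (v & 1) else (v >> 1)
        return z

    def ghash(h, blocks):
        """GHASH over whole 16-byte blocks: Y_i = (Y_{i-1} XOR X_i) * H."""
        y = 0
        for x in blocks:
            y = gf128_mul(y ^ x, h)
        return y

The main loops above process 128 bytes (eight blocks) per iteration, multiplying each 32-byte vector by a different precomputed hash-key power loaded from the Htable argument; the partial products are accumulated unreduced (see $L$ghash_mul_one_vec_unreduced__func1/__func2) and a single reduction is performed at $L$reduce__func1/$L$reduce__func2.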