[X86] lowerV4F64Shuffle - prefer lowerShuffleAsDecomposedShuffleMerge if we're blending inplace/splatable shuffle inputs on AVX2 targets #126420
Conversation
… if we're blending inplace/splatable shuffle inputs on AVX2 targets

More aggressively use broadcast instructions where possible.

Fixes llvm#50315
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: More aggressively use broadcast instructions where possible. Fixes #50315

Full diff: https://github.com/llvm/llvm-project/pull/126420.diff

5 Files Affected:
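Before the diff, a quick illustration of the new heuristic. The sketch below is a standalone C++ mock-up, not LLVM code: the free function mirrors the isShuffleMaskInputBroadcastable helper added in X86ISelLowering.cpp, while the function name, the main() driver, and the example mask are illustrative only. An input is considered broadcastable when every mask element that selects from it refers to the same source element (by default element 0), so on AVX2 it can be materialized with a single vbroadcastsd and blended into the other operand.

// Standalone sketch of the mask test the patch adds (illustrative, not LLVM API).
#include <cassert>
#include <cstdio>
#include <vector>

// Returns true if every mask element drawn from the given input (0 or 1)
// selects the same source element, i.e. the input only contributes a splat.
// Negative entries follow the usual shufflevector convention for undef.
static bool isInputBroadcastable(int Input, const std::vector<int> &Mask,
                                 int BroadcastableElement = 0) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
        Mask[i] % Size != BroadcastableElement)
      return false;
  return true;
}

int main() {
  // Mask from the copy-low-subvec-elt test below: <0, 1, 2, 4> on <4 x double>.
  std::vector<int> Mask = {0, 1, 2, 4};
  std::printf("V1 broadcastable: %d\n", isInputBroadcastable(0, Mask)); // prints 0
  std::printf("V2 broadcastable: %d\n", isInputBroadcastable(1, Mask)); // prints 1
  return 0;
}

For that mask, V1 is already in place (elements 0..2 stay put) and V2 only contributes its element 0, so the amended condition in lowerV4F64Shuffle skips the lowerShuffleAsLanePermuteAndSHUFP path on AVX2 and the decomposed shuffle/merge lowering emits vbroadcastsd + vblendps instead of vinsertf128 + vshufpd, as the test updates show.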
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 744e4e740cb2102..9a916a663a64c20 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12689,6 +12689,20 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
return true;
}
+/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
+/// the given mask.
+///
+static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
+ int BroadcastableElement = 0) {
+ assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && Mask[i] / Size == Input &&
+ Mask[i] % Size != BroadcastableElement)
+ return false;
+ return true;
+}
+
/// If we are extracting two 128-bit halves of a vector and shuffling the
/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
/// multi-shuffle lowering.
@@ -16190,6 +16204,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
+ bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
+ bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
// If we have lane crossing shuffles AND they don't all come from the lower
// lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
@@ -16198,7 +16214,9 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
!all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
(V1.getOpcode() != ISD::BUILD_VECTOR) &&
- (V2.getOpcode() != ISD::BUILD_VECTOR))
+ (V2.getOpcode() != ISD::BUILD_VECTOR) &&
+ (!Subtarget.hasAVX2() ||
+ !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
// If we have one input in place, then we can permute the other input and
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
index 1baaab0931cb9ad..26a88ab15e3cca1 100644
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -151,8 +151,8 @@ define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_un
define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:
; CHECK: # %bb.0:
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
+; CHECK-NEXT: vbroadcastsd %xmm1, %ymm1
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; CHECK-NEXT: retq
%r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x double> %r
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 5fe1e2996ee9b08..e2cc3ae0dca0af2 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -256,11 +256,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -277,11 +277,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: retq
%9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
%10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index bdc1ff4c157e4fc..a38ca339cd5e133 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -659,57 +659,57 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm4, %xmm3, %xmm10
-; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0
+; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm10
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm3, %xmm1, %xmm4
+; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm7, %xmm6, %xmm10
-; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0
+; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4
; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm9
-; AVX2-NEXT: vmulsd %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vmulsd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vaddsd %xmm3, %xmm9, %xmm3
; AVX2-NEXT: vmulsd %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vaddsd %xmm7, %xmm4, %xmm4
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm7, %xmm1, %xmm9
+; AVX2-NEXT: vaddsd %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm4, %xmm0, %xmm7
+; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm10
+; AVX2-NEXT: vaddpd %xmm7, %xmm10, %xmm7
; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm3, %xmm10, %xmm11
-; AVX2-NEXT: vaddpd %xmm11, %xmm9, %xmm9
-; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm6, %xmm11, %xmm12
-; AVX2-NEXT: vaddpd %xmm12, %xmm9, %xmm9
-; AVX2-NEXT: vmulsd %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm10
-; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7
-; AVX2-NEXT: vmulsd %xmm11, %xmm8, %xmm10
-; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7
+; AVX2-NEXT: vmulpd %xmm6, %xmm10, %xmm11
+; AVX2-NEXT: vaddpd %xmm7, %xmm11, %xmm7
+; AVX2-NEXT: vmulsd %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vmulsd %xmm5, %xmm9, %xmm9
+; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vmulsd %xmm10, %xmm8, %xmm9
+; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm0
; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm1, %xmm10, %xmm1
-; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm3, %xmm11, %xmm3
-; AVX2-NEXT: vaddpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vaddpd %xmm6, %xmm1, %xmm1
-; AVX2-NEXT: vmulsd %xmm2, %xmm10, %xmm2
-; AVX2-NEXT: vmulsd %xmm5, %xmm11, %xmm5
+; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm1, %xmm6, %xmm6
+; AVX2-NEXT: vaddpd %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm2
+; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm5
; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vmulsd %xmm3, %xmm8, %xmm3
-; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1
-; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3]
-; AVX2-NEXT: vmovsd %xmm2, 64(%rdi)
-; AVX2-NEXT: vmovapd %ymm1, 32(%rdi)
-; AVX2-NEXT: vmovapd %ymm0, (%rdi)
+; AVX2-NEXT: vmulsd %xmm1, %xmm8, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vbroadcastsd %xmm7, %ymm2
+; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm3
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
+; AVX2-NEXT: vmovsd %xmm1, 64(%rdi)
+; AVX2-NEXT: vmovapd %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovapd %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 79602a18693dbed..00af58544e25c0b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -493,11 +493,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
; X86-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
; X86-AVX2-NEXT: vmovapd %ymm3, (%edx)
; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -520,13 +520,13 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
-; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
-; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX512-NEXT: vpermt2pd %zmm4, %zmm5, %zmm6
-; X86-AVX512-NEXT: vmovapd %ymm6, (%edx)
+; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
+; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
+; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X86-AVX512-NEXT: vmovapd %ymm4, (%edx)
; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)
@@ -563,11 +563,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
;
; X64-AVX2-LABEL: PR48908:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
; X64-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
; X64-AVX2-NEXT: vmovapd %ymm3, (%rdi)
; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -587,16 +587,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,2,8,9]
-; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
-; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm5, %zmm6
-; X64-AVX512-NEXT: vmovapd %ymm6, (%rdi)
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,10,1]
-; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3
-; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi)
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
+; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
+; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
+; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X64-AVX512-NEXT: vmovapd %ymm4, (%rdi)
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
+; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
+; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi)
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,11]
; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/11/builds/12334

Here is the relevant piece of the build log for reference:
… if we're blending inplace/splatable shuffle inputs on AVX2 targets (llvm#126420)

More aggressively use broadcast instructions where possible.

Fixes llvm#50315