
Commit 4972722

[X86] lowerV4F64Shuffle - prefer lowerShuffleAsDecomposedShuffleMerge if we're blending inplace/splatable shuffle inputs on AVX2 targets (llvm#126420)
More aggressively use broadcast instructions where possible.

Fixes llvm#50315
1 parent 87ae954 commit 4972722

5 files changed: +91 -73 lines
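To make the new check concrete, here is a minimal standalone C++ sketch (not code from this commit; helper names and the use of std::vector in place of ArrayRef are assumptions for a self-contained example) of the two per-input tests the lowering now combines, applied to the <4 x double> mask <0,1,2,4> from the updated copy-low-subvec-elt-to-high-subvec-elt.ll test.

// Standalone sketch of the checks combined by this commit: an input may stay
// "in place" if every mask element referencing it already sits in its own
// slot, and it is "broadcastable" if every reference selects the same lane
// (lane 0 by default), so it can be materialized with vbroadcastsd + blend.
#include <cassert>
#include <iostream>
#include <vector>

static bool isInputInPlace(int Input, const std::vector<int> &Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

static bool isInputBroadcastable(int Input, const std::vector<int> &Mask,
                                 int BroadcastableElement = 0) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
        Mask[i] % Size != BroadcastableElement)
      return false;
  return true;
}

int main() {
  // v4f64 shuffle mask <0,1,2,4>: elements 0-2 keep V1 in place, and the only
  // reference to V2 is element 4, i.e. lane 0 of V2. With both operands
  // in-place or splatable, the AVX2 path below now skips
  // lowerShuffleAsLanePermuteAndSHUFP and emits vbroadcastsd + a blend.
  std::vector<int> Mask = {0, 1, 2, 4};
  std::cout << "V1 in place:      " << isInputInPlace(0, Mask) << '\n';       // 1
  std::cout << "V2 broadcastable: " << isInputBroadcastable(1, Mask) << '\n'; // 1
}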

llvm/lib/Target/X86/X86ISelLowering.cpp

+19 -1

@@ -12689,6 +12689,20 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
   return true;
 }

+/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
+/// the given mask.
+///
+static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
+                                            int BroadcastableElement = 0) {
+  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
+        Mask[i] % Size != BroadcastableElement)
+      return false;
+  return true;
+}
+
 /// If we are extracting two 128-bit halves of a vector and shuffling the
 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
 /// multi-shuffle lowering.
@@ -16190,6 +16204,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

   bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
   bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
+  bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
+  bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);

   // If we have lane crossing shuffles AND they don't all come from the lower
   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
@@ -16198,7 +16214,9 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
       !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
       (V1.getOpcode() != ISD::BUILD_VECTOR) &&
-      (V2.getOpcode() != ISD::BUILD_VECTOR))
+      (V2.getOpcode() != ISD::BUILD_VECTOR) &&
+      (!Subtarget.hasAVX2() ||
+       !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
     return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);

   // If we have one input in place, then we can permute the other input and

llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll

+2 -2

@@ -151,8 +151,8 @@ define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_un
 define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
+; CHECK-NEXT: vbroadcastsd %xmm1, %ymm1
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-NEXT: retq
   %r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x double> %r

llvm/test/CodeGen/X86/horizontal-sum.ll

+10 -10

@@ -256,11 +256,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-SLOW-NEXT: retq
 ;
 ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -277,11 +277,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT: retq
   %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
   %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>

llvm/test/CodeGen/X86/matrix-multiply.ll

+41 -41

@@ -659,57 +659,57 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: movq %rdi, %rax
 ; AVX2-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm4, %xmm3, %xmm10
-; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0
+; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm10
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm3, %xmm1, %xmm4
+; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4
 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
 ; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
 ; AVX2-NEXT: vmulpd %xmm7, %xmm6, %xmm10
-; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0
+; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4
 ; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm9
-; AVX2-NEXT: vmulsd %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vmulsd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vaddsd %xmm3, %xmm9, %xmm3
 ; AVX2-NEXT: vmulsd %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vaddsd %xmm7, %xmm4, %xmm4
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm7, %xmm1, %xmm9
+; AVX2-NEXT: vaddsd %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm4, %xmm0, %xmm7
+; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm10
+; AVX2-NEXT: vaddpd %xmm7, %xmm10, %xmm7
 ; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm3, %xmm10, %xmm11
-; AVX2-NEXT: vaddpd %xmm11, %xmm9, %xmm9
-; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm6, %xmm11, %xmm12
-; AVX2-NEXT: vaddpd %xmm12, %xmm9, %xmm9
-; AVX2-NEXT: vmulsd %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm10
-; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7
-; AVX2-NEXT: vmulsd %xmm11, %xmm8, %xmm10
-; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7
+; AVX2-NEXT: vmulpd %xmm6, %xmm10, %xmm11
+; AVX2-NEXT: vaddpd %xmm7, %xmm11, %xmm7
+; AVX2-NEXT: vmulsd %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vmulsd %xmm5, %xmm9, %xmm9
+; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vmulsd %xmm10, %xmm8, %xmm9
+; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm0
 ; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0]
 ; AVX2-NEXT: vmulpd %xmm1, %xmm10, %xmm1
-; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm3, %xmm11, %xmm3
-; AVX2-NEXT: vaddpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vaddpd %xmm6, %xmm1, %xmm1
-; AVX2-NEXT: vmulsd %xmm2, %xmm10, %xmm2
-; AVX2-NEXT: vmulsd %xmm5, %xmm11, %xmm5
+; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm1, %xmm6, %xmm6
+; AVX2-NEXT: vaddpd %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm2
+; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm5
 ; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vmulsd %xmm3, %xmm8, %xmm3
-; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1
-; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3]
-; AVX2-NEXT: vmovsd %xmm2, 64(%rdi)
-; AVX2-NEXT: vmovapd %ymm1, 32(%rdi)
-; AVX2-NEXT: vmovapd %ymm0, (%rdi)
+; AVX2-NEXT: vmulsd %xmm1, %xmm8, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vbroadcastsd %xmm7, %ymm2
+; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm3
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
+; AVX2-NEXT: vmovsd %xmm1, 64(%rdi)
+; AVX2-NEXT: vmovapd %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovapd %ymm2, (%rdi)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

+19 -19

@@ -493,11 +493,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
 ; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X86-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X86-AVX2-NEXT: vmovapd %ymm3, (%edx)
 ; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -520,13 +520,13 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
 ; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
 ; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
-; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
-; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX512-NEXT: vpermt2pd %zmm4, %zmm5, %zmm6
-; X86-AVX512-NEXT: vmovapd %ymm6, (%edx)
+; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
+; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
+; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X86-AVX512-NEXT: vmovapd %ymm4, (%edx)
 ; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
 ; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
 ; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)
@@ -563,11 +563,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ;
 ; X64-AVX2-LABEL: PR48908:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
 ; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X64-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X64-AVX2-NEXT: vmovapd %ymm3, (%rdi)
 ; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -587,16 +587,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
 ; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,2,8,9]
-; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
-; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm5, %zmm6
-; X64-AVX512-NEXT: vmovapd %ymm6, (%rdi)
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,10,1]
-; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3
-; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi)
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
+; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
+; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
+; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X64-AVX512-NEXT: vmovapd %ymm4, (%rdi)
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
+; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
+; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi)
 ; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,11]
 ; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
 ; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]
