diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 260796cc5eca..a03e380839e3 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -863,6 +863,12 @@ Xorpd Phaddw Phaddd + Punpckhdq + Punpckldq + Punpckhqdq + Punpcklqdq + Pshuflw + Pshufhw )) (type CmpOpcode extern @@ -1347,6 +1353,12 @@ Vcvttps2dq Vphaddw Vphaddd + Vpunpckhdq + Vpunpckldq + Vpunpckhqdq + Vpunpcklqdq + Vpshuflw + Vpshufhw )) (type Avx512Opcode extern @@ -2729,6 +2741,38 @@ (if-let $true (has_avx)) (xmm_rmir_vex (AvxOpcode.Vpunpcklwd) src1 src2)) +;; Helper for creating `punpckldq` instructions. +(decl x64_punpckldq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpckldq src1 src2) + (xmm_rm_r (SseOpcode.Punpckldq) src1 src2)) +(rule 1 (x64_punpckldq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckldq) src1 src2)) + +;; Helper for creating `punpckhdq` instructions. +(decl x64_punpckhdq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpckhdq src1 src2) + (xmm_rm_r (SseOpcode.Punpckhdq) src1 src2)) +(rule 1 (x64_punpckhdq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckhdq) src1 src2)) + +;; Helper for creating `punpcklqdq` instructions. +(decl x64_punpcklqdq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpcklqdq src1 src2) + (xmm_rm_r (SseOpcode.Punpcklqdq) src1 src2)) +(rule 1 (x64_punpcklqdq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpcklqdq) src1 src2)) + +;; Helper for creating `punpckhqdq` instructions. +(decl x64_punpckhqdq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpckhqdq src1 src2) + (xmm_rm_r (SseOpcode.Punpckhqdq) src1 src2)) +(rule 1 (x64_punpckhqdq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckhqdq) src1 src2)) + ;; Helper for creating `unpcklps` instructions. (decl x64_unpcklps (Xmm XmmMem) Xmm) (rule 0 (x64_unpcklps src1 src2) @@ -3284,6 +3328,22 @@ (if-let $true (has_avx)) (xmm_rmir_vex (AvxOpcode.Vpshufb) src1 src2)) +;; Helper for creating `pshuflw` instructions. +(decl x64_pshuflw (XmmMem u8) Xmm) +(rule (x64_pshuflw src imm) + (xmm_unary_rm_r_imm (SseOpcode.Pshuflw) src imm)) +(rule 1 (x64_pshuflw src imm) + (if-let $true (has_avx)) + (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshuflw) src imm)) + +;; Helper for creating `pshufhw` instructions. +(decl x64_pshufhw (XmmMem u8) Xmm) +(rule (x64_pshufhw src imm) + (xmm_unary_rm_r_imm (SseOpcode.Pshufhw) src imm)) +(rule 1 (x64_pshufhw src imm) + (if-let $true (has_avx)) + (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshufhw) src imm)) + ;; Helper for creating `shufps` instructions. 
(decl x64_shufps (Xmm XmmMem u8) Xmm) (rule 0 (x64_shufps src1 src2 byte) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index ca6e40ce55cf..01ee044ab3d9 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1117,6 +1117,12 @@ pub enum SseOpcode { Xorpd, Phaddw, Phaddd, + Punpckhdq, + Punpckldq, + Punpckhqdq, + Punpcklqdq, + Pshuflw, + Pshufhw, } impl SseOpcode { @@ -1256,7 +1262,13 @@ impl SseOpcode { | SseOpcode::Subpd | SseOpcode::Subsd | SseOpcode::Ucomisd - | SseOpcode::Xorpd => SSE2, + | SseOpcode::Xorpd + | SseOpcode::Punpckldq + | SseOpcode::Punpckhdq + | SseOpcode::Punpcklqdq + | SseOpcode::Punpckhqdq + | SseOpcode::Pshuflw + | SseOpcode::Pshufhw => SSE2, SseOpcode::Pabsb | SseOpcode::Pabsw @@ -1501,6 +1513,12 @@ impl fmt::Debug for SseOpcode { SseOpcode::Xorpd => "xorpd", SseOpcode::Phaddw => "phaddw", SseOpcode::Phaddd => "phaddd", + SseOpcode::Punpckldq => "punpckldq", + SseOpcode::Punpckhdq => "punpckhdq", + SseOpcode::Punpcklqdq => "punpcklqdq", + SseOpcode::Punpckhqdq => "punpckhqdq", + SseOpcode::Pshuflw => "pshuflw", + SseOpcode::Pshufhw => "pshufhw", }; write!(fmt, "{}", name) } @@ -1669,7 +1687,13 @@ impl AvxOpcode { | AvxOpcode::Vcvttpd2dq | AvxOpcode::Vcvttps2dq | AvxOpcode::Vphaddw - | AvxOpcode::Vphaddd => { + | AvxOpcode::Vphaddd + | AvxOpcode::Vpunpckldq + | AvxOpcode::Vpunpckhdq + | AvxOpcode::Vpunpcklqdq + | AvxOpcode::Vpunpckhqdq + | AvxOpcode::Vpshuflw + | AvxOpcode::Vpshufhw => { smallvec![InstructionSet::AVX] } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index b92e1ecd9b55..6d86bffd05f8 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1789,6 +1789,8 @@ pub(crate) fn emit( SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3), SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3), SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), + SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2), + SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; match src { @@ -1946,6 +1948,10 @@ pub(crate) fn emit( SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2), SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2), SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2), + SseOpcode::Punpckldq => (LegacyPrefixes::_66, 0x0F62, 2), + SseOpcode::Punpcklqdq => (LegacyPrefixes::_66, 0x0F6C, 2), + SseOpcode::Punpckhdq => (LegacyPrefixes::_66, 0x0F6A, 2), + SseOpcode::Punpckhqdq => (LegacyPrefixes::_66, 0x0F6D, 2), SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2), SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2), SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2), @@ -2171,6 +2177,10 @@ pub(crate) fn emit( AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F), AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01), AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02), + AvxOpcode::Vpunpckldq => (LP::_66, OM::_0F, 0x62), + AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A), + AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C), + AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D), _ => panic!("unexpected rmir vex opcode {op:?}"), }; VexInstruction::new() @@ -2400,6 +2410,8 @@ pub(crate) fn emit( let (prefix, map, opcode) = match op { AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08), AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09), + 
AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70), + AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70), _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index ac27aae4d6c5..e33c5ee784be 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3529,16 +3529,98 @@ ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Special case for the `punpckhbw` instruction which interleaves the upper -;; lanes of the two input registers. -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) +;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit +;; integers within one value, preserving the other four 16-bit integers in that +;; value (either the high or low half). The complicated logic is in the +;; extractors here implemented in Rust and note that there's two cases for each +;; instruction here to match when either the first or second shuffle operand is +;; used. +(rule 12 (lower (shuffle x y (pshuflw_lhs_imm imm))) + (x64_pshuflw x imm)) +(rule 11 (lower (shuffle x y (pshuflw_rhs_imm imm))) + (x64_pshuflw y imm)) +(rule 10 (lower (shuffle x y (pshufhw_lhs_imm imm))) + (x64_pshufhw x imm)) +(rule 9 (lower (shuffle x y (pshufhw_rhs_imm imm))) + (x64_pshufhw y imm)) + +(decl pshuflw_lhs_imm (u8) Immediate) +(extern extractor pshuflw_lhs_imm pshuflw_lhs_imm) +(decl pshuflw_rhs_imm (u8) Immediate) +(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm) +(decl pshufhw_lhs_imm (u8) Immediate) +(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm) +(decl pshufhw_rhs_imm (u8) Immediate) +(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm) + +;; Special case for the `pshufd` instruction which will permute 32-bit values +;; within a single register. This is only applicable if the `imm` specified +;; selects 32-bit values from either `x` or `y`, but not both. This means +;; there's one rule for selecting from `x` and another rule for selecting from +;; `y`. +(rule 8 (lower (shuffle x y (pshufd_lhs_imm imm))) + (x64_pshufd x imm)) +(rule 7 (lower (shuffle x y (pshufd_rhs_imm imm))) + (x64_pshufd y imm)) + +(decl pshufd_lhs_imm (u8) Immediate) +(extern extractor pshufd_lhs_imm pshufd_lhs_imm) +(decl pshufd_rhs_imm (u8) Immediate) +(extern extractor pshufd_rhs_imm pshufd_rhs_imm) + +;; Special case for i8-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) (x64_punpckhbw a b)) - -;; Special case for the `punpcklbw` instruction which interleaves the lower -;; lanes of the two input registers. -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) (x64_punpcklbw a b)) +;; Special case for i16-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908))) + (x64_punpckhwd a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100))) + (x64_punpcklwd a b)) + +;; Special case for i32-level interleaving of upper/low bytes. 
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908))) + (x64_punpckhdq a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100))) + (x64_punpckldq a b)) + +;; Special case for i64-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908))) + (x64_punpckhqdq a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100))) + (x64_punpcklqdq a b)) + +;; If the vector shuffle mask is all 0s then that means the first byte of the +;; first operand is broadcast to all bytes. Falling through would load an +;; all-zeros constant from a rip-relative location but it should be slightly +;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero +;; register. +(rule 6 (lower (shuffle a _ (u128_from_immediate 0))) + (x64_pshufb a (xmm_zero $I8X16))) + +;; Special case for the `shufps` instruction which will select two 32-bit values +;; from the first operand and two 32-bit values from the second operand. Note +;; that there is a second case here as well for when the operands can be +;; swapped. +;; +;; Note that the priority of this instruction is currently lower than the above +;; special cases since `shufps` handles many of them and for now it's +;; hypothesized that the dedicated instructions are better than `shufps`. +;; Someone with more knowledge about x86 timings should perhaps reorder the +;; rules here eventually though. +(rule 5 (lower (shuffle x y (shufps_imm imm))) + (x64_shufps x y imm)) +(rule 4 (lower (shuffle x y (shufps_rev_imm imm))) + (x64_shufps y x imm)) + +(decl shufps_imm (u8) Immediate) +(extern extractor shufps_imm shufps_imm) +(decl shufps_rev_imm (u8) Immediate) +(extern extractor shufps_rev_imm shufps_rev_imm) + + ;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM ;; register. We statically build `constructed_mask` to zero out any unknown lane ;; indices (may not be completely necessary: verification could fail incorrect diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 61be54a00524..a5549e0fd539 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -999,6 +999,124 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { }, } } + + fn pshufd_lhs_imm(&mut self, imm: Immediate) -> Option<u8> { + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshufd_rhs_imm(&mut self, imm: Immediate) -> Option<u8> { + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + // When selecting from the right-hand-side, subtract these all by 4 + // which will bail out if anything is less than 4. Afterwards the check + // is the same as `pshufd_lhs_imm` above. + let a = a.checked_sub(4)?; + let b = b.checked_sub(4)?; + let c = c.checked_sub(4)?; + let d = d.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn shufps_imm(&mut self, imm: Immediate) -> Option<u8> { + // The `shufps` instruction selects the first two elements from the + // first vector and the second two elements from the second vector, so + // offset the third/fourth selectors by 4 and then make sure each + // adjusted selector fits in its 2-bit field of the 8-bit immediate.
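+        // For example, the byte-level mask +        // [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] selects 32-bit lanes +        // (2, 3, 4, 5); after subtracting 4 from the last two selectors this +        // packs to 0b01_00_11_10 = 0x4e, the immediate seen in the +        // `%not_single_pshufd` test below.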
+ let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + let c = c.checked_sub(4)?; + let d = d.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn shufps_rev_imm(&mut self, imm: Immediate) -> Option<u8> { + // This is almost the same as `shufps_imm` except the elements that are + // subtracted are reversed. This handles the case where a `shufps` + // instruction can be emitted if the order of the operands is swapped. + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + let a = a.checked_sub(4)?; + let b = b.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshuflw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> { + // Similar to `shufps` except this operates over 16-bit values so four + // of them must be fixed and the other four must be in-range to encode + // in the immediate. + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshuflw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> { + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + let a = a.checked_sub(8)?; + let b = b.checked_sub(8)?; + let c = c.checked_sub(8)?; + let d = d.checked_sub(8)?; + let e = e.checked_sub(8)?; + let f = f.checked_sub(8)?; + let g = g.checked_sub(8)?; + let h = h.checked_sub(8)?; + if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshufhw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> { + // Similar to `pshuflw` except that the first four operands must be + // fixed and the second four are offset by an extra 4 and tested to + // make sure they're all in the range [4, 8). + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + let e = e.checked_sub(4)?; + let f = f.checked_sub(4)?; + let g = g.checked_sub(4)?; + let h = h.checked_sub(4)?; + if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] { + Some(e | (f << 2) | (g << 4) | (h << 6)) + } else { + None + } + } + + fn pshufhw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> { + // Note that everything here is offset by at least 8 and the upper + // four are offset by 12 to test they're in the range of [12, 16). + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + let a = a.checked_sub(8)?; + let b = b.checked_sub(8)?; + let c = c.checked_sub(8)?; + let d = d.checked_sub(8)?; + let e = e.checked_sub(12)?; + let f = f.checked_sub(12)?; + let g = g.checked_sub(12)?; + let h = h.checked_sub(12)?; + if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] { + Some(e | (f << 2) | (g << 4) | (h << 6)) + } else { + None + } + } } impl IsleContext<'_, '_, MInst, X64Backend> { diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index 70bcb7d12e52..3eb0db7ea5df 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -585,9 +585,86 @@ macro_rules! isle_lower_prelude_methods { .collect(); self.lower_ctx.gen_return(rets); } + + /// Attempts to interpret the shuffle immediate `imm` as a shuffle of + /// 32-bit lanes, returning four integers, each of which is less than 8, + /// which together represent a permutation of 32-bit lanes as specified by + /// `imm`.
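+        /// The four indices are returned in order starting from the +        /// least-significant 32-bit lane of the shuffle result.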
+ /// + /// For example the shuffle immediate + /// + /// `0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27` + /// + /// would return `Some((0, 2, 4, 6))`. + fn shuffle32_from_imm(&mut self, imm: Immediate) -> Option<(u8, u8, u8, u8)> { + use crate::machinst::isle::shuffle_imm_as_le_lane_idx; + + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + Some(( + shuffle_imm_as_le_lane_idx(4, &bytes[0..4])?, + shuffle_imm_as_le_lane_idx(4, &bytes[4..8])?, + shuffle_imm_as_le_lane_idx(4, &bytes[8..12])?, + shuffle_imm_as_le_lane_idx(4, &bytes[12..16])?, + )) + } + + /// Same as `shuffle32_from_imm`, but for 16-bit lane shuffles. + fn shuffle16_from_imm( + &mut self, + imm: Immediate, + ) -> Option<(u8, u8, u8, u8, u8, u8, u8, u8)> { + use crate::machinst::isle::shuffle_imm_as_le_lane_idx; + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + Some(( + shuffle_imm_as_le_lane_idx(2, &bytes[0..2])?, + shuffle_imm_as_le_lane_idx(2, &bytes[2..4])?, + shuffle_imm_as_le_lane_idx(2, &bytes[4..6])?, + shuffle_imm_as_le_lane_idx(2, &bytes[6..8])?, + shuffle_imm_as_le_lane_idx(2, &bytes[8..10])?, + shuffle_imm_as_le_lane_idx(2, &bytes[10..12])?, + shuffle_imm_as_le_lane_idx(2, &bytes[12..14])?, + shuffle_imm_as_le_lane_idx(2, &bytes[14..16])?, + )) + } }; } +/// Returns the `size`-byte lane referred to by the shuffle immediate specified +/// in `bytes`. +/// +/// This helper is used by `shuffleNN_from_imm` above and is used to interpret a +/// byte-based shuffle as a higher-level shuffle of bigger lanes. This will see +/// if the `bytes` specified, which must have `size` length, specifies a lane in +/// vectors aligned to a `size`-byte boundary. +/// +/// Returns `None` if `bytes` doesn't specify a `size`-byte lane aligned +/// appropriately, or returns `Some(n)` where `n` is the index of the lane being +/// shuffled. +pub fn shuffle_imm_as_le_lane_idx(size: u8, bytes: &[u8]) -> Option<u8> { + assert_eq!(bytes.len(), usize::from(size)); + + // The first index in `bytes` must be aligned to a `size` boundary for the + // bytes to be a valid specifier for a lane of `size` bytes. + if bytes[0] % size != 0 { + return None; + } + + // Afterwards the bytes must all be one larger than the prior to specify a + // contiguous sequence of bytes that's being shuffled. Basically `bytes` + // must refer to the entire `size`-byte lane, in little-endian order. + for i in 0..size - 1 { + let idx = usize::from(i); + if bytes[idx] + 1 != bytes[idx + 1] { + return None; + } + } + + // All of the `bytes` are in-order, meaning that this is a valid shuffle + // immediate to specify a lane of `size` bytes. The index, when viewed as + // `size`-byte immediates, will be the first byte divided by the byte size. + Some(bytes[0] / size) +} + /// Helpers specifically for machines that use ABICaller. #[macro_export] #[doc(hidden)] diff --git a/cranelift/codegen/src/prelude_lower.isle b/cranelift/codegen/src/prelude_lower.isle index 51e15cb2a13f..a7e59d5908a8 100644 --- a/cranelift/codegen/src/prelude_lower.isle +++ b/cranelift/codegen/src/prelude_lower.isle @@ -592,6 +592,16 @@ (decl u64_from_constant (u64) Constant) (extern extractor u64_from_constant u64_from_constant) +;; Extracts lane indices, represented as u8's, if the immediate for a +;; `shuffle` instruction represents shuffling N-bit values. The u8 values +;; returned will be in the range of 0 to (256/N)-1, inclusive, and index the +;; N-bit chunks of two concatenated 128-bit vectors starting from the +;; least-significant bits.
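+;; For example, when viewed as 32-bit lanes the byte-level mask +;; [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3] corresponds to the lane indices +;; (3, 1, 2, 0).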
+(decl shuffle32_from_imm (u8 u8 u8 u8) Immediate) +(extern extractor shuffle32_from_imm shuffle32_from_imm) +(decl shuffle16_from_imm (u8 u8 u8 u8 u8 u8 u8 u8) Immediate) +(extern extractor shuffle16_from_imm shuffle16_from_imm) + ;;;; Helpers for generating returns ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Extractor to check for the special case that a `WritableValueRegs` diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif new file mode 100644 index 000000000000..30cf9721e144 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif @@ -0,0 +1,116 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpckldq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpckldq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpckhdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpckhdq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpcklqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpcklqdq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpckhqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpckhqdq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif index 529b95cc5d56..b056d9f1686c 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle.clif +++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif @@ -52,3 +52,594 @@ block0(v0: i8x16, v1: i8x16): ; popq %rbp ; retq +function %punpcklwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + v5 = bitcast.i16x8 little v4 + return v5 
+} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpcklwd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpcklwd %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhwd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhwd %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufd_0022(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $160, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0xa0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufd_3120(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $39, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0x27, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufd_7546(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $135, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0x87, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %not_single_pshufd(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shufps $78, %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shufps $0x4e, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckldq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; 
punpckldq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhdq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpcklqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpcklqdq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhqdq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shufps_3277(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shufps $251, %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shufps $0xfb, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shufps_6500(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm4 +; movdqa %xmm1, %xmm0 +; shufps $6, %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqa %xmm0, %xmm4 +; movdqa %xmm1, %xmm0 +; shufps $6, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $27, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0x1b, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: 
i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $187, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0xbb, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $27, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0x1b, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $119, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0x77, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufhw $27, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x1b, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufhw $119, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x77, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufhw $27, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x1b, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq 
%rbp +; movq %rsp, %rbp +; block0: +; pshufhw $119, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x77, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pxor %xmm3, %xmm3, %xmm3 +; pshufb %xmm0, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pxor %xmm3, %xmm3 +; pshufb %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index 621eebda629e..60b515628d36 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -1,9 +1,10 @@ -test interpret +;; test interpret ;; FIXME(#5915) test run target aarch64 target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi function %shuffle_i8x16(i8x16, i8x16) -> i8x16 { @@ -26,3 +27,234 @@ block0(v0: i8x16): return v1 } ; run: %shuffle1([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]) == [8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7] + +function %punpcklbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23] + return v2 +} +; run: %punpcklbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 17 2 18 3 19 4 20 5 21 6 22 7 23 8 24] + +function %punpckhbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31] + return v2 +} +; run: %punpckhbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [9 25 10 26 11 27 12 28 13 29 14 30 15 31 16 32] + +function %punpcklwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %punpcklwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 9 2 10 3 11 4 12] + +function %punpckhwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %punpckhwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [5 13 6 14 7 15 8 16] + +function %pshufd_0022(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [1 1 3 3] + +function %pshufd_3120(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [4 2 3 1] + +function %pshufd_7546(i32x4, i32x4) -> i32x4 { 
+block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [8 6 5 7] + +function %not_pshufd(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [3 4 5 6] + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [1 5 2 6] + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [3 7 4 8] + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %punpcklqdq([1 2], [5 6]) == [1 5] + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %punpckhqdq([1 2], [5 6]) == [2 6] + +function %shufps_0145(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [1 2 5 6] + +function %shufps_3277(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [4 3 8 8] + +function %shufps_6500(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [7 6 1 1] + +function %pshuflw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 2 1 5 6 7 8] + +function %pshuflw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 4 3 5 6 7 8] + +function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 
little v1 + v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 11 10 9 13 14 15 16] + +function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 10 12 10 13 14 15 16] + +function %pshufhw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 7 6 5] + +function %pshufhw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 6 8 6] + +function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 15 14 13] + +function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 14 16 14] + +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} +; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
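For readers studying the extractors above, the following is a minimal standalone sketch (not part of the patch; all names are illustrative) of how a 16-byte shuffle mask is reinterpreted as 32-bit lane indices and packed into a `pshufd`-style immediate, mirroring `shuffle_imm_as_le_lane_idx` and `pshufd_lhs_imm`:

/// Returns the 32-bit lane index named by a 4-byte group of the mask, or
/// `None` if the group is not an aligned, contiguous 32-bit lane.
fn lane_idx_32(bytes: &[u8]) -> Option<u8> {
    assert_eq!(bytes.len(), 4);
    // The group must start on a 4-byte boundary ...
    if bytes[0] % 4 != 0 {
        return None;
    }
    // ... and name four consecutive bytes in little-endian order.
    for i in 0..3 {
        if bytes[i] + 1 != bytes[i + 1] {
            return None;
        }
    }
    Some(bytes[0] / 4)
}

/// Packs four lane indices (all < 4, i.e. all from the first operand) into
/// the 8-bit immediate used by `pshufd`.
fn pshufd_imm(mask: &[u8; 16]) -> Option<u8> {
    let a = lane_idx_32(&mask[0..4])?;
    let b = lane_idx_32(&mask[4..8])?;
    let c = lane_idx_32(&mask[8..12])?;
    let d = lane_idx_32(&mask[12..16])?;
    if a < 4 && b < 4 && c < 4 && d < 4 {
        Some(a | (b << 2) | (c << 4) | (d << 6))
    } else {
        None
    }
}

fn main() {
    // The `%pshufd_3120` test above: lanes (3, 1, 2, 0) => immediate 0x27.
    let mask = [12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3];
    assert_eq!(pshufd_imm(&mask), Some(0x27));
    // A mask that selects lanes from both operands is rejected, so the
    // lowering falls through to `shufps` or the generic path.
    let mixed = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23];
    assert_eq!(pshufd_imm(&mixed), None);
}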