Skip to content

Commit

Permalink
x64: Add shuffle support for pshuf{l,h}w
Browse files Browse the repository at this point in the history
This commit adds special lowering cases for the `pshuflw` and `pshufhw`
instructions, which permute the four 16-bit values in either the lower or the
upper half of a 128-bit value while preserving the other half.
  • Loading branch information
alexcrichton committed Mar 8, 2023
1 parent 5e40d36 commit 36a65c7
Show file tree
Hide file tree
Showing 9 changed files with 450 additions and 6 deletions.
20 changes: 20 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,8 @@
Punpckldq
Punpckhqdq
Punpcklqdq
Pshuflw
Pshufhw
))

(type CmpOpcode extern
Expand Down Expand Up @@ -1355,6 +1357,8 @@
Vpunpckldq
Vpunpckhqdq
Vpunpcklqdq
Vpshuflw
Vpshufhw
))

(type Avx512Opcode extern
Expand Down Expand Up @@ -3324,6 +3328,22 @@
(if-let $true (has_avx))
(xmm_rmir_vex (AvxOpcode.Vpshufb) src1 src2))

;; Helper for creating `pshuflw` instructions.
;;
;; Takes the source operand (`XmmMem`) and the 8-bit shuffle immediate and
;; produces the result register.
(decl x64_pshuflw (XmmMem u8) Xmm)
;; Default lowering: the SSE-encoded `pshuflw`.
(rule (x64_pshuflw src imm)
(xmm_unary_rm_r_imm (SseOpcode.Pshuflw) src imm))
;; Higher-priority lowering: when `has_avx` holds, use the VEX-encoded
;; `vpshuflw` instead.
(rule 1 (x64_pshuflw src imm)
(if-let $true (has_avx))
(xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshuflw) src imm))

;; Helper for creating `pshufhw` instructions.
;;
;; Takes the source operand (`XmmMem`) and the 8-bit shuffle immediate and
;; produces the result register.
(decl x64_pshufhw (XmmMem u8) Xmm)
;; Default lowering: the SSE-encoded `pshufhw`.
(rule (x64_pshufhw src imm)
(xmm_unary_rm_r_imm (SseOpcode.Pshufhw) src imm))
;; Higher-priority lowering: when `has_avx` holds, use the VEX-encoded
;; `vpshufhw` instead.
(rule 1 (x64_pshufhw src imm)
(if-let $true (has_avx))
(xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshufhw) src imm))

;; Helper for creating `shufps` instructions.
(decl x64_shufps (Xmm XmmMem u8) Xmm)
(rule 0 (x64_shufps src1 src2 byte)
Expand Down
12 changes: 10 additions & 2 deletions cranelift/codegen/src/isa/x64/inst/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1121,6 +1121,8 @@ pub enum SseOpcode {
Punpckldq,
Punpckhqdq,
Punpcklqdq,
Pshuflw,
Pshufhw,
}

impl SseOpcode {
Expand Down Expand Up @@ -1264,7 +1266,9 @@ impl SseOpcode {
| SseOpcode::Punpckldq
| SseOpcode::Punpckhdq
| SseOpcode::Punpcklqdq
| SseOpcode::Punpckhqdq => SSE2,
| SseOpcode::Punpckhqdq
| SseOpcode::Pshuflw
| SseOpcode::Pshufhw => SSE2,

SseOpcode::Pabsb
| SseOpcode::Pabsw
Expand Down Expand Up @@ -1513,6 +1517,8 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Punpckhdq => "punpckhdq",
SseOpcode::Punpcklqdq => "punpcklqdq",
SseOpcode::Punpckhqdq => "punpckhqdq",
SseOpcode::Pshuflw => "pshuflw",
SseOpcode::Pshufhw => "pshufhw",
};
write!(fmt, "{}", name)
}
Expand Down Expand Up @@ -1685,7 +1691,9 @@ impl AvxOpcode {
| AvxOpcode::Vpunpckldq
| AvxOpcode::Vpunpckhdq
| AvxOpcode::Vpunpcklqdq
| AvxOpcode::Vpunpckhqdq => {
| AvxOpcode::Vpunpckhqdq
| AvxOpcode::Vpshuflw
| AvxOpcode::Vpshufhw => {
smallvec![InstructionSet::AVX]
}
}
Expand Down
4 changes: 4 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1789,6 +1789,8 @@ pub(crate) fn emit(
SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3),
SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2),
SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src {
Expand Down Expand Up @@ -2408,6 +2410,8 @@ pub(crate) fn emit(
let (prefix, map, opcode) = match op {
AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08),
AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09),
AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};

Expand Down
24 changes: 24 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -3529,6 +3529,30 @@

;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Special case the `pshuf{l,h}w` instructions, which shuffle four 16-bit
;; integers within one value while preserving the other four 16-bit integers in
;; that value (either the high or low half). The complicated logic lives in the
;; extractors below, which are implemented in Rust. Note that there are two
;; cases for each instruction: one matching when only the first shuffle operand
;; is used (`*_lhs_imm`) and one when only the second is used (`*_rhs_imm`).
(rule 12 (lower (shuffle x y (pshuflw_lhs_imm imm)))
(x64_pshuflw x imm))
(rule 11 (lower (shuffle x y (pshuflw_rhs_imm imm)))
(x64_pshuflw y imm))
(rule 10 (lower (shuffle x y (pshufhw_lhs_imm imm)))
(x64_pshufhw x imm))
(rule 9 (lower (shuffle x y (pshufhw_rhs_imm imm)))
(x64_pshufhw y imm))

;; External extractors: each inspects the `shuffle` immediate and, when the
;; pattern fits, yields the `u8` immediate to encode in the instruction.
(decl pshuflw_lhs_imm (u8) Immediate)
(extern extractor pshuflw_lhs_imm pshuflw_lhs_imm)
(decl pshuflw_rhs_imm (u8) Immediate)
(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm)
(decl pshufhw_lhs_imm (u8) Immediate)
(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm)
(decl pshufhw_rhs_imm (u8) Immediate)
(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm)

;; Special case for the `pshufd` instruction which will permute 32-bit values
;; within a single register. This is only applicable if the `imm` specified
;; selects 32-bit values from either `x` or `y`, but not both. This means
Expand Down
64 changes: 64 additions & 0 deletions cranelift/codegen/src/isa/x64/lower/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1053,6 +1053,70 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
None
}
}

fn pshuflw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
    // Like the `shufps` matcher, but over 16-bit lanes: the upper four
    // result lanes must be left in place (4, 5, 6, 7) and the lower four
    // must each select one of lanes 0..4 so that every selector fits in
    // two bits of the instruction immediate.
    let (w0, w1, w2, w3, w4, w5, w6, w7) = self.shuffle16_from_imm(imm)?;

    let low_selectors_fit = [w0, w1, w2, w3].iter().all(|&lane| lane < 4);
    let high_half_untouched = [w4, w5, w6, w7] == [4, 5, 6, 7];
    if !(low_selectors_fit && high_half_untouched) {
        return None;
    }

    // Two bits per selector, result lane 0 in the least-significant bits.
    Some(w0 | (w1 << 2) | (w2 << 4) | (w3 << 6))
}

fn pshuflw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
let a = a.checked_sub(8)?;
let b = b.checked_sub(8)?;
let c = c.checked_sub(8)?;
let d = d.checked_sub(8)?;
let e = e.checked_sub(8)?;
let f = f.checked_sub(8)?;
let g = g.checked_sub(8)?;
let h = h.checked_sub(8)?;
if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] {
Some(a | (b << 2) | (c << 4) | (d << 6))
} else {
None
}
}

fn pshufhw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
// Similar to `pshuflw` except that the first four operands must be
// fixed and the second four are offset by an extra 4 and tested to
// make sure they're all in the range [4, 8).
let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
let e = e.checked_sub(4)?;
let f = f.checked_sub(4)?;
let g = g.checked_sub(4)?;
let h = h.checked_sub(4)?;
if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] {
Some(e | (f << 2) | (g << 4) | (h << 6))
} else {
None
}
}

fn pshufhw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
    // Same as the lhs variant with every index shifted up by 8: the lower
    // four lanes must be exactly 8, 9, 10, 11 and the upper four must each
    // fall in the range [12, 16).
    let (w0, w1, w2, w3, w4, w5, w6, w7) = self.shuffle16_from_imm(imm)?;
    if [w0, w1, w2, w3] != [8, 9, 10, 11] {
        return None;
    }

    let upper = [w4, w5, w6, w7];
    if !upper.iter().all(|lane| (12..16).contains(lane)) {
        return None;
    }

    // Fold from the last lane down so the first upper lane ends up in the
    // two least-significant bits of the immediate.
    Some(
        upper
            .iter()
            .rev()
            .fold(0u8, |acc, &lane| (acc << 2) | (lane - 12)),
    )
}
}

impl IsleContext<'_, '_, MInst, X64Backend> {
Expand Down
17 changes: 17 additions & 0 deletions cranelift/codegen/src/machinst/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,23 @@ macro_rules! isle_lower_prelude_methods {
crate::machinst::isle::shuffle_imm(4, &bytes[12..16])?,
))
}

fn shuffle16_from_imm(
    &mut self,
    imm: Immediate,
) -> Option<(u8, u8, u8, u8, u8, u8, u8, u8)> {
    // Decode the shuffle immediate as eight 2-byte groups; each group is
    // handed to `shuffle_imm`, and the first group it rejects makes the
    // whole extraction fail.
    let data = self.lower_ctx.get_immediate_data(imm).as_slice();
    let lane = |idx: usize| {
        let start = 2 * idx;
        crate::machinst::isle::shuffle_imm(2, &data[start..start + 2])
    };
    Some((
        lane(0)?,
        lane(1)?,
        lane(2)?,
        lane(3)?,
        lane(4)?,
        lane(5)?,
        lane(6)?,
        lane(7)?,
    ))
}
};
}

Expand Down
10 changes: 6 additions & 4 deletions cranelift/codegen/src/prelude_lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -592,13 +592,15 @@
(decl u64_from_constant (u64) Constant)
(extern extractor u64_from_constant u64_from_constant)

;; Extracts 4 lane indices, represented as u8's, if the immediate for a
;; `shuffle` instruction represents shuffling 32-bit values. The u8 values
;; returned will be in the range of 0 to 7, inclusive, and index the 32-bit
;; chunks of two concatenated 128-bit vectors starting from the
;; Extracts lane indices, represented as u8's, if the immediate for a
;; `shuffle` instruction represents shuffling N-bit values. The u8 values
;; returned will be in the range of 0 to (256/N)-1, inclusive, and index the
;; N-bit chunks of two concatenated 128-bit vectors starting from the
;; least-significant bits.
;; 32-bit form: yields four lane indices.
(decl shuffle32_from_imm (u8 u8 u8 u8) Immediate)
(extern extractor shuffle32_from_imm shuffle32_from_imm)
;; 16-bit form: yields eight lane indices.
(decl shuffle16_from_imm (u8 u8 u8 u8 u8 u8 u8 u8) Immediate)
(extern extractor shuffle16_from_imm shuffle16_from_imm)

;;;; Helpers for generating returns ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Expand Down
Loading

0 comments on commit 36a65c7

Please sign in to comment.