diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 260796cc5eca..a03e380839e3 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -863,6 +863,12 @@ Xorpd Phaddw Phaddd + Punpckhdq + Punpckldq + Punpckhqdq + Punpcklqdq + Pshuflw + Pshufhw )) (type CmpOpcode extern @@ -1347,6 +1353,12 @@ Vcvttps2dq Vphaddw Vphaddd + Vpunpckhdq + Vpunpckldq + Vpunpckhqdq + Vpunpcklqdq + Vpshuflw + Vpshufhw )) (type Avx512Opcode extern @@ -2729,6 +2741,38 @@ (if-let $true (has_avx)) (xmm_rmir_vex (AvxOpcode.Vpunpcklwd) src1 src2)) +;; Helper for creating `punpckldq` instructions. +(decl x64_punpckldq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpckldq src1 src2) + (xmm_rm_r (SseOpcode.Punpckldq) src1 src2)) +(rule 1 (x64_punpckldq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckldq) src1 src2)) + +;; Helper for creating `punpckhdq` instructions. +(decl x64_punpckhdq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpckhdq src1 src2) + (xmm_rm_r (SseOpcode.Punpckhdq) src1 src2)) +(rule 1 (x64_punpckhdq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckhdq) src1 src2)) + +;; Helper for creating `punpcklqdq` instructions. +(decl x64_punpcklqdq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpcklqdq src1 src2) + (xmm_rm_r (SseOpcode.Punpcklqdq) src1 src2)) +(rule 1 (x64_punpcklqdq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpcklqdq) src1 src2)) + +;; Helper for creating `punpckhqdq` instructions. +(decl x64_punpckhqdq (Xmm XmmMem) Xmm) +(rule 0 (x64_punpckhqdq src1 src2) + (xmm_rm_r (SseOpcode.Punpckhqdq) src1 src2)) +(rule 1 (x64_punpckhqdq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckhqdq) src1 src2)) + ;; Helper for creating `unpcklps` instructions. (decl x64_unpcklps (Xmm XmmMem) Xmm) (rule 0 (x64_unpcklps src1 src2) @@ -3284,6 +3328,22 @@ (if-let $true (has_avx)) (xmm_rmir_vex (AvxOpcode.Vpshufb) src1 src2)) +;; Helper for creating `pshuflw` instructions. +(decl x64_pshuflw (XmmMem u8) Xmm) +(rule (x64_pshuflw src imm) + (xmm_unary_rm_r_imm (SseOpcode.Pshuflw) src imm)) +(rule 1 (x64_pshuflw src imm) + (if-let $true (has_avx)) + (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshuflw) src imm)) + +;; Helper for creating `pshufhw` instructions. +(decl x64_pshufhw (XmmMem u8) Xmm) +(rule (x64_pshufhw src imm) + (xmm_unary_rm_r_imm (SseOpcode.Pshufhw) src imm)) +(rule 1 (x64_pshufhw src imm) + (if-let $true (has_avx)) + (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshufhw) src imm)) + ;; Helper for creating `shufps` instructions. 
(decl x64_shufps (Xmm XmmMem u8) Xmm) (rule 0 (x64_shufps src1 src2 byte) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index ca6e40ce55cf..01ee044ab3d9 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1117,6 +1117,12 @@ pub enum SseOpcode { Xorpd, Phaddw, Phaddd, + Punpckhdq, + Punpckldq, + Punpckhqdq, + Punpcklqdq, + Pshuflw, + Pshufhw, } impl SseOpcode { @@ -1256,7 +1262,13 @@ impl SseOpcode { | SseOpcode::Subpd | SseOpcode::Subsd | SseOpcode::Ucomisd - | SseOpcode::Xorpd => SSE2, + | SseOpcode::Xorpd + | SseOpcode::Punpckldq + | SseOpcode::Punpckhdq + | SseOpcode::Punpcklqdq + | SseOpcode::Punpckhqdq + | SseOpcode::Pshuflw + | SseOpcode::Pshufhw => SSE2, SseOpcode::Pabsb | SseOpcode::Pabsw @@ -1501,6 +1513,12 @@ impl fmt::Debug for SseOpcode { SseOpcode::Xorpd => "xorpd", SseOpcode::Phaddw => "phaddw", SseOpcode::Phaddd => "phaddd", + SseOpcode::Punpckldq => "punpckldq", + SseOpcode::Punpckhdq => "punpckhdq", + SseOpcode::Punpcklqdq => "punpcklqdq", + SseOpcode::Punpckhqdq => "punpckhqdq", + SseOpcode::Pshuflw => "pshuflw", + SseOpcode::Pshufhw => "pshufhw", }; write!(fmt, "{}", name) } @@ -1669,7 +1687,13 @@ impl AvxOpcode { | AvxOpcode::Vcvttpd2dq | AvxOpcode::Vcvttps2dq | AvxOpcode::Vphaddw - | AvxOpcode::Vphaddd => { + | AvxOpcode::Vphaddd + | AvxOpcode::Vpunpckldq + | AvxOpcode::Vpunpckhdq + | AvxOpcode::Vpunpcklqdq + | AvxOpcode::Vpunpckhqdq + | AvxOpcode::Vpshuflw + | AvxOpcode::Vpshufhw => { smallvec![InstructionSet::AVX] } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index b92e1ecd9b55..6d86bffd05f8 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1789,6 +1789,8 @@ pub(crate) fn emit( SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3), SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3), SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), + SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2), + SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; match src { @@ -1946,6 +1948,10 @@ pub(crate) fn emit( SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2), SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2), SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2), + SseOpcode::Punpckldq => (LegacyPrefixes::_66, 0x0F62, 2), + SseOpcode::Punpcklqdq => (LegacyPrefixes::_66, 0x0F6C, 2), + SseOpcode::Punpckhdq => (LegacyPrefixes::_66, 0x0F6A, 2), + SseOpcode::Punpckhqdq => (LegacyPrefixes::_66, 0x0F6D, 2), SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2), SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2), SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2), @@ -2171,6 +2177,10 @@ pub(crate) fn emit( AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F), AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01), AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02), + AvxOpcode::Vpunpckldq => (LP::_66, OM::_0F, 0x62), + AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A), + AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C), + AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D), _ => panic!("unexpected rmir vex opcode {op:?}"), }; VexInstruction::new() @@ -2400,6 +2410,8 @@ pub(crate) fn emit( let (prefix, map, opcode) = match op { AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08), AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09), + 
AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70), + AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70), _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index ac27aae4d6c5..e33c5ee784be 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3529,16 +3529,98 @@ ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Special case for the `punpckhbw` instruction which interleaves the upper -;; lanes of the two input registers. -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) +;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit +;; integers within one value, preserving the other four 16-bit integers in that +;; value (either the high or low half). The complicated logic is in the +;; extractors here implemented in Rust and note that there's two cases for each +;; instruction here to match when either the first or second shuffle operand is +;; used. +(rule 12 (lower (shuffle x y (pshuflw_lhs_imm imm))) + (x64_pshuflw x imm)) +(rule 11 (lower (shuffle x y (pshuflw_rhs_imm imm))) + (x64_pshuflw y imm)) +(rule 10 (lower (shuffle x y (pshufhw_lhs_imm imm))) + (x64_pshufhw x imm)) +(rule 9 (lower (shuffle x y (pshufhw_rhs_imm imm))) + (x64_pshufhw y imm)) + +(decl pshuflw_lhs_imm (u8) Immediate) +(extern extractor pshuflw_lhs_imm pshuflw_lhs_imm) +(decl pshuflw_rhs_imm (u8) Immediate) +(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm) +(decl pshufhw_lhs_imm (u8) Immediate) +(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm) +(decl pshufhw_rhs_imm (u8) Immediate) +(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm) + +;; Special case for the `pshufd` instruction which will permute 32-bit values +;; within a single register. This is only applicable if the `imm` specified +;; selects 32-bit values from either `x` or `y`, but not both. This means +;; there's one rule for selecting from `x` and another rule for selecting from +;; `y`. +(rule 8 (lower (shuffle x y (pshufd_lhs_imm imm))) + (x64_pshufd x imm)) +(rule 7 (lower (shuffle x y (pshufd_rhs_imm imm))) + (x64_pshufd y imm)) + +(decl pshufd_lhs_imm (u8) Immediate) +(extern extractor pshufd_lhs_imm pshufd_lhs_imm) +(decl pshufd_rhs_imm (u8) Immediate) +(extern extractor pshufd_rhs_imm pshufd_rhs_imm) + +;; Special case for i8-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) (x64_punpckhbw a b)) - -;; Special case for the `punpcklbw` instruction which interleaves the lower -;; lanes of the two input registers. -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) (x64_punpcklbw a b)) +;; Special case for i16-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908))) + (x64_punpckhwd a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100))) + (x64_punpcklwd a b)) + +;; Special case for i32-level interleaving of upper/low bytes. 
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908))) + (x64_punpckhdq a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100))) + (x64_punpckldq a b)) + +;; Special case for i64-level interleaving of upper/low bytes. +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908))) + (x64_punpckhqdq a b)) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100))) + (x64_punpcklqdq a b)) + +;; If the vector shuffle mask is all 0s then that means the first byte of the +;; first operand is broadcast to all bytes. Falling through would load an +;; all-zeros constant from a rip-relative location but it should be slightly +;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero +;; register. +(rule 6 (lower (shuffle a _ (u128_from_immediate 0))) + (x64_pshufb a (xmm_zero $I8X16))) + +;; Special case for the `shufps` instruction which will select two 32-bit values +;; from the first operand and two 32-bit values from the second operand. Note +;; that there is a second case here as well for when the operands can be +;; swapped. +;; +;; Note that the priority of this instruction is currently lower than the above +;; special cases since `shufps` handles many of them and for now it's +;; hypothesized that the dedicated instructions are better than `shufps`. +;; Someone with more knowledge about x86 timings should perhaps reorder the +;; rules here eventually though. +(rule 5 (lower (shuffle x y (shufps_imm imm))) + (x64_shufps x y imm)) +(rule 4 (lower (shuffle x y (shufps_rev_imm imm))) + (x64_shufps y x imm)) + +(decl shufps_imm (u8) Immediate) +(extern extractor shufps_imm shufps_imm) +(decl shufps_rev_imm (u8) Immediate) +(extern extractor shufps_rev_imm shufps_rev_imm) + + ;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM ;; register. We statically build `constructed_mask` to zero out any unknown lane ;; indices (may not be completely necessary: verification could fail incorrect diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 61be54a00524..a5549e0fd539 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -999,6 +999,124 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { }, } } + + fn pshufd_lhs_imm(&mut self, imm: Immediate) -> Option<u8> { + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshufd_rhs_imm(&mut self, imm: Immediate) -> Option<u8> { + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + // When selecting from the right-hand-side, subtract these all by 4 + // which will bail out if anything is less than 4. Afterwards the check + // is the same as `pshufd_lhs_imm` above. + let a = a.checked_sub(4)?; + let b = b.checked_sub(4)?; + let c = c.checked_sub(4)?; + let d = d.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn shufps_imm(&mut self, imm: Immediate) -> Option<u8> { + // The `shufps` instruction selects the first two elements from the + // first vector and the second two elements from the second vector, so + // offset the third/fourth selectors by 4 and then make sure each + // adjusted selector fits in its 2-bit field of the 8-bit immediate.
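+        // For example, the byte-level mask +        // [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] selects 32-bit lanes +        // (2, 3, 4, 5); after subtracting 4 from the last two selectors this +        // packs to 0b01_00_11_10 = 0x4e, the immediate seen in the +        // `%not_single_pshufd` test below.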
+ let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + let c = c.checked_sub(4)?; + let d = d.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn shufps_rev_imm(&mut self, imm: Immediate) -> Option<u8> { + // This is almost the same as `shufps_imm` except the elements that are + // subtracted are reversed. This handles the case where a `shufps` + // instruction can be emitted if the order of the operands is swapped. + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + let a = a.checked_sub(4)?; + let b = b.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshuflw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> { + // Similar to `shufps` except this operates over 16-bit values so four + // of them must be fixed and the other four must be in-range to encode + // in the immediate. + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshuflw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> { + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + let a = a.checked_sub(8)?; + let b = b.checked_sub(8)?; + let c = c.checked_sub(8)?; + let d = d.checked_sub(8)?; + let e = e.checked_sub(8)?; + let f = f.checked_sub(8)?; + let g = g.checked_sub(8)?; + let h = h.checked_sub(8)?; + if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn pshufhw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> { + // Similar to `pshuflw` except that the first four operands must be + // fixed and the second four are offset by an extra 4 and tested to + // make sure they're all in the range [4, 8). + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + let e = e.checked_sub(4)?; + let f = f.checked_sub(4)?; + let g = g.checked_sub(4)?; + let h = h.checked_sub(4)?; + if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] { + Some(e | (f << 2) | (g << 4) | (h << 6)) + } else { + None + } + } + + fn pshufhw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> { + // Note that everything here is offset by at least 8 and the upper + // four are offset by 12 to test they're in the range of [12, 16). + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + let a = a.checked_sub(8)?; + let b = b.checked_sub(8)?; + let c = c.checked_sub(8)?; + let d = d.checked_sub(8)?; + let e = e.checked_sub(12)?; + let f = f.checked_sub(12)?; + let g = g.checked_sub(12)?; + let h = h.checked_sub(12)?; + if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] { + Some(e | (f << 2) | (g << 4) | (h << 6)) + } else { + None + } + } } impl IsleContext<'_, '_, MInst, X64Backend> { diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index 70bcb7d12e52..3eb0db7ea5df 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -585,9 +585,86 @@ macro_rules! isle_lower_prelude_methods { .collect(); self.lower_ctx.gen_return(rets); } + + /// Attempts to interpret the shuffle immediate `imm` as a shuffle of + /// 32-bit lanes, returning four integers, each of which is less than 8, + /// which together represent a permutation of 32-bit lanes as specified by + /// `imm`.
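+        /// The four indices are returned in order starting from the +        /// least-significant 32-bit lane of the shuffle result.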
+ /// + /// For example the shuffle immediate + /// + /// `0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27` + /// + /// would return `Some((0, 2, 4, 6))`. + fn shuffle32_from_imm(&mut self, imm: Immediate) -> Option<(u8, u8, u8, u8)> { + use crate::machinst::isle::shuffle_imm_as_le_lane_idx; + + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + Some(( + shuffle_imm_as_le_lane_idx(4, &bytes[0..4])?, + shuffle_imm_as_le_lane_idx(4, &bytes[4..8])?, + shuffle_imm_as_le_lane_idx(4, &bytes[8..12])?, + shuffle_imm_as_le_lane_idx(4, &bytes[12..16])?, + )) + } + + /// Same as `shuffle32_from_imm`, but for 16-bit lane shuffles. + fn shuffle16_from_imm( + &mut self, + imm: Immediate, + ) -> Option<(u8, u8, u8, u8, u8, u8, u8, u8)> { + use crate::machinst::isle::shuffle_imm_as_le_lane_idx; + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + Some(( + shuffle_imm_as_le_lane_idx(2, &bytes[0..2])?, + shuffle_imm_as_le_lane_idx(2, &bytes[2..4])?, + shuffle_imm_as_le_lane_idx(2, &bytes[4..6])?, + shuffle_imm_as_le_lane_idx(2, &bytes[6..8])?, + shuffle_imm_as_le_lane_idx(2, &bytes[8..10])?, + shuffle_imm_as_le_lane_idx(2, &bytes[10..12])?, + shuffle_imm_as_le_lane_idx(2, &bytes[12..14])?, + shuffle_imm_as_le_lane_idx(2, &bytes[14..16])?, + )) + } }; } +/// Returns the `size`-byte lane referred to by the shuffle immediate specified +/// in `bytes`. +/// +/// This helper is used by `shuffleNN_from_imm` above and is used to interpret a +/// byte-based shuffle as a higher-level shuffle of bigger lanes. This will see +/// if the `bytes` specified, which must have `size` length, specifies a lane in +/// vectors aligned to a `size`-byte boundary. +/// +/// Returns `None` if `bytes` doesn't specify a `size`-byte lane aligned +/// appropriately, or returns `Some(n)` where `n` is the index of the lane being +/// shuffled. +pub fn shuffle_imm_as_le_lane_idx(size: u8, bytes: &[u8]) -> Option<u8> { + assert_eq!(bytes.len(), usize::from(size)); + + // The first index in `bytes` must be aligned to a `size` boundary for the + // bytes to be a valid specifier for a lane of `size` bytes. + if bytes[0] % size != 0 { + return None; + } + + // Afterwards the bytes must all be one larger than the prior to specify a + // contiguous sequence of bytes that's being shuffled. Basically `bytes` + // must refer to the entire `size`-byte lane, in little-endian order. + for i in 0..size - 1 { + let idx = usize::from(i); + if bytes[idx] + 1 != bytes[idx + 1] { + return None; + } + } + + // All of the `bytes` are in-order, meaning that this is a valid shuffle + // immediate to specify a lane of `size` bytes. The index, when viewed as + // `size`-byte immediates, will be the first byte divided by the byte size. + Some(bytes[0] / size) +} + /// Helpers specifically for machines that use ABICaller. #[macro_export] #[doc(hidden)] diff --git a/cranelift/codegen/src/prelude_lower.isle b/cranelift/codegen/src/prelude_lower.isle index 51e15cb2a13f..a7e59d5908a8 100644 --- a/cranelift/codegen/src/prelude_lower.isle +++ b/cranelift/codegen/src/prelude_lower.isle @@ -592,6 +592,16 @@ (decl u64_from_constant (u64) Constant) (extern extractor u64_from_constant u64_from_constant) +;; Extracts lane indices, represented as u8's, if the immediate for a +;; `shuffle` instruction represents shuffling N-bit values. The u8 values +;; returned will be in the range of 0 to (256/N)-1, inclusive, and index the +;; N-bit chunks of two concatenated 128-bit vectors starting from the +;; least-significant bits.
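+;; For example, when viewed as 32-bit lanes the byte-level mask +;; [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3] corresponds to the lane indices +;; (3, 1, 2, 0).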
+(decl shuffle32_from_imm (u8 u8 u8 u8) Immediate) +(extern extractor shuffle32_from_imm shuffle32_from_imm) +(decl shuffle16_from_imm (u8 u8 u8 u8 u8 u8 u8 u8) Immediate) +(extern extractor shuffle16_from_imm shuffle16_from_imm) + ;;;; Helpers for generating returns ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Extractor to check for the special case that a `WritableValueRegs` diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif new file mode 100644 index 000000000000..30cf9721e144 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif @@ -0,0 +1,116 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpckldq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpckldq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpckhdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpckhdq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpcklqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpcklqdq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpckhqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpckhqdq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif index 529b95cc5d56..b056d9f1686c 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle.clif +++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif @@ -52,3 +52,594 @@ block0(v0: i8x16, v1: i8x16): ; popq %rbp ; retq +function %punpcklwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + v5 = bitcast.i16x8 little v4 + return v5 
+} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpcklwd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpcklwd %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhwd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhwd %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufd_0022(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $160, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0xa0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufd_3120(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $39, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0x27, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufd_7546(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $135, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0x87, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %not_single_pshufd(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shufps $78, %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shufps $0x4e, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckldq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; 
punpckldq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhdq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpcklqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpcklqdq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhqdq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhqdq %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shufps_3277(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shufps $251, %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shufps $0xfb, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shufps_6500(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm4 +; movdqa %xmm1, %xmm0 +; shufps $6, %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqa %xmm0, %xmm4 +; movdqa %xmm1, %xmm0 +; shufps $6, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $27, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0x1b, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: 
i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $187, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0xbb, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $27, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0x1b, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshuflw $119, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshuflw $0x77, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufhw $27, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x1b, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufhw $119, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x77, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufhw $27, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x1b, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq 
%rbp +; movq %rsp, %rbp +; block0: +; pshufhw $119, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufhw $0x77, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pxor %xmm3, %xmm3, %xmm3 +; pshufb %xmm0, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pxor %xmm3, %xmm3 +; pshufb %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index 621eebda629e..60b515628d36 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -1,9 +1,10 @@ -test interpret +;; test interpret ;; FIXME(#5915) test run target aarch64 target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi function %shuffle_i8x16(i8x16, i8x16) -> i8x16 { @@ -26,3 +27,234 @@ block0(v0: i8x16): return v1 } ; run: %shuffle1([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]) == [8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7] + +function %punpcklbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23] + return v2 +} +; run: %punpcklbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 17 2 18 3 19 4 20 5 21 6 22 7 23 8 24] + +function %punpckhbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31] + return v2 +} +; run: %punpckhbw([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [9 25 10 26 11 27 12 28 13 29 14 30 15 31 16 32] + +function %punpcklwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %punpcklwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 9 2 10 3 11 4 12] + +function %punpckhwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %punpckhwd([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [5 13 6 14 7 15 8 16] + +function %pshufd_0022(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [1 1 3 3] + +function %pshufd_3120(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 4 5 6 7 8 9 10 11 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [4 2 3 1] + +function %pshufd_7546(i32x4, i32x4) -> i32x4 { 
+block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [28 29 30 31 20 21 22 23 16 17 18 19 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [8 6 5 7] + +function %not_pshufd(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %pshufd_0022([1 2 3 4], [5 6 7 8]) == [3 4 5 6] + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [1 5 2 6] + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %punpckldq([1 2 3 4], [5 6 7 8]) == [3 7 4 8] + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %punpcklqdq([1 2], [5 6]) == [1 5] + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %punpckhqdq([1 2], [5 6]) == [2 6] + +function %shufps_0145(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [1 2 5 6] + +function %shufps_3277(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [4 3 8 8] + +function %shufps_6500(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [7 6 1 1] + +function %pshuflw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 2 1 5 6 7 8] + +function %pshuflw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 6 7 4 5 8 9 10 11 12 13 14 15] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [4 3 4 3 5 6 7 8] + +function %pshuflw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 
little v1 + v4 = shuffle v2, v3, [22 23 20 21 18 19 16 17 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 11 10 9 13 14 15 16] + +function %pshuflw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [22 23 18 19 22 23 18 19 24 25 26 27 28 29 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshuflw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [12 10 12 10 13 14 15 16] + +function %pshufhw_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 12 13 10 11 8 9] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 7 6 5] + +function %pshufhw_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 14 15 10 11 14 15 10 11] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 8 6 8 6] + +function %pshufhw_rhs_3210(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 28 29 26 27 24 25] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_rhs_3210([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 15 14 13] + +function %pshufhw_rhs_3131(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 20 21 22 23 30 31 26 27 30 31 26 27] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pshufhw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 14 16 14] + +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} +; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
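For readers studying the extractors above, the following is a minimal standalone sketch (not part of the patch; all names are illustrative) of how a 16-byte shuffle mask is reinterpreted as 32-bit lane indices and packed into a `pshufd`-style immediate, mirroring `shuffle_imm_as_le_lane_idx` and `pshufd_lhs_imm`:

/// Returns the 32-bit lane index named by a 4-byte group of the mask, or
/// `None` if the group is not an aligned, contiguous 32-bit lane.
fn lane_idx_32(bytes: &[u8]) -> Option<u8> {
    assert_eq!(bytes.len(), 4);
    // The group must start on a 4-byte boundary ...
    if bytes[0] % 4 != 0 {
        return None;
    }
    // ... and name four consecutive bytes in little-endian order.
    for i in 0..3 {
        if bytes[i] + 1 != bytes[i + 1] {
            return None;
        }
    }
    Some(bytes[0] / 4)
}

/// Packs four lane indices (all < 4, i.e. all from the first operand) into
/// the 8-bit immediate used by `pshufd`.
fn pshufd_imm(mask: &[u8; 16]) -> Option<u8> {
    let a = lane_idx_32(&mask[0..4])?;
    let b = lane_idx_32(&mask[4..8])?;
    let c = lane_idx_32(&mask[8..12])?;
    let d = lane_idx_32(&mask[12..16])?;
    if a < 4 && b < 4 && c < 4 && d < 4 {
        Some(a | (b << 2) | (c << 4) | (d << 6))
    } else {
        None
    }
}

fn main() {
    // The `%pshufd_3120` test above: lanes (3, 1, 2, 0) => immediate 0x27.
    let mask = [12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3];
    assert_eq!(pshufd_imm(&mask), Some(0x27));
    // A mask that selects lanes from both operands is rejected, so the
    // lowering falls through to `shufps` or the generic path.
    let mixed = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23];
    assert_eq!(pshufd_imm(&mixed), None);
}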