From 4880d3228f0c6eb4e884e4cb2b3c7d8015900bea Mon Sep 17 00:00:00 2001 From: luojia65 Date: Thu, 9 Dec 2021 13:04:31 +0800 Subject: [PATCH 1/3] Fix avx512f build on x86-32 --- crates/core_arch/src/x86/avx512f.rs | 749 +++++++++++++++------------- 1 file changed, 389 insertions(+), 360 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7633442aa6..0f244de47e 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -4,6 +4,35 @@ use crate::{ ptr, }; +// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full +// register name (e.g. rax). We have to explicitly override the placeholder to +// use the 32-bit register name in that case. + +#[cfg(target_pointer_width = "32")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p:e}]") + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vpl { + ($inst:expr) => { + concat!($inst, ", [{p}]") + }; +} +#[cfg(target_pointer_width = "32")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p:e}]", $inst2) + }; +} +#[cfg(target_pointer_width = "64")] +macro_rules! vps { + ($inst1:expr, $inst2:expr) => { + concat!($inst1, " [{p}]", $inst2) + }; +} + #[cfg(test)] use stdarch_test::assert_instr; @@ -30333,11 +30362,11 @@ pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { let mut dst: __m512i = src; asm!( - "vmovdqu32 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu32 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30352,11 +30381,11 @@ pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *con pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { let mut dst: __m512i; asm!( - "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30371,11 +30400,11 @@ pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __ pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { let mut dst: __m512i = src; asm!( - "vmovdqu64 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu64 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30390,11 +30419,11 @@ pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *cons pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { let mut dst: __m512i; asm!( - "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30409,11 +30438,11 @@ pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { let mut dst: __m512 = src; asm!( - "vmovups {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovups {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30428,11 +30457,11 @@ pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { let mut dst: __m512; asm!( - "vmovups {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovups {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30447,11 +30476,11 @@ pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m51 pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { let mut dst: __m512d = src; asm!( - "vmovupd {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovupd {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30466,11 +30495,11 @@ pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { let mut dst: __m512d; asm!( - "vmovupd {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovupd {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30485,11 +30514,11 @@ pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512 pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { let mut dst: __m256i = src; asm!( - "vmovdqu32 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu32 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30504,11 +30533,11 @@ pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *cons pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { let mut dst: __m256i; asm!( - "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30523,11 +30552,11 @@ pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { let mut dst: __m256i = src; asm!( - "vmovdqu64 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu64 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30542,11 +30571,11 @@ pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *cons pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { let mut dst: __m256i; asm!( - "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30561,11 +30590,11 @@ pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { let mut dst: __m256 = src; asm!( - "vmovups {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovups {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30580,11 +30609,11 @@ pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f3 pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { let mut dst: __m256; asm!( - "vmovups {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovups {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30599,11 +30628,11 @@ pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { let mut dst: __m256d = src; asm!( - "vmovupd {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovupd {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30618,11 +30647,11 @@ pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { let mut dst: __m256d; asm!( - "vmovupd {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovupd {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30637,11 +30666,11 @@ pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256 pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { let mut dst: __m128i = src; asm!( - "vmovdqu32 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu32 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30656,11 +30685,11 @@ pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { let mut dst: __m128i; asm!( - "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30675,11 +30704,11 @@ pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128 pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { let mut dst: __m128i = src; asm!( - "vmovdqu64 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu64 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30694,11 +30723,11 @@ pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { let mut dst: __m128i; asm!( - "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30713,11 +30742,11 @@ pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128 pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { let mut dst: __m128 = src; asm!( - "vmovups {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovups {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30732,11 +30761,11 @@ pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { let mut dst: __m128; asm!( - "vmovups {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovups {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30751,11 +30780,11 @@ pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { let mut dst: __m128d = src; asm!( - "vmovupd {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovupd {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30770,11 +30799,11 @@ pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { let mut dst: __m128d; asm!( - "vmovupd {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovupd {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30789,11 +30818,11 @@ pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { let mut dst: __m512i = src; asm!( - "vmovdqa32 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa32 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30808,11 +30837,11 @@ pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *cons pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { let mut dst: __m512i; asm!( - "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30827,11 +30856,11 @@ pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { let mut dst: __m512i = src; asm!( - "vmovdqa64 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa64 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30846,11 +30875,11 @@ pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { let mut dst: __m512i; asm!( - "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30865,11 +30894,11 @@ pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m5 pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { let mut dst: __m512 = src; asm!( - "vmovaps {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovaps {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30884,11 +30913,11 @@ pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f3 pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { let mut dst: __m512; asm!( - "vmovaps {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovaps {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30903,11 +30932,11 @@ pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { let mut dst: __m512d = src; asm!( - "vmovapd {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovapd {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30922,11 +30951,11 @@ pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f6 pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { let mut dst: __m512d; asm!( - "vmovapd {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovapd {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30941,11 +30970,11 @@ pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { let mut dst: __m256i = src; asm!( - "vmovdqa32 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa32 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30960,11 +30989,11 @@ pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { let mut dst: __m256i; asm!( - "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30979,11 +31008,11 @@ pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m2 pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { let mut dst: __m256i = src; asm!( - "vmovdqa64 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa64 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -30998,11 +31027,11 @@ pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { let mut dst: __m256i; asm!( - "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31017,11 +31046,11 @@ pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m2 pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { let mut dst: __m256 = src; asm!( - "vmovaps {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovaps {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31036,11 +31065,11 @@ pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32 pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { let mut dst: __m256; asm!( - "vmovaps {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovaps {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31055,11 +31084,11 @@ pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { let mut dst: __m256d = src; asm!( - "vmovapd {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovapd {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31074,11 +31103,11 @@ pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f6 pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { let mut dst: __m256d; asm!( - "vmovapd {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovapd {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31093,11 +31122,11 @@ pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { let mut dst: __m128i = src; asm!( - "vmovdqa32 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa32 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31112,11 +31141,11 @@ pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i3 pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { let mut dst: __m128i; asm!( - "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31131,11 +31160,11 @@ pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { let mut dst: __m128i = src; asm!( - "vmovdqa64 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa64 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31150,11 +31179,11 @@ pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i6 pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { let mut dst: __m128i; asm!( - "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31169,11 +31198,11 @@ pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { let mut dst: __m128 = src; asm!( - "vmovaps {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovaps {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31188,11 +31217,11 @@ pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) - pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { let mut dst: __m128; asm!( - "vmovaps {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovaps {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31207,11 +31236,11 @@ pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { let mut dst: __m128d = src; asm!( - "vmovapd {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovapd {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31226,11 +31255,11 @@ pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { let mut dst: __m128d; asm!( - "vmovapd {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovapd {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -31243,11 +31272,11 @@ pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { asm!( - "vmovdqu32 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovdqu32", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -31259,11 +31288,11 @@ pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: _ #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { asm!( - "vmovdqu64 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovdqu64", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -31275,11 +31304,11 @@ pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __ #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { asm!( - "vmovups [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovups", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -31291,11 +31320,11 @@ pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m5 #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { asm!( - "vmovupd [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovupd", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -31307,11 +31336,11 @@ pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m51 #[target_feature(enable = "avx512f,avx512vl,avx")] pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { asm!( - "vmovdqu32 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovdqu32", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -31323,11 +31352,11 @@ pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __ #[target_feature(enable = "avx512f,avx512vl,avx")] pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { asm!( - "vmovdqu64 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovdqu64", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -31339,11 +31368,11 @@ pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __ #[target_feature(enable = "avx512f,avx512vl,avx")] pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { asm!( - "vmovups [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovups", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -31355,11 +31384,11 @@ pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m25 #[target_feature(enable = "avx512f,avx512vl,avx")] pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { asm!( - "vmovupd [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovupd", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -31371,11 +31400,11 @@ pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m25 #[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { asm!( - "vmovdqu32 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovdqu32", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } @@ -31387,11 +31416,11 @@ pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m12 #[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { asm!( - "vmovdqu64 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovdqu64", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } @@ -31403,11 +31432,11 @@ pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m12 #[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { asm!( - "vmovups [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovups", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } @@ -31419,11 +31448,11 @@ pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) #[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { asm!( - "vmovupd [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovupd", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } @@ -31435,11 +31464,11 @@ pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { asm!( - "vmovdqa32 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovdqa32", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -31451,11 +31480,11 @@ pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __ #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { asm!( - "vmovdqa64 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovdqa64", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -31467,11 +31496,11 @@ pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { asm!( - "vmovaps [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovaps", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -31483,11 +31512,11 @@ pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m51 #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { asm!( - "vmovapd [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovapd", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -31499,11 +31528,11 @@ pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512 #[target_feature(enable = "avx512f,avx512vl,avx")] pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { asm!( - "vmovdqa32 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovdqa32", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -31515,11 +31544,11 @@ pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m #[target_feature(enable = "avx512f,avx512vl,avx")] pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { asm!( - "vmovdqa64 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovdqa64", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -31531,11 +31560,11 @@ pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m #[target_feature(enable = "avx512f,avx512vl,avx")] pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { asm!( - "vmovaps [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovaps", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -31547,11 +31576,11 @@ pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256 #[target_feature(enable = "avx512f,avx512vl,avx")] pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { asm!( - "vmovapd [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovapd", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -31563,11 +31592,11 @@ pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256 #[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { asm!( - "vmovdqa32 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovdqa32", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } @@ -31579,11 +31608,11 @@ pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128 #[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { asm!( - "vmovdqa64 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovdqa64", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } @@ -31595,11 +31624,11 @@ pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128 #[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { asm!( - "vmovaps [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovaps", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } @@ -31611,11 +31640,11 @@ pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { #[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { asm!( - "vmovapd [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovapd", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } From 9870fea69fafec0ec2a4cbd87d3b0a0cbaaa1052 Mon Sep 17 00:00:00 2001 From: luojia65 Date: Thu, 9 Dec 2021 13:05:32 +0800 Subject: [PATCH 2/3] Fix avx512gfni test fail Caused by multiply overflow when generating test case array. The failed test cases are: core_arch::x86::avx512gfni::tests::test_mm256_gf2p8affine_epi64_epi8 core_arch::x86::avx512gfni::tests::test_mm256_gf2p8mul_epi8 core_arch::x86::avx512gfni::tests::test_mm256_mask_gf2p8mul_epi8 core_arch::x86::avx512gfni::tests::test_mm256_maskz_gf2p8mul_epi8 core_arch::x86::avx512gfni::tests::test_mm512_gf2p8affine_epi64_epi8 core_arch::x86::avx512gfni::tests::test_mm512_gf2p8mul_epi8 core_arch::x86::avx512gfni::tests::test_mm512_mask_gf2p8mul_epi8 core_arch::x86::avx512gfni::tests::test_mm512_maskz_gf2p8mul_epi8 core_arch::x86::avx512gfni::tests::test_mm_gf2p8affine_epi64_epi8 core_arch::x86::avx512gfni::tests::test_mm_gf2p8mul_epi8 core_arch::x86::avx512gfni::tests::test_mm_mask_gf2p8mul_epi8 core_arch::x86::avx512gfni::tests::test_mm_maskz_gf2p8mul_epi8 ---- core_arch::x86::avx512gfni::tests::test_mm256_gf2p8affine_epi64_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm256_gf2p8affine_epi64_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace ---- core_arch::x86::avx512gfni::tests::test_mm256_gf2p8mul_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm256_gf2p8mul_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm256_mask_gf2p8mul_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm256_mask_gf2p8mul_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm256_maskz_gf2p8mul_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm256_maskz_gf2p8mul_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm512_gf2p8affine_epi64_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm512_gf2p8affine_epi64_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm512_gf2p8mul_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm512_gf2p8mul_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm512_mask_gf2p8mul_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm512_mask_gf2p8mul_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm512_maskz_gf2p8mul_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm512_maskz_gf2p8mul_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm_gf2p8affine_epi64_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm_gf2p8affine_epi64_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm_gf2p8mul_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm_gf2p8mul_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm_mask_gf2p8mul_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm_mask_gf2p8mul_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 ---- core_arch::x86::avx512gfni::tests::test_mm_maskz_gf2p8mul_epi8 stdout ---- thread 'core_arch::x86::avx512gfni::tests::test_mm_maskz_gf2p8mul_epi8' panicked at 'attempt to multiply with overflow', crates\core_arch\src\x86\avx512gfni.rs:822:24 --- crates/core_arch/src/x86/avx512gfni.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512gfni.rs b/crates/core_arch/src/x86/avx512gfni.rs index 7b6d796237..d8ac5c29cc 100644 --- a/crates/core_arch/src/x86/avx512gfni.rs +++ b/crates/core_arch/src/x86/avx512gfni.rs @@ -819,7 +819,7 @@ mod tests { for i in 0..NUM_TEST_ENTRIES { left[i] = (i % 256) as u8; - right[i] = left[i] * 101; + right[i] = left[i].wrapping_mul(101); result[i] = mulbyte(left[i], right[i]); } From 026e67f250ab4207b1fcb5058fe2fa07c12419d4 Mon Sep 17 00:00:00 2001 From: luojia65 Date: Thu, 9 Dec 2021 13:28:07 +0800 Subject: [PATCH 3/3] Fix avx512bw build on x86-32 --- crates/core_arch/src/x86/avx512bw.rs | 182 ++++++++++++++------------- crates/core_arch/src/x86/avx512f.rs | 2 + 2 files changed, 94 insertions(+), 90 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 0363004674..e878602036 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7,6 +7,8 @@ use crate::{ #[cfg(test)] use stdarch_test::assert_instr; +use super::avx512f::{vpl, vps}; + /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_epi16&expand=30) @@ -4237,11 +4239,11 @@ pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i { let mut dst: __m512i = src; asm!( - "vmovdqu16 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu16 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4256,11 +4258,11 @@ pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *con pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { let mut dst: __m512i; asm!( - "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4275,11 +4277,11 @@ pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __ pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i { let mut dst: __m512i = src; asm!( - "vmovdqu8 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu8 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4294,11 +4296,11 @@ pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *cons pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { let mut dst: __m512i; asm!( - "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(zmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(zmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4313,11 +4315,11 @@ pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m5 pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i { let mut dst: __m256i = src; asm!( - "vmovdqu16 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu16 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4332,11 +4334,11 @@ pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *con pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { let mut dst: __m256i; asm!( - "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4351,11 +4353,11 @@ pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __ pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i { let mut dst: __m256i = src; asm!( - "vmovdqu8 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu8 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4370,11 +4372,11 @@ pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *cons pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { let mut dst: __m256i; asm!( - "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(ymm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(ymm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4389,11 +4391,11 @@ pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m2 pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { let mut dst: __m128i = src; asm!( - "vmovdqu16 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu16 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4408,11 +4410,11 @@ pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { let mut dst: __m128i; asm!( - "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu16 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4427,11 +4429,11 @@ pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128 pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { let mut dst: __m128i = src; asm!( - "vmovdqu8 {2}{{{1}}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - inout(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu8 {dst}{{{k}}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = inout(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4446,11 +4448,11 @@ pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { let mut dst: __m128i; asm!( - "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", - in(reg) mem_addr, - in(kreg) k, - out(xmm_reg) dst, - options(pure, readonly, nostack) + vpl!("vmovdqu8 {dst}{{{k}}} {{z}}"), + p = in(reg) mem_addr, + k = in(kreg) k, + dst = out(xmm_reg) dst, + options(pure, readonly, nostack) ); dst } @@ -4463,11 +4465,11 @@ pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i #[target_feature(enable = "avx512f,avx512bw")] pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) { asm!( - "vmovdqu16 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovdqu16", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -4479,11 +4481,11 @@ pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: _ #[target_feature(enable = "avx512f,avx512bw")] pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) { asm!( - "vmovdqu8 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(zmm_reg) a, - options(nostack) + vps!("vmovdqu8", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(zmm_reg) a, + options(nostack) ); } @@ -4495,11 +4497,11 @@ pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m #[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) { asm!( - "vmovdqu16 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovdqu16", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -4511,11 +4513,11 @@ pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: _ #[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) { asm!( - "vmovdqu8 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(ymm_reg) a, - options(nostack) + vps!("vmovdqu8", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(ymm_reg) a, + options(nostack) ); } @@ -4527,11 +4529,11 @@ pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m #[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { asm!( - "vmovdqu16 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovdqu16", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } @@ -4543,11 +4545,11 @@ pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m12 #[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { asm!( - "vmovdqu8 [{0}]{{{1}}}, {2}", - in(reg) mem_addr, - in(kreg) mask, - in(xmm_reg) a, - options(nostack) + vps!("vmovdqu8", "{{{mask}}}, {a}"), + p = in(reg) mem_addr, + mask = in(kreg) mask, + a = in(xmm_reg) a, + options(nostack) ); } diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 0f244de47e..f735559fed 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -33,6 +33,8 @@ macro_rules! vps { }; } +pub(crate) use {vpl, vps}; + #[cfg(test)] use stdarch_test::assert_instr;