Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Verify Intel intrinsics against upstream definitions #251

Merged
merged 1 commit into from
Dec 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ matrix:
- env: DOCUMENTATION
install: true
script: ci/dox.sh
- script: cargo test --manifest-path stdsimd-verify/Cargo.toml
install: true
- env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
script: |
cargo install rustfmt-nightly --force
Expand All @@ -40,6 +42,8 @@ install:

script:
- cargo generate-lockfile
# FIXME (travis-ci/travis-ci#8920) shouldn't be necessary...
- python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"
- ci/run-docker.sh $TARGET $FEATURES

notifications:
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ categories = ["hardware-support"]
license = "MIT/Apache-2.0"

[workspace]
members = ["stdsimd-verify"]

[badges]
travis-ci = { repository = "BurntSushi/stdsimd" }
Expand Down
4 changes: 3 additions & 1 deletion ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ echo "FEATURES=${FEATURES}"
echo "OBJDUMP=${OBJDUMP}"

cargo_test() {
cmd="cargo test --all --target=$TARGET --features $FEATURES --verbose $1 -- --nocapture $2"
cmd="cargo test --target=$TARGET --features $FEATURES $1"
cmd="$cmd -p coresimd -p stdsimd"
cmd="$cmd -- $2"
$cmd
}

Expand Down
16 changes: 8 additions & 8 deletions coresimd/src/x86/i586/abm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,16 @@ pub unsafe fn _lzcnt_u64(x: u64) -> u64 {
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub unsafe fn _popcnt32(x: u32) -> u32 {
x.count_ones()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was on purpose, bit manipulation intrinsics prefer unsigned integers like the ones in std:: do. Whatever we decide to do, we should choose a guideline, and follow it consistently (many other intrinsics that take __m128i take for example u32x4 instead of i32x4).

We should open an issue for this.

pub unsafe fn _popcnt32(x: i32) -> i32 {
x.count_ones() as i32
}

/// Counts the bits that are set.
#[inline(always)]
#[target_feature = "+popcnt"]
#[cfg_attr(test, assert_instr(popcnt))]
pub unsafe fn _popcnt64(x: u64) -> u64 {
x.count_ones() as u64
pub unsafe fn _popcnt64(x: i64) -> i32 {
x.count_ones() as i32
}

#[cfg(test)]
Expand All @@ -64,21 +64,21 @@ mod tests {

#[simd_test = "lzcnt"]
unsafe fn _lzcnt_u32() {
assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
assert_eq!(abm::_lzcnt_u32(0b0101_1010), 25);
}

#[simd_test = "lzcnt"]
unsafe fn _lzcnt_u64() {
assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
assert_eq!(abm::_lzcnt_u64(0b0101_1010), 57);
}

#[simd_test = "popcnt"]
unsafe fn _popcnt32() {
assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
assert_eq!(abm::_popcnt32(0b0101_1010), 4);
}

#[simd_test = "popcnt"]
unsafe fn _popcnt64() {
assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
assert_eq!(abm::_popcnt64(0b0101_1010), 4);
}
}
165 changes: 41 additions & 124 deletions coresimd/src/x86/i586/avx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -607,77 +607,77 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
}

/// Equal (ordered, non-signaling)
pub const _CMP_EQ_OQ: u8 = 0x00;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The unsigned part was on purpose (bit flags consistently used unsigned integers), why we use 8 instead of 32 bits here, I don't know.

Whatever we do, we should do it consistently, and have a guideline for it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So IIRC as mentioned below, arguments that can only be 8-bit long, 16 bit long, etc. were made _8 (e.g. i8) on purpose. This happens on most intrinsics taking an imm8 or imm16.

The Intel intrinsics use here int/long/... very loosely, so this was part of offering better types by default.

pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
pub const _CMP_LT_OS: u8 = 0x01;
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
pub const _CMP_LE_OS: u8 = 0x02;
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
pub const _CMP_UNORD_Q: u8 = 0x03;
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
pub const _CMP_NEQ_UQ: u8 = 0x04;
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
pub const _CMP_NLT_US: u8 = 0x05;
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
pub const _CMP_NLE_US: u8 = 0x06;
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
pub const _CMP_ORD_Q: u8 = 0x07;
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
pub const _CMP_EQ_UQ: u8 = 0x08;
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
pub const _CMP_NGE_US: u8 = 0x09;
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
pub const _CMP_NGT_US: u8 = 0x0a;
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
pub const _CMP_FALSE_OQ: u8 = 0x0b;
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
pub const _CMP_NEQ_OQ: u8 = 0x0c;
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
pub const _CMP_GE_OS: u8 = 0x0d;
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
pub const _CMP_GT_OS: u8 = 0x0e;
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
pub const _CMP_TRUE_UQ: u8 = 0x0f;
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
pub const _CMP_EQ_OS: u8 = 0x10;
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
pub const _CMP_LT_OQ: u8 = 0x11;
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
pub const _CMP_LE_OQ: u8 = 0x12;
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
pub const _CMP_UNORD_S: u8 = 0x13;
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
pub const _CMP_NEQ_US: u8 = 0x14;
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
pub const _CMP_NLT_UQ: u8 = 0x15;
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
pub const _CMP_NLE_UQ: u8 = 0x16;
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
pub const _CMP_ORD_S: u8 = 0x17;
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
pub const _CMP_EQ_US: u8 = 0x18;
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
pub const _CMP_NGE_UQ: u8 = 0x19;
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
pub const _CMP_NGT_UQ: u8 = 0x1a;
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
pub const _CMP_FALSE_OS: u8 = 0x1b;
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
pub const _CMP_NEQ_OS: u8 = 0x1c;
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
pub const _CMP_GE_OQ: u8 = 0x1d;
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
pub const _CMP_GT_OQ: u8 = 0x1e;
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
pub const _CMP_TRUE_US: u8 = 0x1f;
pub const _CMP_TRUE_US: i32 = 0x1f;

/// Compare packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `imm8`.
#[inline(always)]
#[target_feature = "+avx,+sse2"]
#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
macro_rules! call {
($imm8:expr) => { vcmppd(a, b, $imm8) }
}
Expand All @@ -690,7 +690,7 @@ pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
macro_rules! call {
($imm8:expr) => { vcmppd256(a, b, $imm8) }
}
Expand All @@ -703,7 +703,7 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx,+sse"]
#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
macro_rules! call {
($imm8:expr) => { vcmpps(a, b, $imm8) }
}
Expand All @@ -716,7 +716,7 @@ pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
macro_rules! call {
($imm8:expr) => { vcmpps256(a, b, $imm8) }
}
Expand All @@ -731,7 +731,7 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx,+sse2"]
#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd
pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
macro_rules! call {
($imm8:expr) => { vcmpsd(a, b, $imm8) }
}
Expand All @@ -746,7 +746,7 @@ pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
#[inline(always)]
#[target_feature = "+avx,+sse"]
#[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss
pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
macro_rules! call {
($imm8:expr) => { vcmpss(a, b, $imm8) }
}
Expand Down Expand Up @@ -862,48 +862,6 @@ pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i {
__m128i::from(dst)
}

/// Extract an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468][https://reviews.llvm.org/D20468].
#[inline(always)]
#[target_feature = "+avx"]
// This intrinsic has no corresponding instruction.
pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i32 {
let imm8 = (imm8 & 31) as u32;
(a.extract_unchecked(imm8) as i32) & 0xFF
}

/// Extract a 16-bit integer from `a`, selected with `imm8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468][https://reviews.llvm.org/D20468].
#[inline(always)]
#[target_feature = "+avx"]
// This intrinsic has no corresponding instruction.
pub unsafe fn _mm256_extract_epi16(a: i16x16, imm8: i32) -> i32 {
let imm8 = (imm8 & 15) as u32;
(a.extract_unchecked(imm8) as i32) & 0xFFFF
}

/// Extract a 32-bit integer from `a`, selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
// This intrinsic has no corresponding instruction.
pub unsafe fn _mm256_extract_epi32(a: i32x8, imm8: i32) -> i32 {
let imm8 = (imm8 & 7) as u32;
a.extract_unchecked(imm8)
}

/// Extract a 64-bit integer from `a`, selected with `imm8`.
#[inline(always)]
#[target_feature = "+avx"]
// This intrinsic has no corresponding instruction.
pub unsafe fn _mm256_extract_epi64(a: i64x4, imm8: i32) -> i64 {
let imm8 = (imm8 & 3) as u32;
a.extract_unchecked(imm8)
}

/// Zero the contents of all XMM or YMM registers.
#[inline(always)]
#[target_feature = "+avx"]
Expand Down Expand Up @@ -1138,7 +1096,7 @@ pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))]
pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arguments that can only be 8 bit long like this imm8 were make i8 or u8 on purpose.

pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
macro_rules! call {
($imm8:expr) => { vperm2f128ps256(a, b, $imm8) }
}
Expand All @@ -1150,7 +1108,7 @@ pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
macro_rules! call {
($imm8:expr) => { vperm2f128pd256(a, b, $imm8) }
}
Expand All @@ -1163,7 +1121,7 @@ pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
pub unsafe fn _mm256_permute2f128_si256(
a: i32x8, b: i32x8, imm8: i8
a: i32x8, b: i32x8, imm8: i32
) -> i32x8 {
macro_rules! call {
($imm8:expr) => { vperm2f128si256(a, b, $imm8) }
Expand Down Expand Up @@ -3146,47 +3104,6 @@ mod tests {
assert_eq!(r, __m128i::from(e));
}

#[simd_test = "avx"]
unsafe fn _mm256_extract_epi8() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = i8x32::new(
-1, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31
);
let r1 = avx::_mm256_extract_epi8(a, 0);
let r2 = avx::_mm256_extract_epi8(a, 35);
assert_eq!(r1, 0xFF);
assert_eq!(r2, 3);
}

#[simd_test = "avx"]
unsafe fn _mm256_extract_epi16() {
let a =
i16x16::new(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let r1 = avx::_mm256_extract_epi16(a, 0);
let r2 = avx::_mm256_extract_epi16(a, 19);
assert_eq!(r1, 0xFFFF);
assert_eq!(r2, 3);
}

#[simd_test = "avx"]
unsafe fn _mm256_extract_epi32() {
let a = i32x8::new(-1, 1, 2, 3, 4, 5, 6, 7);
let r1 = avx::_mm256_extract_epi32(a, 0);
let r2 = avx::_mm256_extract_epi32(a, 11);
assert_eq!(r1, -1);
assert_eq!(r2, 3);
}

#[simd_test = "avx"]
unsafe fn _mm256_extract_epi64() {
let a = i64x4::new(0, 1, 2, 3);
let r = avx::_mm256_extract_epi64(a, 3);
assert_eq!(r, 3);
}

#[simd_test = "avx"]
unsafe fn _mm256_zeroall() {
avx::_mm256_zeroall();
Expand Down
Loading