Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

polyval: match ideal assembly #44

Merged
merged 1 commit into from
Dec 21, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 44 additions & 54 deletions polyval/src/field/pclmulqdq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,30 +43,53 @@ impl From<M128i> for Block {
impl Add for M128i {
type Output = Self;

/// Adds two POLYVAL field elements.
fn add(self, rhs: Self) -> Self {
M128i(unsafe { xor(self.0, rhs.0) })
}
}

/// XOR is used to add two POLYVAL field elements
#[target_feature(enable = "sse2", enable = "sse4.1")]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
_mm_xor_si128(a, b)
}

impl Mul for M128i {
type Output = Self;

/// Computes carryless POLYVAL multiplication over GF(2^128).
fn mul(self, rhs: Self) -> Self {
unsafe {
let t1 = pclmulqdq(self.0, rhs.0, 0x00);
let t2 = pclmulqdq(self.0, rhs.0, 0x01);
let t3 = pclmulqdq(self.0, rhs.0, 0x10);
let t4 = pclmulqdq(self.0, rhs.0, 0x11);
let t5 = xor(t2, t3);
let t6 = xor(t4, psrldq8(t5));
let t7 = xor(t1, pslldq8(t5));
M128i(xor(t6, reduce(t7)))
}
unsafe { M128i(clmul(self.0, rhs.0)) }
}
}

/// Computes carryless POLYVAL multiplication over GF(2^128).
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
unsafe fn clmul(lhs: __m128i, rhs: __m128i) -> __m128i {
// pclmulqdq
let t1 = _mm_clmulepi64_si128(lhs, rhs, 0x00);

// pclmulqdq
let t2 = _mm_clmulepi64_si128(lhs, rhs, 0x01);

// pclmulqdq
let t3 = _mm_clmulepi64_si128(lhs, rhs, 0x10);

// pclmulqdq
let t4 = _mm_clmulepi64_si128(lhs, rhs, 0x11);

// pxor
let t5 = _mm_xor_si128(t2, t3);

// psrldq, pxor
let t6 = _mm_xor_si128(t4, _mm_bsrli_si128(t5, 8));

// pslldq, pxor
let t7 = _mm_xor_si128(t1, _mm_bslli_si128(t5, 8));

// reduce, pxor
_mm_xor_si128(t6, reduce(t7))
}

/// Mask value used when performing Montgomery fast reduction.
/// This corresponds to POLYVAL's polynomial with the highest bit unset.
const MASK: u128 = 1 << 127 | 1 << 126 | 1 << 121 | 1;
Expand All @@ -75,55 +98,22 @@ const MASK: u128 = 1 << 127 | 1 << 126 | 1 << 121 | 1;
/// Algorithm 4: "Montgomery reduction"
///
/// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
unsafe fn reduce(x: __m128i) -> __m128i {
// `_mm_loadu_si128` performs an unaligned load
// (`u128` is not necessarily aligned to 16-bytes)
#[allow(clippy::cast_ptr_alignment)]
let mask = _mm_loadu_si128(&MASK as *const u128 as *const __m128i);
let a = pclmulqdq(mask, x, 0x01);
let b = xor(pshufd(x), a);
let c = pclmulqdq(mask, b, 0x01);
xor(pshufd(b), c)
}

#[target_feature(enable = "sse2", enable = "sse4.1")]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
_mm_xor_si128(a, b)
}

#[target_feature(enable = "sse2", enable = "sse4.1")]
unsafe fn pshufd(a: __m128i) -> __m128i {
_mm_shuffle_epi32(a, 0x4e)
}

#[target_feature(enable = "sse2", enable = "sse4.1")]
unsafe fn pslldq8(a: __m128i) -> __m128i {
_mm_bslli_si128(a, 8)
}
// pclmulqdq
let a = _mm_clmulepi64_si128(mask, x, 0x01);

#[target_feature(enable = "sse2", enable = "sse4.1")]
unsafe fn psrldq8(a: __m128i) -> __m128i {
_mm_bsrli_si128(a, 8)
}
// pshufd, pxor
let b = _mm_xor_si128(_mm_shuffle_epi32(x, 0x4e), a);

// TODO(tarcieri): _mm256_clmulepi64_epi128 (vpclmulqdq)
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
unsafe fn pclmulqdq(a: __m128i, b: __m128i, imm: u8) -> __m128i {
// The `imm` value passed to `_mm_clmulepi64_si128` needs to be a literal
// value since it ends up being encoded into the CPU instruction.
match imm {
// Low-Low: `clmul(a[0..8], b[0..8])` (PCLMULLQLQDQ)
0x00 => _mm_clmulepi64_si128(a, b, 0x00),
// pclmulqdq
let c = _mm_clmulepi64_si128(mask, b, 0x01);

// High-Low: `clmul(a[8..16], b[0..8])` (PCLMULHQLQDQ)
0x01 => _mm_clmulepi64_si128(a, b, 0x01),

// Low-High: `clmul(a[0..8], b[8..16])` (PCLMULLQHQDQ)
0x10 => _mm_clmulepi64_si128(a, b, 0x10),

// High-High: `clmul(a[8..16], b[8..16])` (PCLMULHQHQDQ)
0x11 => _mm_clmulepi64_si128(a, b, 0x11),

_ => unreachable!(),
}
// pshufd, pxor
_mm_xor_si128(_mm_shuffle_epi32(b, 0x4e), c)
}