This repository was archived by the owner on Nov 6, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1.7k
optimization of U256 #515
Merged
Merged
optimization of U256 #515
Changes from 8 commits
Commits
Show all changes
24 commits
Select commit
Hold shift + click to select a range
dd8652d
u256 to inline assembly opt
NikVolf 476bb85
r m/r + setc/xor
NikVolf 7821505
sub x64 optimize
NikVolf ccaa194
mul, bench showtime
NikVolf 0794049
fix naughty macros
NikVolf 370d901
Merge branch 'master' into bigint-opt
NikVolf da69ea5
inline
NikVolf ae76a50
inline test
NikVolf f17d893
fixed mul, fixed register pref
NikVolf 5467b06
fix bench iter
NikVolf fb5779a
specific feature for asm opt
NikVolf 7525ff2
removed artefact cls/pushf/popf
NikVolf 864e754
overflowing_sub in sub
NikVolf 5d22ad3
counter jump better
NikVolf 2ee4a0c
mistake of ne/jcxz
NikVolf 600859e
[ci skip] flush
NikVolf e946e2a
epic mul overflow bug
NikVolf 4b0ec64
random init for benches
NikVolf f29417e
allow dead code for macros expansion
NikVolf e95538f
[ci skip] style fixes, multipart add test
NikVolf 228e3fe
[ci skip] multipart sub test
NikVolf 3858a20
[ci skip] mul multipart tests
NikVolf 023c623
mul overflow multipart test
NikVolf 5013c4d
naughty overflow bug fixed
NikVolf File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
// Copyright 2015, 2016 Ethcore (UK) Ltd. | ||
// This file is part of Parity. | ||
|
||
// Parity is free software: you can redistribute it and/or modify | ||
// it under the terms of the GNU General Public License as published by | ||
// the Free Software Foundation, either version 3 of the License, or | ||
// (at your option) any later version. | ||
|
||
// Parity is distributed in the hope that it will be useful, | ||
// but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
// GNU General Public License for more details. | ||
|
||
// You should have received a copy of the GNU General Public License | ||
// along with Parity. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
//! benchmarking for bigint (U256 / U128 arithmetic) | ||
//! should be started with: | ||
//! ```bash | ||
//! multirust run nightly cargo bench | ||
//! ``` | ||
|
||
#![feature(test)] | ||
#![feature(asm)] | ||
|
||
extern crate test; | ||
extern crate ethcore_util; | ||
|
||
use test::{Bencher, black_box}; | ||
use ethcore_util::uint::*; | ||
|
||
// Benchmarks U256::overflowing_add by folding 10000 additions into one accumulator. | ||
#[bench] | ||
fn u256_add(b: &mut Bencher) { | ||
b.iter(|| { | ||
// black_box hides the iteration count from the optimizer so the loop is not const-folded. | ||
let n = black_box(10000); | ||
// NOTE(review): U256::from(new) runs on every step, so its cost is part of the measurement. | ||
(0..n).fold(U256::from(1234599u64), |old, new| { old.overflowing_add(U256::from(new)).0 }) | ||
}); | ||
} | ||
|
||
|
||
// Benchmarks U256::overflowing_sub by folding 10000 subtractions, starting from u64::MAX. | ||
#[bench] | ||
fn u256_sub(b: &mut Bencher) { | ||
b.iter(|| { | ||
// black_box hides the iteration count from the optimizer so the loop is not const-folded. | ||
let n = black_box(10000); | ||
// NOTE(review): U256::from(new) runs on every step, so its cost is part of the measurement. | ||
(0..n).fold(U256::from(::std::u64::MAX), |old, new| { old.overflowing_sub(U256::from(new)).0 }) | ||
}); | ||
} | ||
|
||
// Benchmarks U256::overflowing_mul by folding 10000 multiplications into one accumulator. | ||
#[bench] | ||
fn u256_mul(b: &mut Bencher) { | ||
b.iter(|| { | ||
// black_box hides the iteration count from the optimizer so the loop is not const-folded. | ||
let n = black_box(10000); | ||
// NOTE(review): the range starts at 0, so the first step multiplies the accumulator by zero | ||
// and every later product is taken on a zero value — confirm this is the intended workload. | ||
(0..n).fold(U256([12345u64, 0u64, 0u64, 0u64]), |old, new| { old.overflowing_mul(U256::from(new)).0 }) | ||
}); | ||
} | ||
|
||
|
||
// Benchmarks U128::overflowing_mul by folding 10000 multiplications into one accumulator. | ||
#[bench] | ||
fn u128_mul(b: &mut Bencher) { | ||
b.iter(|| { | ||
// black_box hides the iteration count from the optimizer so the loop is not const-folded. | ||
let n = black_box(10000); | ||
// NOTE(review): the range starts at 0, so the first step multiplies the accumulator by zero | ||
// and every later product is taken on a zero value — confirm this is the intended workload. | ||
(0..n).fold(U128([12345u64, 0u64]), |old, new| { old.overflowing_mul(U128::from(new)).0 }) | ||
}); | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
|
||
#![warn(missing_docs)] | ||
#![cfg_attr(feature="dev", feature(plugin))] | ||
#![cfg_attr(feature="dev", feature(asm))] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
#![cfg_attr(feature="dev", plugin(clippy))] | ||
|
||
// Clippy settings | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,6 +51,232 @@ macro_rules! impl_map_from { | |
} | ||
} | ||
|
||
// Portable build (no "dev" feature or not x86_64): overflowing addition for every | ||
// integer type simply forwards to the generic uint_overflowing_add_reg! implementation. | ||
#[cfg(not(all(feature="dev", target_arch = "x86_64")))] | ||
macro_rules! uint_overflowing_add { | ||
($name:ident, $n_words:expr, $self_expr: expr, $other: expr) => ({ | ||
uint_overflowing_add_reg!($name, $n_words, $self_expr, $other) | ||
}) | ||
} | ||
|
||
// Generic overflowing addition over $n_words 64-bit limbs. | ||
// Expands to an expression of type ($name, bool): the wrapped sum and an overflow flag. | ||
macro_rules! uint_overflowing_add_reg { | ||
($name:ident, $n_words:expr, $self_expr: expr, $other: expr) => ({ | ||
let $name(ref me) = $self_expr; | ||
let $name(ref you) = $other; | ||
let mut ret = [0u64; $n_words]; | ||
let mut carry = [0u64; $n_words]; | ||
let mut b_carry = false; | ||
let mut overflow = false; | ||
|
||
// Add limb by limb with wrapping; a sum smaller than its left operand means that limb carried. | ||
for i in 0..$n_words { | ||
ret[i] = me[i].wrapping_add(you[i]); | ||
|
||
if ret[i] < me[i] { | ||
if i < $n_words - 1 { | ||
// Record the carry for the next limb instead of propagating it right away. | ||
carry[i + 1] = 1; | ||
b_carry = true; | ||
} else { | ||
// A carry out of the top limb overflows the whole value. | ||
overflow = true; | ||
} | ||
} | ||
} | ||
// Apply the recorded carries with a second addition; that addition can itself carry or overflow. | ||
if b_carry { | ||
let ret = overflowing!($name(ret).overflowing_add($name(carry)), overflow); | ||
(ret, overflow) | ||
} else { | ||
($name(ret), overflow) | ||
} | ||
}) | ||
} | ||
|
||
|
||
// x86_64 fast path for overflowing addition, enabled with the "dev" feature. | ||
// The U256 arm adds the four 64-bit limbs with a single add/adc carry chain and reads the | ||
// final carry with setc; every other type falls back to uint_overflowing_add_reg!. | ||
// Expands to an expression of type ($name, bool): the wrapped sum and an overflow flag. | ||
#[cfg(all(feature="dev", target_arch = "x86_64"))] | ||
macro_rules! uint_overflowing_add { | ||
(U256, $n_words: expr, $self_expr: expr, $other: expr) => ({ | ||
// All four limbs of `result` are asm outputs, so the uninitialized contents are never read. | ||
let mut result: [u64; 4] = unsafe { mem::uninitialized() }; | ||
let self_t: &[u64; 4] = unsafe { &mem::transmute($self_expr) }; | ||
let other_t: &[u64; 4] = unsafe { &mem::transmute($other) }; | ||
|
||
let overflow: u8; | ||
// The lowest limb must use plain `add`: the carry flag is undefined when the asm block is | ||
// entered, so starting the chain with `adc` could add a stray 1 to the result. | ||
unsafe { | ||
asm!(" | ||
add $9, %r8 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not use auto-assigned registers? |
||
adc $10, %r9 | ||
adc $11, %r10 | ||
adc $12, %r11 | ||
setc %al | ||
" | ||
: "={r8}"(result[0]), "={r9}"(result[1]), "={r10}"(result[2]), "={r11}"(result[3]), "={al}"(overflow) | ||
: "{r8}"(self_t[0]), "{r9}"(self_t[1]), "{r10}"(self_t[2]), "{r11}"(self_t[3]), | ||
"m"(other_t[0]), "m"(other_t[1]), "m"(other_t[2]), "m"(other_t[3]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Input should be allowed to be a register in case it is inlined and compiler already has the value in the register |
||
: | ||
: | ||
); | ||
} | ||
(U256(result), overflow != 0) | ||
}); | ||
($name:ident, $n_words:expr, $self_expr: expr, $other: expr) => ( | ||
uint_overflowing_add_reg!($name, $n_words, $self_expr, $other) | ||
) | ||
} | ||
|
||
// Portable build (no "dev" feature or not x86_64): overflowing subtraction for every type. | ||
// Computes self + (!other + 1), i.e. adds the two's complement of $other, and reports | ||
// overflow (borrow) exactly when $self_expr < $other. | ||
#[cfg(not(all(feature="dev", target_arch = "x86_64")))] | ||
macro_rules! uint_overflowing_sub { | ||
($name:ident, $n_words: expr, $self_expr: expr, $other: expr) => ({ | ||
let res = overflowing!((!$other).overflowing_add(From::from(1u64))); | ||
let res = overflowing!($self_expr.overflowing_add(res)); | ||
(res, $self_expr < $other) | ||
}) | ||
} | ||
|
||
// x86_64 fast path for overflowing subtraction, enabled with the "dev" feature. | ||
// The U256 arm subtracts the four 64-bit limbs with a single sub/sbb borrow chain and reads | ||
// the final borrow with setb; every other type uses the two's-complement addition fallback. | ||
// Expands to an expression of type ($name, bool): the wrapped difference and an overflow flag. | ||
#[cfg(all(feature="dev", target_arch = "x86_64"))] | ||
macro_rules! uint_overflowing_sub { | ||
(U256, $n_words: expr, $self_expr: expr, $other: expr) => ({ | ||
// All four limbs of `result` are asm outputs, so the uninitialized contents are never read. | ||
let mut result: [u64; 4] = unsafe { mem::uninitialized() }; | ||
let self_t: &[u64; 4] = unsafe { &mem::transmute($self_expr) }; | ||
let other_t: &[u64; 4] = unsafe { &mem::transmute($other) }; | ||
|
||
let overflow: u8; | ||
// Operands: $0..$3 are the compiler-assigned result registers, pre-loaded with the limbs | ||
// of self through the tied "0".."3" inputs; $9..$12 are the limbs of other. | ||
// The template must name $0..$3 rather than fixed registers, because the "=r" constraints | ||
// let the compiler choose them. The lowest limb uses plain `sub` because the carry flag | ||
// is undefined when the asm block is entered. | ||
unsafe { | ||
asm!(" | ||
sub $9, $0 | ||
sbb $10, $1 | ||
sbb $11, $2 | ||
sbb $12, $3 | ||
setb %al" | ||
: "=r"(result[0]), "=r"(result[1]), "=r"(result[2]), "=r"(result[3]), "={al}"(overflow) | ||
: "0"(self_t[0]), "1"(self_t[1]), "2"(self_t[2]), "3"(self_t[3]), "mr"(other_t[0]), "mr"(other_t[1]), "mr"(other_t[2]), "mr"(other_t[3]) | ||
: | ||
: | ||
); | ||
} | ||
(U256(result), overflow != 0) | ||
}); | ||
($name:ident, $n_words: expr, $self_expr: expr, $other: expr) => ({ | ||
let res = overflowing!((!$other).overflowing_add(From::from(1u64))); | ||
let res = overflowing!($self_expr.overflowing_add(res)); | ||
(res, $self_expr < $other) | ||
}) | ||
} | ||
|
||
// x86_64 path for overflowing multiplication, enabled with the "dev" feature. | ||
// The U256 arm runs a hand-written sequence of mulq/adc steps over the 64-bit limbs; | ||
// every other type falls back to uint_overflowing_mul_reg!. | ||
// Expands to an expression of type ($name, bool): the product limbs and an overflow flag. | ||
// | ||
// Operand map (see the constraint lists below): $0..$3 = result limbs in r8..r11, | ||
// $4 = overflow accumulator in rcx, $5..$8 = limbs of self, $9..$12 = limbs of other. | ||
// rax and rdx are declared as clobbers because mulq writes both. | ||
// | ||
// NOTE(review): pushf/popf bracket each mulq, presumably to carry the adc chain across the | ||
// multiply; they also move the stack pointer inside inline asm — confirm this is safe here. | ||
// NOTE(review): the first `adc` runs right after the first mulq with no explicit carry setup, | ||
// and several steps add a limb of self into rdx before multiplying again — verify the | ||
// sequence against a reference multiplication before relying on it. | ||
// NOTE(review): `overflow` is a u8 bound to the 64-bit rcx register — confirm the width. | ||
#[cfg(all(feature="dev", target_arch = "x86_64"))] | ||
macro_rules! uint_overflowing_mul { | ||
(U256, $n_words: expr, $self_expr: expr, $other: expr) => ({ | ||
let mut result: [u64; 4] = unsafe { mem::uninitialized() }; | ||
let self_t: &[u64; 4] = unsafe { &mem::transmute($self_expr) }; | ||
let other_t: &[u64; 4] = unsafe { &mem::transmute($other) }; | ||
|
||
let overflow: u8; | ||
unsafe { | ||
asm!(" | ||
mov $5, %rax | ||
mulq $9 | ||
mov %rax, %r8 | ||
adc $6, %rdx | ||
pushf | ||
|
||
mov %rdx, %rax | ||
mulq $9 | ||
popf | ||
adc $$0, %rax | ||
adc $7, %rdx | ||
pushf | ||
mov %rax, %r9 | ||
|
||
|
||
mov %rdx, %rax | ||
mulq $9 | ||
popf | ||
adc $$0, %rax | ||
adc $8, %rdx | ||
pushf | ||
mov %rax, %r10 | ||
|
||
mov %rdx, %rax | ||
mulq $9 | ||
popf | ||
adc $$0, %rax | ||
mov %rax, %r11 | ||
mov %rdx, %rcx | ||
|
||
mov $5, %rax | ||
mulq $10 | ||
adc %rax, %r9 | ||
adc $6, %rdx | ||
pushf | ||
|
||
mov %rdx, %rax | ||
mulq $10 | ||
popf | ||
adc %rax, %r10 | ||
adc $7, %rdx | ||
pushf | ||
|
||
mov %rdx, %rax | ||
mulq $10 | ||
popf | ||
adc %rax, %r11 | ||
pushf | ||
or %rax, %rcx | ||
|
||
mov $5, %rax | ||
mulq $11 | ||
popf | ||
adc %rax, %r10 | ||
adc $6, %rdx | ||
pushf | ||
|
||
mov %rdx, %rax | ||
mulq $11 | ||
popf | ||
adc %rax, %r11 | ||
pushf | ||
or %rdx, %rcx | ||
|
||
mov $5, %rax | ||
mulq $12 | ||
popf | ||
adc %rax, %r11 | ||
or %rdx, %rcx | ||
" | ||
: /* $0 */ "={r8}"(result[0]), /* $1 */ "={r9}"(result[1]), /* $2 */ "={r10}"(result[2]), | ||
/* $3 */ "={r11}"(result[3]), /* $4 */ "={rcx}"(overflow) | ||
|
||
: /* $5 */ "m"(self_t[0]), /* $6 */ "m"(self_t[1]), /* $7 */ "m"(self_t[2]), | ||
/* $8 */ "m"(self_t[3]), /* $9 */ "m"(other_t[0]), /* $10 */ "m"(other_t[1]), | ||
/* $11 */ "m"(other_t[2]), /* $12 */ "m"(other_t[3]) | ||
: "rax", "rdx" | ||
: | ||
|
||
); | ||
} | ||
// Any non-zero bits collected in rcx mean high product bits did not fit in four limbs. | ||
(U256(result), overflow > 0) | ||
}); | ||
($name:ident, $n_words:expr, $self_expr: expr, $other: expr) => ( | ||
uint_overflowing_mul_reg!($name, $n_words, $self_expr, $other) | ||
) | ||
} | ||
|
||
// Portable build (no "dev" feature or not x86_64): overflowing multiplication for every | ||
// integer type simply forwards to the generic uint_overflowing_mul_reg! implementation. | ||
#[cfg(not(all(feature="dev", target_arch = "x86_64")))] | ||
macro_rules! uint_overflowing_mul { | ||
($name:ident, $n_words: expr, $self_expr: expr, $other: expr) => ({ | ||
uint_overflowing_mul_reg!($name, $n_words, $self_expr, $other) | ||
}) | ||
} | ||
|
||
// Generic overflowing multiplication built from 32-bit partial products. | ||
// Expands to an expression of type ($name, bool): the accumulated product and an overflow flag. | ||
// Presumably overflowing!(op, flag) folds each step's overflow into `overflow`; its body is | ||
// not fully visible here — confirm. | ||
macro_rules! uint_overflowing_mul_reg { | ||
($name:ident, $n_words: expr, $self_expr: expr, $other: expr) => ({ | ||
let mut res = $name::from(0u64); | ||
let mut overflow = false; | ||
// TODO: be more efficient about this | ||
// One pass per 32-bit chunk of $other (two chunks per 64-bit limb). | ||
for i in 0..(2 * $n_words) { | ||
// Multiply self by the i-th 32-bit chunk, shift the partial product into place, accumulate. | ||
let v = overflowing!($self_expr.overflowing_mul_u32(($other >> (32 * i)).low_u32()), overflow); | ||
let res2 = overflowing!(v.overflowing_shl(32 * i as u32), overflow); | ||
res = overflowing!(res.overflowing_add(res2), overflow); | ||
} | ||
(res, overflow) | ||
}) | ||
} | ||
|
||
macro_rules! overflowing { | ||
($op: expr, $overflow: expr) => ( | ||
{ | ||
|
@@ -297,50 +523,20 @@ macro_rules! construct_uint { | |
(res, overflow) | ||
} | ||
|
||
/// Optimized instructions | ||
#[inline(always)] | ||
fn overflowing_add(self, other: $name) -> ($name, bool) { | ||
let $name(ref me) = self; | ||
let $name(ref you) = other; | ||
let mut ret = [0u64; $n_words]; | ||
let mut carry = [0u64; $n_words]; | ||
let mut b_carry = false; | ||
let mut overflow = false; | ||
|
||
for i in 0..$n_words { | ||
ret[i] = me[i].wrapping_add(you[i]); | ||
|
||
if ret[i] < me[i] { | ||
if i < $n_words - 1 { | ||
carry[i + 1] = 1; | ||
b_carry = true; | ||
} else { | ||
overflow = true; | ||
} | ||
} | ||
} | ||
if b_carry { | ||
let ret = overflowing!($name(ret).overflowing_add($name(carry)), overflow); | ||
(ret, overflow) | ||
} else { | ||
($name(ret), overflow) | ||
} | ||
// The implementation lives in uint_overflowing_add!, which cfg selects between the | ||
// x86_64 asm version and the portable one. | ||
uint_overflowing_add!($name, $n_words, self, other) | ||
} | ||
|
||
#[inline(always)] | ||
fn overflowing_sub(self, other: $name) -> ($name, bool) { | ||
let res = overflowing!((!other).overflowing_add(From::from(1u64))); | ||
let res = overflowing!(self.overflowing_add(res)); | ||
(res, self < other) | ||
// The implementation lives in uint_overflowing_sub!, which cfg selects between the | ||
// x86_64 asm version and the portable one. | ||
uint_overflowing_sub!($name, $n_words, self, other) | ||
} | ||
|
||
#[inline(always)] | ||
fn overflowing_mul(self, other: $name) -> ($name, bool) { | ||
let mut res = $name::from(0u64); | ||
let mut overflow = false; | ||
// TODO: be more efficient about this | ||
for i in 0..(2 * $n_words) { | ||
let v = overflowing!(self.overflowing_mul_u32((other >> (32 * i)).low_u32()), overflow); | ||
let res2 = overflowing!(v.overflowing_shl(32 * i as u32), overflow); | ||
res = overflowing!(res.overflowing_add(res2), overflow); | ||
} | ||
(res, overflow) | ||
// The implementation lives in uint_overflowing_mul!, which cfg selects between the | ||
// x86_64 asm version and the portable one. | ||
uint_overflowing_mul!($name, $n_words, self, other) | ||
} | ||
|
||
fn overflowing_div(self, other: $name) -> ($name, bool) { | ||
|
@@ -1171,8 +1367,6 @@ mod tests { | |
); | ||
} | ||
|
||
|
||
|
||
#[test] | ||
#[should_panic] | ||
pub fn uint256_mul_overflow_panic() { | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The benchmark is not very accurate, because it calls `U256::from` on each iteration.