diff --git a/.github/workflows/bench.yaml b/.github/workflows/bench.yaml new file mode 100644 index 0000000..82b0849 --- /dev/null +++ b/.github/workflows/bench.yaml @@ -0,0 +1,27 @@ +name: Run Benchmarks +on: pull_request +permissions: + deployments: write + contents: write + pull-requests: write + +jobs: + benchmark: + name: Benchmark + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@1ff72ee08e3cb84d84adba594e0a297990fc1ed3 # stable + - uses: Swatinem/rust-cache@f0deed1e0edfc6a9be95417288c0e1099b1eeec3 # v2.7.7 + - run: cargo install cargo-criterion + - run: cargo criterion --output-format bencher 2>&1 | tee output.txt + - uses: benchmark-action/github-action-benchmark@d48d326b4ca9ba73ca0cd0d59f108f9e02a381c7 # v1.20.4 + with: + name: Rust Benchmark + tool: "cargo" + output-file-path: output.txt + github-token: ${{ secrets.GITHUB_TOKEN }} + auto-push: true + alert-threshold: "120%" + comment-always: true + fail-on-alert: true diff --git a/Cargo.lock b/Cargo.lock index 0ce8ead..d05fe53 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -35,6 +35,12 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cast" version = "0.3.0" @@ -193,6 +199,17 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "half" version = "2.4.1" @@ 
-252,7 +269,7 @@ dependencies = [ "criterion", "csv", "memchr", - "memmap2", + "rand", "thiserror", ] @@ -274,15 +291,6 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "memmap2" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" -dependencies = [ - "libc", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -332,6 +340,15 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + [[package]] name = "proc-macro2" version = "1.0.93" @@ -350,6 +367,36 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + [[package]] name = "rayon" version = "1.10.0" @@ -509,6 +556,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -667,3 +720,24 @@ name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index eddbcf4..9fc265a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ thiserror = { version = "2", default-features = false } [dev-dependencies] criterion = "0.5.1" csv = "1.3.1" -memmap2 = "0.9.5" +rand = "0.8.5" [[bench]] name = "single_thread" diff --git a/benches/single_thread.rs b/benches/single_thread.rs index d990b51..04c0a43 100644 --- a/benches/single_thread.rs +++ b/benches/single_thread.rs @@ -1,20 +1,42 @@ -use std::{fs::File, hint::black_box, io::Cursor}; +use std::{hint::black_box, io::Cursor}; -use criterion::{criterion_group, criterion_main, Bencher, BenchmarkId, Criterion}; +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; use lazycsv::{Csv, CsvIterItem}; -use memchr::memchr_iter; -use memmap2::Mmap; - -fn prepare(rows: usize) -> Vec { - let f = File::open(std::env::var("INPUT").unwrap()).unwrap(); - let mmap = unsafe { Mmap::map(&f).unwrap() }; - let mut lf_iter = memchr_iter(b'\n', &mmap); - let second_lf = lf_iter.nth(1).unwrap(); - let ending_lf = lf_iter.nth(rows).unwrap(); - let range = (second_lf + 1)..ending_lf; - let mut vec = 
Vec::with_capacity(range.len()); - vec.extend_from_slice(&mmap[range]); - vec +use rand::{Rng, SeedableRng as _}; + +const CHARS: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789\","; +const ROWS: usize = 100_000; +const COLS: usize = 30; +const MIN_CHARS: usize = 3; +const MAX_CHARS: usize = 100; + +fn gen_random_str<T: Rng>(rng: &mut T) -> String { + let content: String = (0..rng.gen_range(MIN_CHARS..MAX_CHARS)) + .map(|_| CHARS[rng.gen_range(0..CHARS.len())] as char) + .collect(); + + if content.contains(',') || content.contains('"') { + format!("\"{}\"", content.replace("\"", "\"\"")) + } else { + content + } +} + +fn prepare() -> Vec<u8> { + let mut buf = Vec::with_capacity(ROWS * COLS * ((MAX_CHARS - MIN_CHARS) / 2 + MIN_CHARS)); + + let mut rng = rand::rngs::StdRng::from_seed(b"f3a90c67b3ca86afd62658c1b30f1f12".to_owned()); + for _ in 0..ROWS { + for col in 0..COLS { + buf.extend_from_slice(gen_random_str(&mut rng).as_bytes()); + if col != COLS - 1 { /* no separator after the last column */ + buf.push(b','); + } + } + buf.push(b'\n'); + } + + buf } pub fn lazy_csv(b: &mut Bencher, slice: &[u8]) { @@ -29,7 +51,7 @@ pub fn lazy_csv(b: &mut Bencher, slice: &[u8]) { pub fn lazy_csv_into_rows(b: &mut Bencher, slice: &[u8]) { b.iter(|| { - for row in Csv::new(slice).into_rows::<28>() { + for row in Csv::new(slice).into_rows::<COLS>() { for cell in row.unwrap() { black_box(cell.try_as_str().unwrap()); } @@ -47,7 +69,7 @@ pub fn lazy_csv_raw(b: &mut Bencher, slice: &[u8]) { pub fn lazy_csv_into_rows_raw(b: &mut Bencher, slice: &[u8]) { b.iter(|| { - for row in Csv::new(slice).into_rows::<28>() { + for row in Csv::new(slice).into_rows::<COLS>() { for cell in row.unwrap() { black_box(cell); } @@ -72,23 +94,21 @@ pub fn csv(b: &mut Bencher, slice: &[u8]) { fn bench_parsers(c: &mut Criterion) { let mut group = c.benchmark_group("Parsers"); - for i in [1_000, 10_000, 50_000, 100_000] { - group.bench_with_input(BenchmarkId::new("lazy_csv", i), &i, |b, i| { - lazy_csv(b, &prepare(*i)) - }); - 
group.bench_with_input(BenchmarkId::new("lazy_csv (into_rows)", i), &i, |b, i| { - lazy_csv_into_rows(b, &prepare(*i)) - }); - group.bench_with_input(BenchmarkId::new("lazy_csv (raw)", i), &i, |b, i| { - lazy_csv_raw(b, &prepare(*i)) - }); - group.bench_with_input( - BenchmarkId::new("lazy_csv (into_rows, raw)", i), - &i, - |b, i| lazy_csv_into_rows_raw(b, &prepare(*i)), - ); - group.bench_with_input(BenchmarkId::new("csv", i), &i, |b, i| csv(b, &prepare(*i))); - } + + group.sample_size(50); + + let buf = prepare(); /* built once; every benchmark borrows the same buffer */ + group.bench_with_input("lazy_csv", &buf, |b, buf| lazy_csv(b, buf)); + group.bench_with_input("lazy_csv (into_rows)", &buf, |b, buf| { + lazy_csv_into_rows(b, buf) + }); + group.bench_with_input("lazy_csv (raw)", &buf, |b, buf| { + lazy_csv_raw(b, buf) + }); + group.bench_with_input("lazy_csv (into_rows, raw)", &buf, |b, buf| { + lazy_csv_into_rows_raw(b, buf) + }); + group.bench_with_input("csv", &buf, |b, buf| csv(b, buf)); group.finish(); }