Skip to content

Commit

Permalink
Support lz4 block format (#53)
Browse files Browse the repository at this point in the history
  • Loading branch information
milesgranger authored Apr 14, 2021
1 parent cb0dd0e commit 484cddd
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "cramjam"
version = "2.2.0"
version = "2.3.0"
authors = ["Miles Granger <[email protected]>"]
edition = "2018"
license-file = "LICENSE"
Expand Down
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
BASE_BENCH_CMD = python -m pytest -v --benchmark-sort name --benchmark-only benchmarks/ -k

test:
python -m pytest -v --ignore benchmarks
python -m pytest tests -v --ignore benchmarks

bench:
python -m pytest -v --benchmark-only --benchmark-sort name benchmarks/
Expand All @@ -18,6 +18,9 @@ bench-snappy-compress-into:
bench-lz4:
$(BASE_BENCH_CMD) lz4

bench-lz4-block:
$(BASE_BENCH_CMD) lz4_block

bench-gzip:
$(BASE_BENCH_CMD) gzip

Expand Down
39 changes: 39 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,45 @@ test_lz4[urls.10K-python-lz4] 10,320.9730 (42.07)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
```

#### LZ4 _block_ format

`make bench-lz4-block`

```bash
---------------------------------------------------------------------------------------------------------------- benchmark: 28 tests -----------------------------------------------------------------------------------------------------------------
Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_lz4_block[Mark.Twain-Tom.Sawyer.txt-cramjam] 40.9340 (1.37) 184.3430 (1.19) 42.9931 (1.35) 4.6601 (1.0) 41.5510 (1.37) 0.3530 (1.92) 1034;1427 23,259.5474 (0.74) 9422 1
test_lz4_block[Mark.Twain-Tom.Sawyer.txt-python-lz4] 38.4370 (1.28) 154.2830 (1.0) 40.6069 (1.28) 5.5279 (1.19) 38.9110 (1.28) 0.4095 (2.23) 1107;2431 24,626.3528 (0.78) 10544 1
test_lz4_block[alice29.txt-cramjam] 472.0840 (15.77) 713.6530 (4.63) 494.8957 (15.56) 25.8485 (5.55) 493.4380 (16.27) 31.5625 (171.54) 190;53 2,020.6276 (0.06) 1839 1
test_lz4_block[alice29.txt-python-lz4] 513.4790 (17.15) 857.2650 (5.56) 537.8642 (16.91) 27.2345 (5.84) 535.3320 (17.65) 34.5312 (187.68) 215;47 1,859.2053 (0.06) 1863 1
test_lz4_block[asyoulik.txt-cramjam] 380.3780 (12.71) 635.7260 (4.12) 401.1980 (12.61) 28.4094 (6.10) 393.5930 (12.98) 23.0175 (125.10) 193;141 2,492.5350 (0.08) 2096 1
test_lz4_block[asyoulik.txt-python-lz4] 416.9540 (13.93) 734.8450 (4.76) 437.0448 (13.74) 23.5893 (5.06) 437.7690 (14.44) 24.1480 (131.24) 251;108 2,288.0950 (0.07) 2290 1
test_lz4_block[fifty-four-mb-random-cramjam] 19,617.3530 (655.40) 20,615.6860 (133.62) 19,903.2510 (625.72) 196.4017 (42.15) 19,867.1210 (655.12) 183.0970 (995.12) 14;3 50.2430 (0.00) 50 1
test_lz4_block[fifty-four-mb-random-python-lz4] 115,706.2050 (>1000.0) 116,848.0440 (757.36) 116,398.5400 (>1000.0) 421.1026 (90.36) 116,555.1350 (>1000.0) 572.2738 (>1000.0) 3;0 8.5912 (0.00) 9 1
test_lz4_block[fifty-four-mb-repeating-cramjam] 16,384.0190 (547.37) 34,818.2670 (225.68) 17,412.4441 (547.42) 3,003.0026 (644.41) 16,854.9620 (555.79) 258.3560 (>1000.0) 1;4 57.4302 (0.00) 36 1
test_lz4_block[fifty-four-mb-repeating-python-lz4] 58,464.7160 (>1000.0) 61,384.1530 (397.87) 59,202.9512 (>1000.0) 776.3388 (166.59) 59,151.1860 (>1000.0) 644.4060 (>1000.0) 2;2 16.8910 (0.00) 17 1
test_lz4_block[fireworks.jpeg-cramjam] 29.9320 (1.0) 161.8750 (1.05) 31.8084 (1.0) 5.0188 (1.08) 30.3260 (1.0) 0.1840 (1.0) 1946;3770 31,438.2794 (1.0) 18014 1
test_lz4_block[fireworks.jpeg-python-lz4] 34.4750 (1.15) 169.8460 (1.10) 36.7064 (1.15) 5.2782 (1.13) 35.1650 (1.16) 0.8880 (4.83) 1940;2096 27,243.1909 (0.87) 17242 1
test_lz4_block[geo.protodata-cramjam] 140.1360 (4.68) 291.7290 (1.89) 147.5534 (4.64) 12.0341 (2.58) 141.8530 (4.68) 9.3140 (50.62) 990;375 6,777.2052 (0.22) 5938 1
test_lz4_block[geo.protodata-python-lz4] 144.4831 (4.83) 350.4770 (2.27) 152.2133 (4.79) 12.8940 (2.77) 146.0120 (4.81) 10.0460 (54.60) 1043;347 6,569.7283 (0.21) 5972 1
test_lz4_block[html-cramjam] 142.9390 (4.78) 277.8630 (1.80) 149.9061 (4.71) 11.7107 (2.51) 144.4170 (4.76) 9.1133 (49.53) 846;321 6,670.8442 (0.21) 5319 1
test_lz4_block[html-python-lz4] 147.3250 (4.92) 279.1871 (1.81) 154.4610 (4.86) 11.5615 (2.48) 149.0470 (4.91) 9.1137 (49.53) 994;375 6,474.1270 (0.21) 6013 1
test_lz4_block[html_x_4-cramjam] 600.2710 (20.05) 941.8120 (6.10) 628.8270 (19.77) 29.0365 (6.23) 623.0760 (20.55) 39.2540 (213.34) 186;28 1,590.2625 (0.05) 1491 1
test_lz4_block[html_x_4-python-lz4] 625.6220 (20.90) 930.9070 (6.03) 655.2924 (20.60) 30.3427 (6.51) 648.3260 (21.38) 39.3870 (214.07) 196;40 1,526.0362 (0.05) 1448 1
test_lz4_block[kppkn.gtb-cramjam] 446.5680 (14.92) 729.3510 (4.73) 466.9985 (14.68) 24.7114 (5.30) 467.1435 (15.40) 27.8070 (151.13) 217;64 2,141.3345 (0.07) 2110 1
test_lz4_block[kppkn.gtb-python-lz4] 457.6420 (15.29) 673.1950 (4.36) 477.8109 (15.02) 23.3480 (5.01) 473.5590 (15.62) 26.9885 (146.68) 239;72 2,092.8781 (0.07) 2073 1
test_lz4_block[lcet10.txt-cramjam] 1,300.3600 (43.44) 1,639.6500 (10.63) 1,358.7622 (42.72) 44.2987 (9.51) 1,350.1540 (44.52) 57.7395 (313.81) 228;11 735.9640 (0.02) 720 1
test_lz4_block[lcet10.txt-python-lz4] 1,410.9100 (47.14) 1,763.4770 (11.43) 1,474.0569 (46.34) 48.0336 (10.31) 1,465.7810 (48.33) 60.8988 (330.98) 176;13 678.3999 (0.02) 621 1
test_lz4_block[paper-100k.pdf-cramjam] 75.7430 (2.53) 213.3430 (1.38) 80.0105 (2.52) 8.9087 (1.91) 76.5580 (2.52) 2.2090 (12.01) 848;1656 12,498.3609 (0.40) 8862 1
test_lz4_block[paper-100k.pdf-python-lz4] 82.7410 (2.76) 217.1710 (1.41) 87.4508 (2.75) 9.4264 (2.02) 83.8570 (2.77) 0.8590 (4.67) 905;2221 11,435.0004 (0.36) 9246 1
test_lz4_block[plrabn12.txt-cramjam] 1,527.4300 (51.03) 1,917.3070 (12.43) 1,594.6245 (50.13) 55.9814 (12.01) 1,584.3080 (52.24) 64.3090 (349.52) 125;16 627.1069 (0.02) 511 1
test_lz4_block[plrabn12.txt-python-lz4] 2,011.2020 (67.19) 2,540.4010 (16.47) 2,123.5693 (66.76) 93.4244 (20.05) 2,098.7020 (69.20) 111.1333 (604.00) 105;22 470.9053 (0.01) 459 1
test_lz4_block[urls.10K-cramjam] 1,511.1370 (50.49) 2,008.8290 (13.02) 1,591.5999 (50.04) 65.2772 (14.01) 1,578.4930 (52.05) 61.4250 (333.84) 120;29 628.2986 (0.02) 511 1
test_lz4_block[urls.10K-python-lz4] 2,001.9611 (66.88) 2,490.6120 (16.14) 2,103.2583 (66.12) 66.4167 (14.25) 2,090.2720 (68.93) 78.7600 (428.06) 111;12 475.4528 (0.02) 427 1
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
```

#### Brotli

`make bench-brotli`
Expand Down
21 changes: 21 additions & 0 deletions benchmarks/test_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,27 @@ def test_lz4(benchmark, file, use_cramjam: bool):
)


@pytest.mark.parametrize(
    "use_cramjam",
    (True, False),
    ids=lambda flag: "cramjam" if flag else "python-lz4",
)
@pytest.mark.parametrize("file", FILES, ids=lambda path: path.name)
def test_lz4_block(benchmark, file, use_cramjam: bool):
    """Benchmark an LZ4 *block* format round trip for one input file.

    The same ``round_trip`` helper is timed with either cramjam's block
    functions or python-lz4's ``lz4.block`` functions, selected by
    ``use_cramjam``.
    """
    from lz4 import block

    payload = file.read_bytes()

    # Select the compress/decompress pair first so there is one benchmark call.
    if use_cramjam:
        compressor = cramjam.lz4.compress_block
        decompressor = cramjam.lz4.decompress_block
    else:
        compressor = block.compress
        decompressor = block.decompress

    benchmark(round_trip, compress=compressor, decompress=decompressor, data=payload)


@pytest.mark.parametrize(
"use_cramjam", (True, False), ids=lambda val: "cramjam" if val else "brotli"
)
Expand Down
64 changes: 63 additions & 1 deletion src/lz4.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! lz4 de/compression interface
use crate::exceptions::{CompressionError, DecompressionError};
use crate::io::RustyBuffer;
use crate::io::{AsBytes, RustyBuffer};
use crate::{to_py_err, BytesType};
use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
Expand All @@ -10,6 +10,8 @@ use std::io::Cursor;
pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(compress, m)?)?;
m.add_function(wrap_pyfunction!(decompress, m)?)?;
m.add_function(wrap_pyfunction!(compress_block, m)?)?;
m.add_function(wrap_pyfunction!(decompress_block, m)?)?;
m.add_function(wrap_pyfunction!(compress_into, m)?)?;
m.add_function(wrap_pyfunction!(decompress_into, m)?)?;
Ok(())
Expand Down Expand Up @@ -55,6 +57,66 @@ pub fn decompress_into(input: BytesType, mut output: BytesType) -> PyResult<usiz
Ok(r)
}

/// LZ4 _block_ decompression.
///
/// `output_len` is optional, it's the upper bound length of decompressed data; if it's not provided,
/// then it's assumed `store_size=True` was used during compression and length will then be taken
/// from the header.
///
/// Python Example
/// --------------
/// ```python
/// >>> cramjam.lz4.decompress_block(compressed_bytes, output_len=Optional[int])
/// ```
#[pyfunction]
pub fn decompress_block(data: BytesType, output_len: Option<usize>) -> PyResult<RustyBuffer> {
use lz4::block;
let out = to_py_err!(DecompressionError -> block::decompress(data.as_bytes(), output_len.map(|v| v as i32)))?;
Ok(RustyBuffer::from(out))
}

/// lZ4 _block_ compression.
///
/// The kwargs mostly follow the same definition found in [python-lz4 block.compress](https://python-lz4.readthedocs.io/en/stable/lz4.block.html#module-lz4.block)
///
/// Python Example
/// --------------
/// ```python
/// >>> cramjam.lz4.compress_block(
/// ... b'some bytes here',
/// ... output_len=Optional[int],
/// ... mode=Option[str],
/// ... acceleration=Option[int],
/// ... compression=Option[int],
/// ... store_size=Option[bool]
/// ... )
/// ```
#[pyfunction]
#[allow(unused_variables)]
pub fn compress_block(
data: BytesType,
output_len: Option<usize>,
mode: Option<&str>,
acceleration: Option<i32>,
compression: Option<i32>,
store_size: Option<bool>,
) -> PyResult<RustyBuffer> {
use lz4::{block, block::CompressionMode};

let store_size = store_size.unwrap_or(true);
let mode = match mode {
Some(m) => match m {
"default" => CompressionMode::DEFAULT,
"fast" => CompressionMode::FAST(acceleration.unwrap_or(1)),
"high_compression" => CompressionMode::HIGHCOMPRESSION(compression.unwrap_or(9)),
_ => return Err(DecompressionError::new_err(format!("Unrecognized mode '{}'", m))),
},
None => CompressionMode::DEFAULT,
};
let out = to_py_err!(CompressionError -> block::compress(data.as_bytes(), Some(mode), store_size))?;
Ok(RustyBuffer::from(out))
}

pub(crate) mod internal {
use lz4::{Decoder, EncoderBuilder};
use std::io::{Error, Read, Seek, SeekFrom, Write};
Expand Down
Empty file added tests/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions tests/test_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,34 @@ def test_dunders(Obj, tmpdir):
assert "len=5" in str(obj)
if isinstance(obj, cramjam.File):
assert f"path={path}" in str(obj)


@pytest.mark.parametrize(
    "compress_kwargs",
    (
        {"mode": "default", "acceleration": 1, "compression": 1, "store_size": True},
        {"mode": "fast", "acceleration": 2, "compression": 2, "store_size": False},
        {"mode": "high_compression", "acceleration": 3, "compression": 3, "store_size": True},
        {"mode": "default", "acceleration": 5, "compression": 4, "store_size": False},
    ),
)
def test_lz4_block(compress_kwargs):
    """Check LZ4 block output against known python-lz4 bytes, then round trip.

    The two fixed-byte comparisons use default arguments and do not depend on
    the parametrized kwargs; only the final round trip uses them.
    """
    from cramjam import lz4

    data = b"howdy neighbor"

    # Default call: matches what python-lz4 outputs in block mode.
    with_size_prefix = b"\x0e\x00\x00\x00\xe0howdy neighbor"
    assert bytes(lz4.compress_block(data)) == with_size_prefix

    # Same payload without 'store_size=True'.
    without_size_prefix = b"\xe0howdy neighbor"
    assert bytes(lz4.compress_block(data, store_size=False)) == without_size_prefix

    # Round trip with the current kwargs; output_len must be supplied
    # explicitly whenever the size was not stored during compression.
    compressed = lz4.compress_block(data, **compress_kwargs)
    if compress_kwargs["store_size"]:
        output_len = None
    else:
        output_len = len(data)
    restored = lz4.decompress_block(compressed, output_len=output_len)
    assert bytes(restored) == data

0 comments on commit 484cddd

Please sign in to comment.