diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 87b91af6..899112d1 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -26,9 +26,9 @@ jobs: profile: minimal default: true - name: Build - run: cargo build + run: cargo build --release - name: Tests - run: cargo test + run: cargo test --no-default-features --release - name: Install maturin run: pip install maturin - name: Build wheels - x86_64 @@ -78,10 +78,10 @@ jobs: default: true - name: Build if: matrix.platform.python-architecture == 'x64' - run: cargo build + run: cargo build --release - name: Tests if: matrix.platform.python-architecture == 'x64' - run: cargo test + run: cargo test --no-default-features --release - name: Install maturin run: pip install maturin - name: Build wheels @@ -115,9 +115,9 @@ jobs: profile: minimal default: true - name: Build - run: cargo build + run: cargo build --release - name: Tests - run: cargo test + run: cargo test --no-default-features - uses: actions/setup-python@v2 with: python-version: 3.6 @@ -129,7 +129,6 @@ jobs: pip install maturin maturin build -i python --release --out dist --no-sdist --target ${{ matrix.platform.target }} --manylinux ${{ matrix.platform.manylinux }} ' > build-wheel.sh - chmod +x build-wheel.sh docker run --rm -v "$PWD":/io -w /io quay.io/pypa/manylinux${{ matrix.platform.manylinux }}_${{ matrix.platform.arch }} bash build-wheel.sh - name: Python UnitTest @@ -145,7 +144,7 @@ jobs: path: dist linux-cross: - runs-on: ubuntu-16.04 + runs-on: ubuntu-latest strategy: matrix: platform: [ @@ -157,32 +156,15 @@ jobs: - uses: actions/setup-python@v2 with: python-version: 3.6 - - name: Install Rust toolchain - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - target: ${{ matrix.platform.target }} - profile: minimal - default: true - - name: Install aarch64 cross compiler - if: matrix.platform.target == 'aarch64-unknown-linux-gnu' - run: | - sudo apt-get install -y gcc-aarch64-linux-gnu libc6-arm64-cross libc6-dev-arm64-cross - echo "TARGET_CC=aarch64-linux-gnu-gcc" >> "$GITHUB_ENV" - echo "TARGET_CXX=aarch64-linux-gnu-cpp" >> "$GITHUB_ENV" - echo "CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc" >> "$GITHUB_ENV" - - name: Install armv7 cross compiler - if: matrix.platform.target == 'armv7-unknown-linux-gnueabihf' - run: | - sudo apt-get install -y gcc-arm-linux-gnueabihf libc6-armhf-cross libc6-dev-armhf-cross - echo "TARGET_CC=arm-linux-gnueabihf-gcc" >> "$GITHUB_ENV" - echo "TARGET_CXX=arm-linux-gnueabihf-cpp" >> "$GITHUB_ENV" - echo "CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc" >> "$GITHUB_ENV" - - name: Install maturin - run: pip install maturin - name: Build Wheels run: | - maturin build -i python --release --out dist --no-sdist --target ${{ matrix.platform.target }} --manylinux ${{ matrix.platform.manylinux }} --cargo-extra-args="--no-default-features" --cargo-extra-args="--features=abi3" + echo 'curl -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable + source ~/.cargo/env + rustup target add ${{ matrix.platform.target }} + maturin build -i python --release --out dist --no-sdist --target ${{ matrix.platform.target }} --manylinux ${{ matrix.platform.manylinux }} --cargo-extra-args="--no-default-features" --cargo-extra-args="--features=abi3,extension-module" # disable mimallocator + ' > build-wheel.sh + + docker run --rm -v "$PWD":/io -w /io messense/manylinux2014-cross:${{ matrix.platform.arch }} bash build-wheel.sh - uses: uraimo/run-on-arch-action@v2.0.5 name: 
Install built wheel with: @@ -220,9 +202,9 @@ jobs: profile: minimal default: true - name: Build - run: cargo build + run: cargo build --release - name: Tests - run: cargo test + run: cargo test --no-default-features - uses: actions/setup-python@v2 with: python-version: pypy-3.6 diff --git a/Cargo.toml b/Cargo.toml index cb38e687..ffab3274 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cramjam" -version = "2.0.2" +version = "2.1.0" authors = ["Miles Granger "] edition = "2018" license-file = "LICENSE" @@ -12,9 +12,10 @@ readme = "README.md" crate-type = ["cdylib"] [features] -default = ["abi3", "mimallocator"] +default = ["abi3", "mimallocator", "extension-module"] abi3 = ["pyo3/abi3-py36"] mimallocator = ["mimalloc"] +extension-module = ["pyo3/extension-module"] [profile.release] lto = "fat" @@ -22,10 +23,10 @@ codegen-units = 1 opt-level = 3 [dependencies] -pyo3 = { version = "0.13.2", features = ["extension-module"] } +pyo3 = { version = "0.13.2", default-features = false, features = ["macros"] } snap = "^1" brotli2 = "^0.3" -lz-fear = "0.1.1" +lz4 = "^1" flate2 = "^1" zstd = "0.6.0+zstd.1.4.8" numpy = "0.13.0" diff --git a/Makefile b/Makefile index f807d388..9923a3bb 100644 --- a/Makefile +++ b/Makefile @@ -34,5 +34,5 @@ dev-install: pip install cramjam --no-index --find-links dist/ pypy-build: - maturin build -i $(shell which pypy) --release --out dist --cargo-extra-args="--no-default-features" # disable abi3 + maturin build -i $(shell which pypy) --release --out dist --cargo-extra-args="--no-default-features" --cargo-extra-args="--features=mimallocator,extension-module" # disable abi3 pypy ./pypy_patch.py diff --git a/README.md b/README.md index bc0de493..f35488a0 100644 --- a/README.md +++ b/README.md @@ -45,29 +45,42 @@ All available for use as: b"bytes here" ``` -Where the API is `cramjam..compress/decompress` and accepts -both `bytes` and `bytearray` objects. +Where the API is `cramjam..compress/decompress` and accepts +`bytes`/`bytearray`/`numpy.array`/`cramjam.File`/`cramjam.Buffer` objects. **de/compress_into** -Additionally, all variants except for lz4, support `decompress_into` and `compress_into`. -If you have a numpy array preallocated, that can be used as the output location for de/compression. +Additionally, all variants support `decompress_into` and `compress_into`. Ex. -```python ->>> from cramjam import snappy +```python >>> import numpy as np ->>> compressed_data # some data that we know the size of when decompressed ->>> output = np.zeros(<>, dtype=np.uint8) ->>> snappy.decompress_into(compressed_data, output) # returns number of bytes decoded -<> +>>> from cramjam import snappy, Buffer +>>> +>>> data = np.frombuffer(b'some bytes here', dtype=np.uint8) +>>> data +array([115, 111, 109, 101, 32, 98, 121, 116, 101, 115, 32, 104, 101, + 114, 101], dtype=uint8) +>>> +>>> compressed = Buffer() +>>> snappy.compress_into(data, compressed) +33 # 33 bytes written to compressed buffer +>>> +>>> compressed.tell() # Where is the buffer position? +33 # goodie! +>>> +>>> compressed.seek(0) # Go back to the start of the buffer so we can prepare to decompress +>>> decompressed = b'0' * len(data) # let's write to `bytes` as output +>>> decompressed +b'000000000000000' +>>> +>>> snappy.decompress_into(compressed, decompressed) +15 # 15 bytes written to decompressed +>>> decompressed +b'some bytes here' ``` -This is very fast, as it avoids any buffer allocations on the rust side. 
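The same `compress_into`/`decompress_into` calls accept a `cramjam.File` as well, so compressed output can be written straight to disk. A minimal sketch (the path and the byte count shown are illustrative):

```python
>>> from cramjam import gzip, File
>>>
>>> dest = File("/tmp/example.gz")        # opened read/write, created if it doesn't exist
>>> gzip.compress_into(b"some bytes here", dest)
35                                        # number of compressed bytes written to the file
>>> dest.seek(0)                          # rewind before reading the compressed data back
0
>>> gzip.decompress(dest.read())
b'some bytes here'
```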
**Special note!** If you know the length of the de/compress output, you can provide `output_len=<>` to any `de/compress` to get ~1.5-3x performance increase as this allows single -buffer allocation. - -For `snappy` with `bytearray`s, it's only a mild improvement -as we currently are able to estimate the buffer size and can -resize the resulting `bytearray` to the correct size. +buffer allocation; doesn't really apply if you're using `cramjam.Buffer` +or `cramjam.File` objects. diff --git a/src/brotli.rs b/src/brotli.rs index 61dc44fd..53186d07 100644 --- a/src/brotli.rs +++ b/src/brotli.rs @@ -1,13 +1,13 @@ +//! brotli de/compression interface use crate::exceptions::{CompressionError, DecompressionError}; -use crate::{to_py_err, BytesType, WriteablePyByteArray}; -use numpy::PyArray1; +use crate::{to_py_err, BytesType}; use pyo3::prelude::*; use pyo3::types::PyBytes; use pyo3::wrap_pyfunction; use pyo3::{PyResult, Python}; use std::io::Cursor; -pub fn init_py_module(m: &PyModule) -> PyResult<()> { +pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(compress, m)?)?; m.add_function(wrap_pyfunction!(decompress, m)?)?; m.add_function(wrap_pyfunction!(compress_into, m)?)?; @@ -48,17 +48,19 @@ pub fn compress<'a>( #[pyfunction] pub fn compress_into<'a>( _py: Python<'a>, - data: BytesType<'a>, - array: &PyArray1, + input: BytesType<'a>, + mut output: BytesType<'a>, level: Option, ) -> PyResult { - crate::generic_into!(compress(data -> array), level) + let r = internal::compress(input, &mut output, level)?; + Ok(r) } /// Decompress directly into an output buffer #[pyfunction] -pub fn decompress_into<'a>(_py: Python<'a>, data: BytesType<'a>, array: &'a PyArray1) -> PyResult { - crate::generic_into!(decompress(data -> array)) +pub fn decompress_into<'a>(_py: Python<'a>, input: BytesType<'a>, mut output: BytesType<'a>) -> PyResult { + let r = internal::decompress(input, &mut output)?; + Ok(r) } pub(crate) mod internal { @@ -68,14 +70,14 @@ pub(crate) mod internal { use std::io::Error; /// Decompress via Brotli - pub fn decompress(input: &[u8], output: &mut W) -> Result { + pub fn decompress(input: R, output: &mut W) -> Result { let mut decoder = BrotliDecoder::new(input); let n_bytes = std::io::copy(&mut decoder, output)?; Ok(n_bytes as usize) } /// Compress via Brotli - pub fn compress(input: &[u8], output: &mut W, level: Option) -> Result { + pub fn compress(input: R, output: &mut W, level: Option) -> Result { let level = level.unwrap_or_else(|| 11); let mut encoder = BrotliEncoder::new(input, level); let n_bytes = std::io::copy(&mut encoder, output)?; diff --git a/src/deflate.rs b/src/deflate.rs index c0a2f5df..6484b6fa 100644 --- a/src/deflate.rs +++ b/src/deflate.rs @@ -1,13 +1,13 @@ +//! 
deflate de/compression interface use crate::exceptions::{CompressionError, DecompressionError}; -use crate::{to_py_err, BytesType, WriteablePyByteArray}; -use numpy::PyArray1; +use crate::{to_py_err, BytesType}; use pyo3::prelude::*; use pyo3::types::PyBytes; use pyo3::wrap_pyfunction; use pyo3::{PyResult, Python}; use std::io::Cursor; -pub fn init_py_module(m: &PyModule) -> PyResult<()> { +pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(compress, m)?)?; m.add_function(wrap_pyfunction!(decompress, m)?)?; m.add_function(wrap_pyfunction!(compress_into, m)?)?; @@ -48,17 +48,19 @@ pub fn compress<'a>( #[pyfunction] pub fn compress_into<'a>( _py: Python<'a>, - data: BytesType<'a>, - array: &PyArray1, + input: BytesType<'a>, + mut output: BytesType<'a>, level: Option, ) -> PyResult { - crate::generic_into!(compress(data -> array), level) + let r = internal::compress(input, &mut output, level)?; + Ok(r) } /// Decompress directly into an output buffer #[pyfunction] -pub fn decompress_into<'a>(_py: Python<'a>, data: BytesType<'a>, array: &'a PyArray1) -> PyResult { - crate::generic_into!(decompress(data -> array)) +pub fn decompress_into<'a>(_py: Python<'a>, input: BytesType<'a>, mut output: BytesType<'a>) -> PyResult { + let r = internal::decompress(input, &mut output)?; + Ok(r) } pub(crate) mod internal { @@ -69,14 +71,14 @@ pub(crate) mod internal { use std::io::Error; /// Decompress gzip data - pub fn decompress(input: &[u8], output: &mut W) -> Result { + pub fn decompress(input: R, output: &mut W) -> Result { let mut decoder = DeflateDecoder::new(input); let n_bytes = std::io::copy(&mut decoder, output)?; Ok(n_bytes as usize) } /// Compress gzip data - pub fn compress(input: &[u8], output: &mut W, level: Option) -> Result { + pub fn compress(input: R, output: &mut W, level: Option) -> Result { let level = level.unwrap_or_else(|| 6); let mut encoder = DeflateEncoder::new(input, Compression::new(level)); diff --git a/src/exceptions.rs b/src/exceptions.rs index 2ef1d9c9..f9870d3b 100644 --- a/src/exceptions.rs +++ b/src/exceptions.rs @@ -1,3 +1,5 @@ +#![allow(missing_docs)] +//! cramjam specific Python exceptions use pyo3::create_exception; use pyo3::exceptions::PyException; diff --git a/src/gzip.rs b/src/gzip.rs index 381fb093..db951cbe 100644 --- a/src/gzip.rs +++ b/src/gzip.rs @@ -1,13 +1,13 @@ +//! 
gzip de/compression interface use crate::exceptions::{CompressionError, DecompressionError}; -use crate::{to_py_err, BytesType, WriteablePyByteArray}; -use numpy::PyArray1; +use crate::{to_py_err, BytesType}; use pyo3::prelude::*; use pyo3::types::PyBytes; use pyo3::wrap_pyfunction; use pyo3::{PyResult, Python}; use std::io::Cursor; -pub fn init_py_module(m: &PyModule) -> PyResult<()> { +pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(compress, m)?)?; m.add_function(wrap_pyfunction!(decompress, m)?)?; m.add_function(wrap_pyfunction!(compress_into, m)?)?; @@ -48,17 +48,19 @@ pub fn compress<'a>( #[pyfunction] pub fn compress_into<'a>( _py: Python<'a>, - data: BytesType<'a>, - array: &PyArray1, + input: BytesType<'a>, + mut output: BytesType<'a>, level: Option, ) -> PyResult { - crate::generic_into!(compress(data -> array), level) + let r = internal::compress(input, &mut output, level)?; + Ok(r) } /// Decompress directly into an output buffer #[pyfunction] -pub fn decompress_into<'a>(_py: Python<'a>, data: BytesType<'a>, array: &'a PyArray1) -> PyResult { - crate::generic_into!(decompress(data -> array)) +pub fn decompress_into<'a>(_py: Python<'a>, input: BytesType<'a>, mut output: BytesType<'a>) -> PyResult { + let r = internal::decompress(input, &mut output)?; + Ok(r) } pub(crate) mod internal { @@ -68,14 +70,14 @@ pub(crate) mod internal { use std::io::Error; /// Decompress gzip data - pub fn decompress(input: &[u8], output: &mut W) -> Result { + pub fn decompress(input: R, output: &mut W) -> Result { let mut decoder = GzDecoder::new(input); let n_bytes = std::io::copy(&mut decoder, output)?; Ok(n_bytes as usize) } /// Compress gzip data - pub fn compress(input: &[u8], output: &mut W, level: Option) -> Result { + pub fn compress(input: R, output: &mut W, level: Option) -> Result { let level = level.unwrap_or_else(|| 6); let mut encoder = GzEncoder::new(input, Compression::new(level)); let n_bytes = std::io::copy(&mut encoder, output)?; diff --git a/src/io.rs b/src/io.rs new file mode 100644 index 00000000..23398148 --- /dev/null +++ b/src/io.rs @@ -0,0 +1,451 @@ +//! Module holds native Rust objects exposed to Python, or objects +//! which wrap native Python objects to provide additional functionality +//! or tighter integration with de/compression algorithms. +//! 
+use std::fs::{File, OpenOptions}; +use std::io::{copy, Cursor, Read, Seek, SeekFrom, Write}; + +use crate::BytesType; +use numpy::PyArray1; +use pyo3::prelude::*; +use pyo3::types::{PyByteArray, PyBytes}; + +/// Internal wrapper for `numpy.array`/`PyArray1`, to provide Read + Write and other traits +pub struct RustyNumpyArray<'a> { + pub(crate) inner: &'a PyArray1, + pub(crate) cursor: Cursor<&'a mut [u8]>, +} +impl<'a> RustyNumpyArray<'a> { + pub(crate) fn from_vec(py: Python<'a>, v: Vec) -> Self { + let inner = PyArray1::from_vec(py, v); + Self { + inner, + cursor: Cursor::new(unsafe { inner.as_slice_mut().unwrap() }), + } + } + pub(crate) fn as_bytes(&self) -> &[u8] { + unsafe { self.inner.as_slice().unwrap() } + } +} +impl<'a> From<&'a PyArray1> for RustyNumpyArray<'a> { + fn from(inner: &'a PyArray1) -> Self { + Self { + inner, + cursor: Cursor::new(unsafe { inner.as_slice_mut().unwrap() }), + } + } +} +impl<'a> FromPyObject<'a> for RustyNumpyArray<'a> { + fn extract(ob: &'a PyAny) -> PyResult { + let pybytes: &PyArray1 = ob.extract()?; + Ok(Self::from(pybytes)) + } +} +impl<'a> ToPyObject for RustyNumpyArray<'a> { + fn to_object(&self, py: Python<'_>) -> PyObject { + self.inner.to_object(py) + } +} +impl<'a> Write for RustyNumpyArray<'a> { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.cursor.write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + self.cursor.flush() + } +} +impl<'a> Read for RustyNumpyArray<'a> { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.cursor.read(buf) + } +} + +impl<'a> Seek for RustyNumpyArray<'a> { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.cursor.seek(pos) + } +} + +/// Internal wrapper for `bytes`/`PyBytes`, to provide Read + Write and other traits +pub struct RustyPyBytes<'a> { + pub(crate) inner: &'a PyBytes, + pub(crate) cursor: Cursor<&'a mut [u8]>, +} +impl<'a> RustyPyBytes<'a> { + pub(crate) fn as_bytes(&self) -> &[u8] { + self.inner.as_bytes() + } +} +impl<'a> From<&'a PyBytes> for RustyPyBytes<'a> { + fn from(inner: &'a PyBytes) -> Self { + let ptr = inner.as_bytes().as_ptr(); + Self { + inner, + cursor: Cursor::new(unsafe { std::slice::from_raw_parts_mut(ptr as *mut _, inner.as_bytes().len()) }), + } + } +} +impl<'a> FromPyObject<'a> for RustyPyBytes<'a> { + fn extract(ob: &'a PyAny) -> PyResult { + let pybytes: &PyBytes = ob.extract()?; + Ok(Self::from(pybytes)) + } +} +impl<'a> ToPyObject for RustyPyBytes<'a> { + fn to_object(&self, py: Python<'_>) -> PyObject { + self.inner.to_object(py) + } +} +impl<'a> Read for RustyPyBytes<'a> { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.cursor.read(buf) + } +} +impl<'a> Write for RustyPyBytes<'a> { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.cursor.write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + self.cursor.flush() + } +} +impl<'a> Seek for RustyPyBytes<'a> { + fn seek(&mut self, style: SeekFrom) -> std::io::Result { + self.cursor.seek(style) + } +} + +/// Internal wrapper for `bytearray`/`PyByteArray`, to provide Read + Write and other traits +pub struct RustyPyByteArray<'a> { + pub(crate) inner: &'a PyByteArray, + pub(crate) cursor: Cursor<&'a mut [u8]>, +} +impl<'a> RustyPyByteArray<'a> { + pub(crate) fn new(py: Python<'a>, len: usize) -> Self { + let inner = PyByteArray::new_with(py, len, |_| Ok(())).unwrap(); + Self { + inner, + cursor: Cursor::new(unsafe { inner.as_bytes_mut() }), + } + } + pub(crate) fn as_bytes(&self) -> &[u8] { + unsafe { self.inner.as_bytes() } + } +} +impl<'a> 
From<&'a PyByteArray> for RustyPyByteArray<'a> { + fn from(inner: &'a PyByteArray) -> Self { + Self { + inner, + cursor: Cursor::new(unsafe { inner.as_bytes_mut() }), + } + } +} +impl<'a> FromPyObject<'a> for RustyPyByteArray<'a> { + fn extract(ob: &'a PyAny) -> PyResult { + let pybytes: &PyByteArray = ob.extract()?; + Ok(Self::from(pybytes)) + } +} +impl<'a> ToPyObject for RustyPyByteArray<'a> { + fn to_object(&self, py: Python<'_>) -> PyObject { + self.inner.to_object(py) + } +} +impl<'a> Write for RustyPyByteArray<'a> { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + if (self.cursor.position() as usize + buf.len()) > self.inner.len() { + let previous_pos = self.cursor.position(); + self.inner.resize(self.cursor.position() as usize + buf.len()).unwrap(); + self.cursor = Cursor::new(unsafe { self.inner.as_bytes_mut() }); + self.cursor.set_position(previous_pos); + } + self.cursor.write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + if self.inner.len() != self.cursor.position() as usize { + let prev_pos = self.cursor.position(); + self.inner.resize(self.cursor.position() as usize).unwrap(); + self.cursor = Cursor::new(unsafe { self.inner.as_bytes_mut() }); + self.cursor.set_position(prev_pos); + } + Ok(()) + } +} +impl<'a> Read for RustyPyByteArray<'a> { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.cursor.read(buf) + } +} + +impl<'a> Seek for RustyPyByteArray<'a> { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.cursor.seek(pos) + } +} + +/// A native Rust file-like object. Reading and writing takes place +/// through the Rust implementation, allowing access to the underlying +/// bytes in Python. +/// +/// ### Python Example +/// ```python +/// from cramjam import File +/// file = File("/tmp/file.txt") +/// file.write(b"bytes") +/// ``` +/// +/// ### Notes +/// Presently, the file's handle is managed by Rust's lifetime rules, in that +/// once it's garbage collected from Python's side, it will be closed. +/// +#[pyclass(name = "File")] +pub struct RustyFile { + pub(crate) inner: File, +} + +#[pymethods] +impl RustyFile { + /// ### Example + /// ```python + /// from cramjam import File + /// file = File("/tmp/file.txt", read=True, write=True, truncate=True) + /// file.write(b"bytes") + /// file.seek(2) + /// file.read() + /// b'tes' + /// ``` + #[new] + pub fn __init__( + path: &str, + read: Option, + write: Option, + truncate: Option, + append: Option, + ) -> PyResult { + Ok(Self { + inner: OpenOptions::new() + .read(read.unwrap_or_else(|| true)) + .write(write.unwrap_or_else(|| true)) + .truncate(truncate.unwrap_or_else(|| false)) + .create(true) // create if doesn't exist, but open if it does. + .append(append.unwrap_or_else(|| false)) + .open(path)?, + }) + } + /// Write some bytes to the file, where input data can be anything in [`BytesType`](../enum.BytesType.html) + pub fn write(&mut self, mut input: BytesType) -> PyResult { + let r = write(&mut input, self)?; + Ok(r as usize) + } + /// Read from the file in its current position, returns `bytes`; optionally specify number of + /// bytes to read. + pub fn read<'a>(&mut self, py: Python<'a>, n_bytes: Option) -> PyResult<&'a PyBytes> { + read(self, py, n_bytes) + } + /// Read from the file in its current position, into a [`BytesType`](../enum.BytesType.html) object. + pub fn readinto(&mut self, mut output: BytesType) -> PyResult { + let r = copy(self, &mut output)?; + Ok(r as usize) + } + /// Seek to a position within the file. 
`whence` follows the same values as [IOBase.seek](https://docs.python.org/3/library/io.html#io.IOBase.seek) + /// where: + /// ```bash + /// 0: from start of the stream + /// 1: from current stream position + /// 2: from end of the stream + /// ``` + pub fn seek(&mut self, position: isize, whence: Option) -> PyResult { + let pos = match whence.unwrap_or_else(|| 0) { + 0 => SeekFrom::Start(position as u64), + 1 => SeekFrom::Current(position as i64), + 2 => SeekFrom::End(position as i64), + _ => { + return Err(pyo3::exceptions::PyValueError::new_err( + "whence should be one of 0: seek from start, 1: seek from current, or 2: seek from end", + )) + } + }; + let r = Seek::seek(self, pos)?; + Ok(r as usize) + } + /// Whether the file is seekable; here just for compatibility, it always returns True. + pub fn seekable(&self) -> bool { + true + } + /// Give the current position of the file. + pub fn tell(&mut self) -> PyResult { + let r = self.inner.seek(SeekFrom::Current(0))?; + Ok(r as usize) + } + /// Set the length of the file. If less than current length, it will truncate to the size given; + /// otherwise will be null byte filled to the size. + pub fn set_len(&mut self, size: usize) -> PyResult<()> { + self.inner.set_len(size as u64)?; + Ok(()) + } + /// Truncate the file. + pub fn truncate(&mut self) -> PyResult<()> { + self.set_len(0) + } +} + +/// A native Rust file-like object. Reading and writing takes place +/// through the Rust implementation, allowing access to the underlying +/// bytes in Python. +/// +/// ### Python Example +/// ```python +/// >>> from cramjam import Buffer +/// >>> buf = Buffer(b"bytes") +/// >>> buf.read() +/// b'bytes' +/// ``` +/// +#[pyclass(name = "Buffer")] +#[derive(Default)] +pub struct RustyBuffer { + pub(crate) inner: Cursor>, +} + +/// A Buffer object, similar to [cramjam.File](struct.RustyFile.html) only the bytes are held in-memory +/// +/// ### Example +/// ```python +/// from cramjam import Buffer +/// buf = Buffer(b'start bytes') +/// buf.read(5) +/// b'start' +/// ``` +#[pymethods] +impl RustyBuffer { + /// Instantiate the object, optionally with any supported bytes-like object in [BytesType](../enum.BytesType.html) + #[new] + pub fn __init__(mut data: Option>) -> PyResult { + let mut buf = vec![]; + if let Some(bytes) = data.as_mut() { + bytes.read_to_end(&mut buf)?; + } + Ok(Self { + inner: Cursor::new(buf), + }) + } + /// Write some bytes to the buffer, where input data can be anything in [BytesType](../enum.BytesType.html) + pub fn write(&mut self, mut input: BytesType) -> PyResult { + let r = write(&mut input, self)?; + Ok(r as usize) + } + /// Read from the buffer in its current position, returns bytes; optionally specify number of bytes to read. + pub fn read<'a>(&mut self, py: Python<'a>, n_bytes: Option) -> PyResult<&'a PyBytes> { + read(self, py, n_bytes) + } + /// Read from the buffer in its current position, into a [BytesType](../enum.BytesType.html) object. + pub fn readinto(&mut self, mut output: BytesType) -> PyResult { + let r = copy(self, &mut output)?; + Ok(r as usize) + } + /// Seek to a position within the buffer. 
whence follows the same values as IOBase.seek where: + /// ```bash + /// 0: from start of the stream + /// 1: from current stream position + /// 2: from end of the stream + /// ``` + pub fn seek(&mut self, position: isize, whence: Option) -> PyResult { + let pos = match whence.unwrap_or_else(|| 0) { + 0 => SeekFrom::Start(position as u64), + 1 => SeekFrom::Current(position as i64), + 2 => SeekFrom::End(position as i64), + _ => { + return Err(pyo3::exceptions::PyValueError::new_err( + "whence should be one of 0: seek from start, 1: seek from current, or 2: seek from end", + )) + } + }; + let r = Seek::seek(self, pos)?; + Ok(r as usize) + } + /// Whether the buffer is seekable; here just for compatibility, it always returns True. + pub fn seekable(&self) -> bool { + true + } + /// Give the current position of the buffer. + pub fn tell(&self) -> usize { + self.inner.position() as usize + } + /// Set the length of the buffer. If less than current length, it will truncate to the size given; + /// otherwise will be null byte filled to the size. + pub fn set_len(&mut self, size: usize) -> PyResult<()> { + self.inner.get_mut().resize(size, 0); + Ok(()) + } + /// Truncate the buffer + pub fn truncate(&mut self) -> PyResult<()> { + self.inner.get_mut().truncate(0); + self.inner.set_position(0); + Ok(()) + } +} + +fn write(input: &mut BytesType, output: &mut W) -> std::io::Result { + let result = match input { + BytesType::RustyFile(data) => copy(&mut data.borrow_mut().inner, output)?, + BytesType::RustyBuffer(data) => copy(&mut data.borrow_mut().inner, output)?, + BytesType::ByteArray(data) => copy(data, output)?, + BytesType::NumpyArray(array) => copy(array, output)?, + BytesType::Bytes(data) => { + let buffer = data.as_bytes(); + copy(&mut Cursor::new(buffer), output)? + } + }; + Ok(result) +} + +fn read<'a, R: Read>(reader: &mut R, py: Python<'a>, n_bytes: Option) -> PyResult<&'a PyBytes> { + match n_bytes { + Some(n) => PyBytes::new_with(py, n, |buf| { + reader.read(buf)?; + Ok(()) + }), + None => { + let mut buf = vec![]; + reader.read_to_end(&mut buf)?; + Ok(PyBytes::new(py, buf.as_slice())) + } + } +} + +impl Seek for RustyBuffer { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.inner.seek(pos) + } +} +impl Seek for RustyFile { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.inner.seek(pos) + } +} +impl Write for RustyBuffer { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.inner.write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } +} +impl Write for RustyFile { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.inner.write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } +} +impl Read for RustyBuffer { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.inner.read(buf) + } +} +impl Read for RustyFile { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.inner.read(buf) + } +} diff --git a/src/lib.rs b/src/lib.rs index d1ac6aa0..1a2c5fda 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,179 +1,241 @@ +#![warn(missing_docs)] //! CramJam documentation of python exported functions for (de)compression of bytes //! -//! The API follows cramjam.`<>.compress` and cramjam.`<>.decompress` +//! Although this documentation is built using Cargo/Rust toolchain, the examples and API represent +//! the usable _Python_ API //! -//! Python Example: +//! In general, the API follows cramjam.`<>.compress` and cramjam.`<>.decompress` +//! 
as well as `compress_into`/`decompress_into` where it takes an input and output combination of any of the following: +//! - `numpy.array` (dtype=np.uint8) +//! - `bytes` +//! - `bytearray` +//! - [`cramjam.File`](io/struct.RustyFile.html) +//! - [`cramjam.Buffer`](./io/struct.RustyBuffer.html) +//! +//! ### Simple Python Example: //! //! ```python -//! data = b'some bytes here' -//! compressed = cramjam.snappy.compress(data) -//! decompressed = cramjam.snappy.decompress(compressed) -//! assert data == decompressed +//! >>> data = b'some bytes here' +//! >>> compressed = cramjam.snappy.compress(data) +//! >>> decompressed = cramjam.snappy.decompress(compressed) +//! >>> assert data == decompressed +//! >>> +//! ``` +//! +//! ### Example of de/compressing into different types. +//! +//! ```python +//! >>> import numpy as np +//! >>> from cramjam import snappy, Buffer +//! >>> +//! >>> data = np.frombuffer(b'some bytes here', dtype=np.uint8) +//! >>> data +//! array([115, 111, 109, 101, 32, 98, 121, 116, 101, 115, 32, 104, 101, +//! 114, 101], dtype=uint8) +//! >>> +//! >>> compressed = Buffer() +//! >>> snappy.compress_into(data, compressed) +//! 33 # 33 bytes written to compressed buffer +//! >>> +//! >>> compressed.tell() # Where is the buffer position? +//! 33 # goodie! +//! >>> +//! >>> compressed.seek(0) # Go back to the start of the buffer so we can prepare to decompress +//! >>> decompressed = b'0' * len(data) # let's write to `bytes` as output +//! >>> decompressed +//! b'000000000000000' +//! >>> +//! >>> snappy.decompress_into(compressed, decompressed) +//! 15 # 15 bytes written to decompressed +//! >>> decompressed +//! b'some bytes here' //! ``` - -// TODO: There is a lot of very similar, but slightly different code for each variant -// time should be spent perhaps with a macro or other alternative. -// Each variant is similar, but sometimes has subtly different APIs/logic. - -// TODO: Add output size estimation for each variant, now it's just snappy -// allow for resizing PyByteArray if over allocated; cannot resize PyBytes yet. pub mod brotli; pub mod deflate; pub mod exceptions; pub mod gzip; +pub mod io; pub mod lz4; pub mod snappy; pub mod zstd; use pyo3::prelude::*; -use pyo3::types::{PyByteArray, PyBytes}; +use crate::io::{RustyBuffer, RustyFile, RustyNumpyArray, RustyPyByteArray, RustyPyBytes}; use exceptions::{CompressionError, DecompressionError}; -use std::io::Write; +use std::io::{Read, Seek, SeekFrom, Write}; #[cfg(feature = "mimallocator")] #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; +/// Any possible input/output to de/compression algorithms. +/// Typically, as a Python user, you never have to worry about this object. It's exposed here in +/// the documentation to see what types are acceptable for de/compression functions. 
#[derive(FromPyObject)] pub enum BytesType<'a> { + /// `bytes` #[pyo3(transparent, annotation = "bytes")] - Bytes(&'a PyBytes), + Bytes(RustyPyBytes<'a>), + /// `bytearray` #[pyo3(transparent, annotation = "bytearray")] - ByteArray(&'a PyByteArray), + ByteArray(RustyPyByteArray<'a>), + /// `numpy.array` with `dtype=np.uint8` + #[pyo3(transparent, annotation = "numpy")] + NumpyArray(RustyNumpyArray<'a>), + /// [`cramjam.File`](io/struct.RustyFile.html) + #[pyo3(transparent, annotation = "File")] + RustyFile(&'a PyCell), + /// [`cramjam.Buffer`](io/struct.RustyBuffer.html) + #[pyo3(transparent, annotation = "Buffer")] + RustyBuffer(&'a PyCell), } -impl<'a> BytesType<'a> { - #[allow(dead_code)] - fn len(&self) -> usize { - self.as_bytes().len() +impl<'a> Write for BytesType<'a> { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let result = match self { + BytesType::RustyFile(out) => out.borrow_mut().inner.write(buf)?, + BytesType::RustyBuffer(out) => out.borrow_mut().inner.write(buf)?, + BytesType::ByteArray(out) => out.write(buf)?, + BytesType::NumpyArray(out) => out.write(buf)?, + BytesType::Bytes(out) => out.write(buf)?, + }; + Ok(result) } - fn as_bytes(&self) -> &'a [u8] { + fn flush(&mut self) -> std::io::Result<()> { match self { - Self::Bytes(b) => b.as_bytes(), - Self::ByteArray(b) => unsafe { b.as_bytes() }, + BytesType::RustyFile(f) => f.borrow_mut().flush(), + BytesType::RustyBuffer(b) => b.borrow_mut().flush(), + BytesType::ByteArray(_) | BytesType::Bytes(_) | BytesType::NumpyArray(_) => Ok(()), } } } - -impl<'a> IntoPy for BytesType<'a> { - fn into_py(self, py: Python) -> PyObject { +impl<'a> Read for BytesType<'a> { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { match self { - Self::Bytes(bytes) => bytes.to_object(py), - Self::ByteArray(byte_array) => byte_array.to_object(py), + BytesType::RustyFile(data) => data.borrow_mut().inner.read(buf), + BytesType::RustyBuffer(data) => data.borrow_mut().inner.read(buf), + BytesType::ByteArray(data) => data.read(buf), + BytesType::NumpyArray(array) => array.read(buf), + BytesType::Bytes(data) => data.read(buf), } } } - -/// A wrapper to PyByteArray, providing the std::io::Write impl -pub struct WriteablePyByteArray<'a> { - array: &'a PyByteArray, - position: usize, -} - -impl<'a> WriteablePyByteArray<'a> { - pub fn new(py: Python<'a>, len: usize) -> Self { - Self { - array: PyByteArray::new_with(py, len, |_| Ok(())).unwrap(), - position: 0, +impl<'a> Seek for BytesType<'a> { + fn seek(&mut self, style: SeekFrom) -> std::io::Result { + match self { + BytesType::RustyFile(f) => f.borrow_mut().inner.seek(style), + BytesType::RustyBuffer(b) => b.borrow_mut().inner.seek(style), + BytesType::ByteArray(a) => a.seek(style), + BytesType::NumpyArray(a) => a.seek(style), + BytesType::Bytes(b) => b.seek(style), } } - pub fn into_inner(mut self) -> PyResult<&'a PyByteArray> { - self.flush() - .map_err(|e| pyo3::exceptions::PyBufferError::new_err(e.to_string()))?; - Ok(self.array) - } } -impl<'a> Write for WriteablePyByteArray<'a> { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - if (self.position + buf.len()) > self.array.len() { - self.array.resize(self.position + buf.len()).unwrap() - } - let array_bytes = unsafe { self.array.as_bytes_mut() }; - - //let mut wtr = Cursor::new(&mut array_bytes[self.position..]); - //let n_bytes = wtr.write(buf).unwrap(); - let buf_len = buf.len(); - array_bytes[self.position..self.position + buf_len].copy_from_slice(buf); - - self.position += buf.len(); - Ok(buf.len()) +impl<'a> 
BytesType<'a> { + #[allow(dead_code)] + fn len(&self) -> usize { + self.as_bytes().len() } - fn flush(&mut self) -> std::io::Result<()> { - if self.array.len() != self.position { - self.array.resize(self.position).unwrap(); + fn as_bytes(&self) -> &'_ [u8] { + match self { + Self::Bytes(b) => b.as_bytes(), + Self::ByteArray(b) => b.as_bytes(), + _ => unimplemented!("Converting Rust native types to bytes is not supported"), } - Ok(()) } } -/// Expose de/compression_into(data: BytesType<'_>, array: &PyArray1) -> PyResult -/// functions to allow de/compress bytes into a pre-allocated Python array. -/// -/// This will handle gaining access to the Python's array as a buffer for an underlying de/compression -/// function which takes the normal `&[u8]` and `Output` types -#[macro_export] -macro_rules! generic_into { - ($op:ident($input:ident -> $output:ident) $(, $level:ident)?) => { - { - let mut array_mut = unsafe { $output.as_array_mut() }; - - let buffer: &mut [u8] = to_py_err!(DecompressionError -> array_mut.as_slice_mut().ok_or_else(|| { - pyo3::exceptions::PyBufferError::new_err("Failed to get mutable slice from array.") - }))?; - let mut cursor = Cursor::new(buffer); - let size = to_py_err!(DecompressionError -> self::internal::$op($input.as_bytes(), &mut cursor $(, $level)?))?; - Ok(size) +impl<'a> IntoPy for BytesType<'a> { + fn into_py(self, py: Python) -> PyObject { + match self { + Self::Bytes(bytes) => bytes.to_object(py), + Self::ByteArray(byte_array) => byte_array.to_object(py), + Self::RustyFile(file) => file.to_object(py), + Self::RustyBuffer(buffer) => buffer.to_object(py), + Self::NumpyArray(array) => array.to_object(py), } } } +/// Macro for generating the implementation of de/compression against a variant interface #[macro_export] macro_rules! generic { - ($op:ident($input:ident), py=$py:ident, output_len=$output_len:ident $(, level=$level:ident)?) => { + ($op:ident($input:expr), py=$py:ident, output_len=$output_len:ident $(, level=$level:ident)?) => { { - let bytes = $input.as_bytes(); + use crate::io::{RustyPyBytes, RustyPyByteArray, RustyNumpyArray}; + match $input { - BytesType::Bytes(_) => match $output_len { - Some(len) => { - let pybytes = PyBytes::new_with($py, len, |buffer| { - let mut cursor = Cursor::new(buffer); + BytesType::Bytes(_) => { + match $output_len { + Some(len) => { + let pybytes = PyBytes::new_with($py, len, |buffer| { + let mut cursor = Cursor::new(buffer); + if stringify!($op) == "compress" { + to_py_err!(CompressionError -> self::internal::$op($input, &mut cursor $(, $level)? ))?; + } else { + to_py_err!(DecompressionError -> self::internal::$op($input, &mut cursor $(, $level)? ))?; + } + Ok(()) + })?; + Ok(BytesType::Bytes(RustyPyBytes::from(pybytes))) + } + None => { + let mut buffer = Vec::new(); if stringify!($op) == "compress" { - to_py_err!(CompressionError -> self::internal::$op(bytes, &mut cursor $(, $level)? ))?; + to_py_err!(CompressionError -> self::internal::$op($input, &mut Cursor::new(&mut buffer) $(, $level)? ))?; } else { - to_py_err!(DecompressionError -> self::internal::$op(bytes, &mut cursor $(, $level)? ))?; + to_py_err!(DecompressionError -> self::internal::$op($input, &mut Cursor::new(&mut buffer) $(, $level)? ))?; } - Ok(()) - })?; - Ok(BytesType::Bytes(pybytes)) - } - None => { - let mut buffer = Vec::new(); - if stringify!($op) == "compress" { - to_py_err!(CompressionError -> self::internal::$op(bytes, &mut buffer $(, $level)? 
))?; - } else { - to_py_err!(DecompressionError -> self::internal::$op(bytes, &mut buffer $(, $level)? ))?; - } - Ok(BytesType::Bytes(PyBytes::new($py, &buffer))) + Ok(BytesType::Bytes(RustyPyBytes::from(PyBytes::new($py, &buffer)))) + } } }, BytesType::ByteArray(_) => { - let mut pybytes = WriteablePyByteArray::new($py, $output_len.unwrap_or_else(|| 0)); + let mut pybytes = RustyPyByteArray::new($py, $output_len.unwrap_or_else(|| 0)); if stringify!($op) == "compress" { - to_py_err!(CompressionError -> self::internal::$op(bytes, &mut pybytes $(, $level)? ))?; + to_py_err!(CompressionError -> self::internal::$op($input, &mut pybytes $(, $level)? ))?; } else { - to_py_err!(DecompressionError -> self::internal::$op(bytes, &mut pybytes $(, $level)? ))?; + to_py_err!(DecompressionError -> self::internal::$op($input, &mut pybytes $(, $level)? ))?; } - Ok(BytesType::ByteArray(pybytes.into_inner()?)) + Ok(BytesType::ByteArray(pybytes)) + }, + BytesType::NumpyArray(_) => { + let mut output = Vec::new(); + if stringify!($op) == "compress" { + to_py_err!(CompressionError -> self::internal::$op($input, &mut Cursor::new(&mut output) $(, $level)? ))?; + } else { + to_py_err!(DecompressionError -> self::internal::$op($input, &mut Cursor::new(&mut output) $(, $level)? ))?; + } + Ok(BytesType::NumpyArray(RustyNumpyArray::from_vec($py, output))) + }, + BytesType::RustyFile(_) => { + let mut output = crate::io::RustyBuffer::default(); + if stringify!($op) == "compress" { + to_py_err!(CompressionError -> self::internal::$op($input, &mut output $(, $level)? ))?; + } else { + to_py_err!(DecompressionError -> self::internal::$op($input, &mut output $(, $level)? ))?; + } + Ok(BytesType::RustyBuffer(PyCell::new($py, output).unwrap())) + }, + BytesType::RustyBuffer(_) => { + let mut output = crate::io::RustyBuffer::default(); + if stringify!($op) == "compress" { + to_py_err!(CompressionError -> self::internal::$op($input, &mut output $(, $level)? ))?; + } else { + to_py_err!(DecompressionError -> self::internal::$op($input, &mut output $(, $level)? ))?; + } + Ok(BytesType::RustyBuffer(PyCell::new($py, output).unwrap())) } } } } } +/// Macro to convert an error into a specific Python exception. #[macro_export] macro_rules! 
to_py_err { ($error:ident -> $expr:expr) => { @@ -194,7 +256,8 @@ fn cramjam(py: Python, m: &PyModule) -> PyResult<()> { m.add("__version__", env!("CARGO_PKG_VERSION"))?; m.add("CompressionError", py.get_type::())?; m.add("DecompressionError", py.get_type::())?; - + m.add_class::()?; + m.add_class::()?; make_submodule!(py -> m -> snappy); make_submodule!(py -> m -> brotli); make_submodule!(py -> m -> lz4); @@ -213,9 +276,9 @@ mod tests { // Default testing data fn gen_data() -> Vec { (0..1000000) - .map(|_| "oh what a beautiful morning, oh what a beautiful day!!") - .collect::() - .into_bytes() + .map(|_| b"oh what a beautiful morning, oh what a beautiful day!!".to_vec()) + .flat_map(|v| v) + .collect() } // Single test generation @@ -230,10 +293,9 @@ mod tests { let compressed_size = if stringify!($decompress_output) == "Slice" { compressed = (0..data.len()).map(|_| 0).collect::>(); let mut cursor = Cursor::new(compressed.as_mut_slice()); - crate::$variant::internal::compress(&data, &mut cursor $(, $level)?).unwrap() + crate::$variant::internal::compress(&mut Cursor::new(data.as_slice()), &mut cursor $(, $level)?).unwrap() } else { - - crate::$variant::internal::compress(&data, &mut compressed $(, $level)?).unwrap() + crate::$variant::internal::compress(&mut Cursor::new(data.as_slice()), &mut Cursor::new(&mut compressed) $(, $level)?).unwrap() }; assert_eq!(compressed_size, $compressed_len); @@ -244,10 +306,9 @@ mod tests { let decompressed_size = if stringify!($decompress_output) == "Slice" { decompressed = (0..data.len()).map(|_| 0).collect::>(); let mut cursor = Cursor::new(decompressed.as_mut_slice()); - crate::$variant::internal::decompress(&compressed, &mut cursor).unwrap() + crate::$variant::internal::decompress(&mut Cursor::new(&compressed), &mut cursor).unwrap() } else { - - crate::$variant::internal::decompress(&compressed, &mut decompressed).unwrap() + crate::$variant::internal::decompress(&mut Cursor::new(&compressed), &mut decompressed).unwrap() }; assert_eq!(decompressed_size, data.len()); if &decompressed[..decompressed_size] != &data { @@ -276,25 +337,5 @@ mod tests { test_variant!(brotli, compressed_len = 729, level = None); test_variant!(deflate, compressed_len = 157174, level = None); test_variant!(zstd, compressed_len = 4990, level = None); - - #[test] - fn test_snappy_raw_into_round_trip() { - let data = gen_data(); - let max_compress_len = snap::raw::max_compress_len(data.len()); - let mut compressed_buffer = vec![0; max_compress_len]; - - let n_bytes = - crate::snappy::internal::compress_raw_into(&data, &mut Cursor::new(&mut compressed_buffer)).unwrap(); - assert_eq!(n_bytes, 2563328); // raw compressed len - - let decompress_len = snap::raw::decompress_len(&compressed_buffer[..n_bytes]).unwrap(); - let mut decompressed_buffer = vec![0; decompress_len]; - let n_bytes = crate::snappy::internal::decompress_raw_into( - &compressed_buffer[..n_bytes], - &mut Cursor::new(&mut decompressed_buffer), - ) - .unwrap(); - assert_eq!(n_bytes, data.len()); - assert_eq!(&data, &decompressed_buffer[..n_bytes]); - } + test_variant!(lz4, compressed_len = 303278, level = None); } diff --git a/src/lz4.rs b/src/lz4.rs index cc5ff345..e21b0ec2 100644 --- a/src/lz4.rs +++ b/src/lz4.rs @@ -1,13 +1,17 @@ +//! 
lz4 de/compression interface use crate::exceptions::{CompressionError, DecompressionError}; use crate::{to_py_err, BytesType}; use pyo3::prelude::*; -use pyo3::types::{PyByteArray, PyBytes}; +use pyo3::types::PyBytes; use pyo3::wrap_pyfunction; use pyo3::{PyResult, Python}; +use std::io::Cursor; -pub fn init_py_module(m: &PyModule) -> PyResult<()> { +pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(compress, m)?)?; m.add_function(wrap_pyfunction!(decompress, m)?)?; + m.add_function(wrap_pyfunction!(compress_into, m)?)?; + m.add_function(wrap_pyfunction!(decompress_into, m)?)?; Ok(()) } @@ -22,16 +26,7 @@ pub fn init_py_module(m: &PyModule) -> PyResult<()> { #[pyfunction] #[allow(unused_variables)] // TODO: Make use of output_len for lz4 pub fn decompress<'a>(py: Python<'a>, data: BytesType<'a>, output_len: Option) -> PyResult> { - match data { - BytesType::Bytes(input) => { - let out = to_py_err!(DecompressionError -> self::internal::decompress(input.as_bytes()))?; - Ok(BytesType::Bytes(PyBytes::new(py, &out))) - } - BytesType::ByteArray(input) => { - let out = to_py_err!(DecompressionError -> self::internal::decompress(unsafe { input.as_bytes() }))?; - Ok(BytesType::ByteArray(PyByteArray::new(py, &out))) - } - } + crate::generic!(decompress(data), py = py, output_len = output_len) } /// lZ4 compression. @@ -43,39 +38,64 @@ pub fn decompress<'a>(py: Python<'a>, data: BytesType<'a>, output_len: Option>> cramjam.lz4.compress(b'some bytes here', output_len=Optional[int]) /// ``` #[pyfunction] -#[allow(unused_variables)] pub fn compress<'a>( py: Python<'a>, - data: BytesType<'a>, + mut data: BytesType<'a>, level: Option, output_len: Option, ) -> PyResult> { - match data { - BytesType::Bytes(input) => { - let out = to_py_err!(CompressionError -> self::internal::compress(input.as_bytes(), level))?; - Ok(BytesType::Bytes(PyBytes::new(py, &out))) - } - BytesType::ByteArray(input) => { - let out = to_py_err!(CompressionError -> self::internal::compress(unsafe { input.as_bytes() }, level))?; - Ok(BytesType::ByteArray(PyByteArray::new(py, &out))) - } - } + crate::generic!(compress(&mut data), py = py, output_len = output_len, level = level) +} + +/// Compress directly into an output buffer +#[pyfunction] +pub fn compress_into<'a>( + _py: Python<'a>, + mut input: BytesType<'a>, + mut output: BytesType<'a>, + level: Option, +) -> PyResult { + let r = internal::compress(&mut input, &mut output, level)?; + Ok(r) +} + +/// Decompress directly into an output buffer +#[pyfunction] +pub fn decompress_into<'a>(_py: Python<'a>, input: BytesType<'a>, mut output: BytesType<'a>) -> PyResult { + let r = internal::decompress(input, &mut output)?; + Ok(r) } pub(crate) mod internal { - use std::error::Error; + use lz4::{Decoder, EncoderBuilder}; + use std::io::{Error, Read, Seek, SeekFrom, Write}; /// Decompress lz4 data - pub fn decompress(data: &[u8]) -> Result, Box> { - lz_fear::framed::decompress_frame(data).map_err(|err| err.into()) + pub fn decompress(input: R, output: &mut W) -> Result { + let mut decoder = Decoder::new(input)?; + let n_bytes = std::io::copy(&mut decoder, output)?; + decoder.finish().1?; + Ok(n_bytes as usize) } /// Compress lz4 data - // TODO: lz-fear does not yet support level - pub fn compress(data: &[u8], level: Option) -> Result, Box> { - let _ = level.unwrap_or_else(|| 4); - let mut buf = vec![]; - lz_fear::framed::CompressionSettings::default().compress(data, &mut buf)?; - Ok(buf) + pub fn compress( + input: &mut R, + output: &mut W, + level: 
Option, + ) -> Result { + let start_pos = output.seek(SeekFrom::Current(0))?; + let mut encoder = EncoderBuilder::new() + .auto_flush(true) + .level(level.unwrap_or_else(|| 4)) + .build(output)?; + + // this returns, bytes read from uncompressed, input; we want bytes written + // but lz4 only implements Read for Encoder + std::io::copy(input, &mut encoder)?; + let (w, r) = encoder.finish(); + r?; + let ending_pos = w.seek(SeekFrom::Current(0))?; + Ok((ending_pos - start_pos) as usize) } } diff --git a/src/snappy.rs b/src/snappy.rs index e7074eff..534d96f5 100644 --- a/src/snappy.rs +++ b/src/snappy.rs @@ -1,13 +1,13 @@ +//! snappy de/compression interface use crate::exceptions::{CompressionError, DecompressionError}; -use crate::{to_py_err, BytesType, WriteablePyByteArray}; -use numpy::PyArray1; +use crate::{to_py_err, BytesType}; use pyo3::prelude::*; -use pyo3::types::{PyByteArray, PyBytes}; +use pyo3::types::PyBytes; use pyo3::wrap_pyfunction; use pyo3::{PyResult, Python}; use std::io::Cursor; -pub fn init_py_module(m: &PyModule) -> PyResult<()> { +pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(compress, m)?)?; m.add_function(wrap_pyfunction!(decompress, m)?)?; m.add_function(wrap_pyfunction!(compress_raw, m)?)?; @@ -56,25 +56,8 @@ pub fn compress<'a>(py: Python<'a>, data: BytesType<'a>, output_len: Option>> cramjam.snappy.decompress_raw(compressed_raw_bytes) /// ``` #[pyfunction] -pub fn decompress_raw<'a>(py: Python<'a>, data: BytesType<'a>) -> PyResult> { - let output_len = to_py_err!(DecompressionError -> snap::raw::decompress_len(data.as_bytes()))?; - - match data { - BytesType::Bytes(_) => { - let pybytes = PyBytes::new_with(py, output_len, |output| { - to_py_err!(DecompressionError -> self::internal::decompress_raw_into(data.as_bytes(), &mut Cursor::new(output)))?; - Ok(()) - })?; - Ok(BytesType::Bytes(pybytes)) - } - BytesType::ByteArray(_) => { - let pybytes = PyByteArray::new_with(py, output_len, |output| { - to_py_err!(DecompressionError -> self::internal::decompress_raw_into(data.as_bytes(), &mut Cursor::new(output)))?; - Ok(()) - })?; - Ok(BytesType::ByteArray(pybytes)) - } - } +pub fn decompress_raw<'a>(py: Python<'a>, data: BytesType<'a>, output_len: Option) -> PyResult> { + crate::generic!(decompress_raw(data), py = py, output_len = output_len) } /// Snappy compression raw. 
@@ -86,52 +69,36 @@ pub fn decompress_raw<'a>(py: Python<'a>, data: BytesType<'a>) -> PyResult>> cramjam.snappy.compress_raw(b'some bytes here') /// ``` #[pyfunction] -pub fn compress_raw<'a>(py: Python<'a>, data: BytesType<'a>) -> PyResult> { - let output_len = snap::raw::max_compress_len(data.len()); - - match data { - BytesType::Bytes(_) => { - let mut output = vec![0; output_len]; - let n_bytes = to_py_err!(CompressionError -> self::internal::compress_raw_into(data.as_bytes(), &mut Cursor::new(output.as_mut_slice())))?; - output.truncate(n_bytes); - Ok(BytesType::Bytes(PyBytes::new(py, &output))) - } - BytesType::ByteArray(_) => { - let mut actual_size = 0; - let pybytes = PyByteArray::new_with(py, output_len, |output| { - let mut cursor = Cursor::new(output); - actual_size = - to_py_err!(CompressionError -> self::internal::compress_raw_into(data.as_bytes(), &mut cursor))?; - Ok(()) - })?; - pybytes.resize(actual_size)?; - Ok(BytesType::ByteArray(pybytes)) - } - } +pub fn compress_raw<'a>(py: Python<'a>, data: BytesType<'a>, output_len: Option) -> PyResult> { + crate::generic!(compress_raw(data), py = py, output_len = output_len) } /// Compress directly into an output buffer #[pyfunction] -pub fn compress_into<'a>(_py: Python<'a>, data: BytesType<'a>, array: &PyArray1) -> PyResult { - crate::generic_into!(compress(data -> array)) +pub fn compress_into<'a>(_py: Python<'a>, input: BytesType<'a>, mut output: BytesType<'a>) -> PyResult { + let r = internal::compress(input, &mut output)?; + Ok(r) } /// Decompress directly into an output buffer #[pyfunction] -pub fn decompress_into<'a>(_py: Python<'a>, data: BytesType<'a>, array: &'a PyArray1) -> PyResult { - crate::generic_into!(decompress(data -> array)) +pub fn decompress_into<'a>(_py: Python<'a>, input: BytesType<'a>, mut output: BytesType<'a>) -> PyResult { + let r = internal::decompress(input, &mut output)?; + Ok(r as usize) } /// Compress raw format directly into an output buffer #[pyfunction] -pub fn compress_raw_into<'a>(_py: Python<'a>, data: BytesType<'a>, array: &PyArray1) -> PyResult { - crate::generic_into!(compress_raw_into(data -> array)) +pub fn compress_raw_into<'a>(_py: Python<'a>, input: BytesType<'a>, mut output: BytesType<'a>) -> PyResult { + let r = to_py_err!(CompressionError -> internal::compress_raw(input, &mut output))?; + Ok(r) } /// Decompress raw format directly into an output buffer #[pyfunction] -pub fn decompress_raw_into<'a>(_py: Python<'a>, data: BytesType<'a>, array: &PyArray1) -> PyResult { - crate::generic_into!(decompress_raw_into(data -> array)) +pub fn decompress_raw_into<'a>(_py: Python<'a>, input: BytesType<'a>, mut output: BytesType<'a>) -> PyResult { + let r = to_py_err!(DecompressionError -> internal::decompress_raw(input, &mut output))?; + Ok(r) } /// Get the expected max compressed length for snappy raw compression; this is the size @@ -149,33 +116,148 @@ pub fn decompress_raw_len<'a>(_py: Python<'a>, data: BytesType<'a>) -> PyResult< } pub(crate) mod internal { - use snap::raw::{Decoder, Encoder}; + use crate::BytesType; + use snap::raw::{decompress_len, max_compress_len, Decoder, Encoder}; use snap::read::{FrameDecoder, FrameEncoder}; - use std::io::{Cursor, Error, Write}; + use std::io::{Cursor, Error, Read, Write}; + + pub(crate) struct RawEncoder<'a, 'b> { + inner: &'b BytesType<'a>, + overflow: Option>>, + encoder: Encoder, + is_finished: bool, + } + impl<'a, 'b> RawEncoder<'a, 'b> { + pub fn new(inner: &'b BytesType<'a>) -> Self { + Self { + inner, + encoder: Encoder::new(), + overflow: 
None, + is_finished: false, + } + } + fn init_read(&mut self, bytes: &[u8], buf: &mut [u8]) -> std::io::Result { + let len = max_compress_len(bytes.len()); + if buf.len() >= len { + let n = self.encoder.compress(bytes, buf)?; + self.is_finished = true; // if overflow is set, it will return 0 next iter + Ok(n) + } else { + let mut overflow = vec![0; len]; + let n = self.encoder.compress(bytes, overflow.as_mut_slice())?; + overflow.truncate(n); + let mut overflow = Cursor::new(overflow); + let r = overflow.read(buf)?; + self.overflow = Some(overflow); + Ok(r) + } + } + } + impl<'a, 'b> Read for RawEncoder<'a, 'b> { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.is_finished { + return Ok(0); + } + match self.overflow.as_mut() { + Some(overflow) => overflow.read(buf), + None => match self.inner { + BytesType::Bytes(pybytes) => self.init_read(pybytes.as_bytes(), buf), + BytesType::ByteArray(pybytes) => self.init_read(pybytes.as_bytes(), buf), + BytesType::NumpyArray(array) => self.init_read(array.as_bytes(), buf), + BytesType::RustyBuffer(buffer) => { + let buffer_ref = buffer.borrow(); + self.init_read(buffer_ref.inner.get_ref(), buf) + } + BytesType::RustyFile(file) => { + let mut buffer = vec![]; + file.borrow_mut().read_to_end(&mut buffer)?; + self.init_read(buffer.as_slice(), buf) + } + }, + } + } + } + + pub(crate) struct RawDecoder<'a, 'b> { + inner: &'b BytesType<'a>, + overflow: Option>>, + decoder: Decoder, + is_finished: bool, + } + impl<'a, 'b> RawDecoder<'a, 'b> { + pub fn new(inner: &'b BytesType<'a>) -> Self { + Self { + inner, + decoder: Decoder::new(), + overflow: None, + is_finished: false, + } + } + fn init_read(&mut self, bytes: &[u8], buf: &mut [u8]) -> std::io::Result { + let len = decompress_len(bytes)?; + if buf.len() >= len { + let n = self.decoder.decompress(bytes, buf)?; + self.is_finished = true; // if overflow is set, it will return 0 next iter + Ok(n) + } else { + let mut overflow = vec![0; len]; + let n = self.decoder.decompress(bytes, overflow.as_mut_slice())?; + overflow.truncate(n); + let mut overflow = Cursor::new(overflow); + let r = overflow.read(buf)?; + self.overflow = Some(overflow); + Ok(r) + } + } + } + impl<'a, 'b> Read for RawDecoder<'a, 'b> { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.is_finished { + return Ok(0); + } + match self.overflow.as_mut() { + Some(overflow) => overflow.read(buf), + None => match self.inner { + BytesType::Bytes(pybytes) => self.init_read(pybytes.as_bytes(), buf), + BytesType::ByteArray(pybytes) => self.init_read(pybytes.as_bytes(), buf), + BytesType::NumpyArray(array) => self.init_read(array.as_bytes(), buf), + BytesType::RustyBuffer(buffer) => { + let buffer_ref = buffer.borrow(); + self.init_read(buffer_ref.inner.get_ref(), buf) + } + BytesType::RustyFile(file) => { + let mut buffer = vec![]; + file.borrow_mut().read_to_end(&mut buffer)?; + self.init_read(buffer.as_slice(), buf) + } + }, + } + } + } /// Decompress snappy data raw into a mutable slice - pub fn decompress_raw_into(input: &[u8], output: &mut Cursor<&mut [u8]>) -> Result { - let mut decoder = Decoder::new(); - let buffer = output.get_mut(); - decoder.decompress(input, *buffer) + pub fn decompress_raw<'a, W: Write>(input: BytesType<'a>, output: &mut W) -> std::io::Result { + let mut decoder = RawDecoder::new(&input); + let n_bytes = std::io::copy(&mut decoder, output)?; + Ok(n_bytes as usize) } /// Compress snappy data raw into a mutable slice - pub fn compress_raw_into(input: &[u8], output: &mut Cursor<&mut [u8]>) 
-> Result { - let mut encoder = Encoder::new(); - let buffer = output.get_mut(); - encoder.compress(input, buffer) + pub fn compress_raw<'a, W: Write>(input: BytesType<'a>, output: &'a mut W) -> std::io::Result { + let mut encoder = RawEncoder::new(&input); + let n_bytes = std::io::copy(&mut encoder, output)?; + Ok(n_bytes as usize) } /// Decompress snappy data framed - pub fn decompress(input: &[u8], output: &mut W) -> Result { + pub fn decompress(input: R, output: &mut W) -> Result { let mut decoder = FrameDecoder::new(input); let n_bytes = std::io::copy(&mut decoder, output)?; Ok(n_bytes as usize) } /// Decompress snappy data framed - pub fn compress(data: &[u8], output: &mut W) -> Result { + pub fn compress(data: R, output: &mut W) -> Result { let mut encoder = FrameEncoder::new(data); let n_bytes = std::io::copy(&mut encoder, output)?; Ok(n_bytes as usize) diff --git a/src/zstd.rs b/src/zstd.rs index 0e3c8ea9..00a290fc 100644 --- a/src/zstd.rs +++ b/src/zstd.rs @@ -1,13 +1,13 @@ +//! zstd de/compression interface use crate::exceptions::{CompressionError, DecompressionError}; -use crate::{to_py_err, BytesType, WriteablePyByteArray}; -use numpy::PyArray1; +use crate::{to_py_err, BytesType}; use pyo3::prelude::*; use pyo3::types::PyBytes; use pyo3::wrap_pyfunction; use pyo3::{PyResult, Python}; use std::io::Cursor; -pub fn init_py_module(m: &PyModule) -> PyResult<()> { +pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(compress, m)?)?; m.add_function(wrap_pyfunction!(decompress, m)?)?; m.add_function(wrap_pyfunction!(compress_into, m)?)?; @@ -48,35 +48,37 @@ pub fn compress<'a>( #[pyfunction] pub fn compress_into<'a>( _py: Python<'a>, - data: BytesType<'a>, - array: &PyArray1, + input: BytesType<'a>, + mut output: BytesType<'a>, level: Option, ) -> PyResult { - crate::generic_into!(compress(data -> array), level) -} - -/// Decompress directly into an output buffer -#[pyfunction] -pub fn decompress_into<'a>(_py: Python<'a>, data: BytesType<'a>, array: &'a PyArray1) -> PyResult { - crate::generic_into!(decompress(data -> array)) + let r = internal::compress(input, &mut output, level)?; + Ok(r) } pub(crate) mod internal { - use std::io::{Error, Write}; + use std::io::{Error, Read, Write}; /// Decompress gzip data - pub fn decompress(input: &[u8], output: &mut W) -> Result { + pub fn decompress(input: R, output: &mut W) -> Result { let mut decoder = zstd::stream::read::Decoder::new(input)?; let n_bytes = std::io::copy(&mut decoder, output)?; Ok(n_bytes as usize) } /// Compress gzip data - pub fn compress(input: &[u8], output: &mut W, level: Option) -> Result { + pub fn compress(input: R, output: &mut W, level: Option) -> Result { let level = level.unwrap_or_else(|| 0); // 0 will use zstd's default, currently 11 let mut encoder = zstd::stream::read::Encoder::new(input, level)?; let n_bytes = std::io::copy(&mut encoder, output)?; Ok(n_bytes as usize) } } + +/// Decompress directly into an output buffer +#[pyfunction] +pub fn decompress_into<'a>(_py: Python<'a>, input: BytesType<'a>, mut output: BytesType<'a>) -> PyResult { + let r = internal::decompress(input, &mut output)?; + Ok(r) +} diff --git a/tests/test_rust_io.py b/tests/test_rust_io.py new file mode 100644 index 00000000..29ca8e9c --- /dev/null +++ b/tests/test_rust_io.py @@ -0,0 +1,57 @@ +import pytest + +from cramjam import File, Buffer + + +@pytest.mark.parametrize("Obj", (File, Buffer)) +def test_obj_api(tmpdir, Obj): + if isinstance(Obj, File): + buf = File(str(tmpdir.join("file.txt"))) + 
diff --git a/tests/test_rust_io.py b/tests/test_rust_io.py
new file mode 100644
index 00000000..29ca8e9c
--- /dev/null
+++ b/tests/test_rust_io.py
@@ -0,0 +1,57 @@
+import pytest
+
+from cramjam import File, Buffer
+
+
+@pytest.mark.parametrize("Obj", (File, Buffer))
+def test_obj_api(tmpdir, Obj):
+    if isinstance(Obj, File):
+        buf = File(str(tmpdir.join("file.txt")))
+    else:
+        buf = Buffer()
+
+    assert buf.write(b"bytes") == 5
+    assert buf.tell() == 5
+    assert buf.seek(0) == 0
+    assert buf.read() == b"bytes"
+    assert buf.seek(-1, 2) == 4  # set one byte backwards from end; position 4
+    assert buf.read() == b"s"
+    assert buf.seek(-2, whence=1) == 3  # set two bytes from current (end): position 3
+    assert buf.read() == b"es"
+
+    with pytest.raises(ValueError):
+        buf.seek(1, 3)  # only 0, 1, 2 are valid seek from positions
+
+    for out in (
+        b"12345",
+        bytearray(b"12345"),
+        File(str(tmpdir.join("test.txt"))),
+        Buffer(),
+    ):
+        buf.seek(0)
+
+        expected = b"bytes"
+
+        buf.readinto(out)
+
+        # Will update the output buffer
+        if isinstance(out, File) or isinstance(out, Buffer):
+            out.seek(0)
+            assert out.read() == expected
+        elif isinstance(out, bytearray):
+            assert out == bytearray(expected)
+        else:
+            assert out == expected
+
+    # Set the length
+    buf.set_len(2)
+    buf.seek(0)
+    assert buf.read() == b"by"
+    buf.set_len(10)
+    buf.seek(0)
+    assert buf.read() == b"by\x00\x00\x00\x00\x00\x00\x00\x00"
+
+    # truncate
+    buf.truncate()
+    buf.seek(0)
+    assert buf.read() == b""
diff --git a/tests/test_variants.py b/tests/test_variants.py
index 58af8a66..2d1062e8 100644
--- a/tests/test_variants.py
+++ b/tests/test_variants.py
@@ -7,10 +7,13 @@
 def same_same(a, b):
     return hashlib.md5(a).hexdigest() == hashlib.md5(b).hexdigest()

+
 def test_has_version():
     from cramjam import __version__
+
     assert isinstance(__version__, str)

+
 @pytest.mark.parametrize("is_bytearray", (True, False))
 @pytest.mark.parametrize(
     "variant_str", ("snappy", "brotli", "lz4", "gzip", "deflate", "zstd")
@@ -19,7 +22,7 @@ def test_variants_simple(variant_str, is_bytearray):
     variant = getattr(cramjam, variant_str)

-    uncompressed = b"some bytes to compress 123" * 1000
+    uncompressed = b"some bytes to compress 123" * 100000
     if is_bytearray:
         uncompressed = bytearray(uncompressed)
@@ -41,25 +44,110 @@ def test_variants_raise_exception(variant_str):
     variant.decompress(b"sknow")

-@pytest.mark.parametrize("variant_str", ("snappy", "brotli", "gzip", "deflate", "zstd"))
-def test_variants_de_compress_into(variant_str):
-
-    # TODO: support lz4 de/compress_into
-
+@pytest.mark.parametrize(
+    "input_type", (bytes, bytearray, "numpy", cramjam.Buffer, cramjam.File)
+)
+@pytest.mark.parametrize(
+    "output_type", (bytes, bytearray, "numpy", cramjam.Buffer, cramjam.File)
+)
+@pytest.mark.parametrize(
+    "variant_str", ("snappy", "brotli", "gzip", "deflate", "zstd", "lz4")
+)
+def test_variants_compress_into(variant_str, input_type, output_type, tmpdir):
     variant = getattr(cramjam, variant_str)

-    data = b"oh what a beautiful morning, oh what a beautiful day!!" * 1000000
+    raw_data = b"oh what a beautiful morning, oh what a beautiful day!!" * 10000
+
+    # Setup input
+    if input_type == "numpy":
+        input = np.frombuffer(raw_data, dtype=np.uint8)
+    elif input_type == cramjam.File:
+        input = cramjam.File(str(tmpdir.join("input.txt")))
+        input.write(raw_data)
+        input.seek(0)
+    elif input_type == cramjam.Buffer:
+        input = cramjam.Buffer()
+        input.write(raw_data)
+        input.seek(0)
+    else:
+        input = input_type(raw_data)
+
+    compressed = variant.compress(raw_data)
+    compressed_len = len(compressed)
+
+    # Setup output buffer
+    if output_type == "numpy":
+        output = np.zeros(compressed_len, dtype=np.uint8)
+    elif output_type == cramjam.File:
+        output = cramjam.File(str(tmpdir.join("output.txt")))
+    elif output_type == cramjam.Buffer:
+        output = cramjam.Buffer()
+    else:
+        output = output_type(b"0" * compressed_len)
+
+    n_bytes = variant.compress_into(input, output)
+    assert n_bytes == compressed_len
+
+    if hasattr(output, "read"):
+        output.seek(0)
+        output = output.read()
+    elif hasattr(output, "tobytes"):
+        output = output.tobytes()
+    else:
+        output = bytes(output)
+    assert same_same(output, compressed)
+
-    compressed_array = np.zeros(len(data), dtype=np.uint8)  # plenty of space
-    compressed_size = variant.compress_into(data, compressed_array)
-    decompressed = variant.decompress(compressed_array[:compressed_size].tobytes())
-    assert same_same(decompressed, data)
+@pytest.mark.parametrize(
+    "input_type", (bytes, bytearray, "numpy", cramjam.Buffer, cramjam.File)
+)
+@pytest.mark.parametrize(
+    "output_type", (bytes, bytearray, "numpy", cramjam.Buffer, cramjam.File)
+)
+@pytest.mark.parametrize(
+    "variant_str", ("snappy", "brotli", "gzip", "deflate", "zstd", "lz4")
+)
+def test_variants_decompress_into(variant_str, input_type, output_type, tmpdir):
     variant = getattr(cramjam, variant_str)

-    compressed = variant.compress(data)
-    decompressed_array = np.zeros(len(data), np.uint8)
-    decompressed_size = variant.decompress_into(compressed, decompressed_array)
-    decompressed = decompressed_array[:decompressed_size].tobytes()
-    assert same_same(decompressed, data)
+    raw_data = b"oh what a beautiful morning, oh what a beautiful day!!" * 100
+    compressed = variant.compress(raw_data)
+
+    # Setup input
+    if input_type == "numpy":
+        input = np.frombuffer(compressed, dtype=np.uint8)
+    elif input_type == cramjam.File:
+        input = cramjam.File(str(tmpdir.join("input.txt")))
+        input.write(compressed)
+        input.seek(0)
+    elif input_type == cramjam.Buffer:
+        input = cramjam.Buffer()
+        input.write(compressed)
+        input.seek(0)
+    else:
+        input = input_type(compressed)
+
+    # Setup output buffer
+    if output_type == "numpy":
+        output = np.zeros(len(raw_data), dtype=np.uint8)
+    elif output_type == cramjam.File:
+        output = cramjam.File(str(tmpdir.join("output.txt")))
+    elif output_type == cramjam.Buffer:
+        output = cramjam.Buffer()
+    else:
+        output = output_type(b"0" * len(raw_data))
+
+    n_bytes = variant.decompress_into(input, output)
+    assert n_bytes == len(raw_data)
+
+    if hasattr(output, "read"):
+        output.seek(0)
+        output = output.read()
+    elif hasattr(output, "tobytes"):
+        output = output.tobytes()
+    else:
+        output = bytes(output)
+    assert same_same(output, raw_data)


 def test_variant_snappy_raw_into():
@@ -69,16 +157,13 @@
     """
     data = b"oh what a beautiful morning, oh what a beautiful day!!" * 1000000
+    compressed = cramjam.snappy.compress_raw(data)

     compressed_size = cramjam.snappy.compress_raw_max_len(data)
     compressed_buffer = np.zeros(compressed_size, dtype=np.uint8)
     n_bytes = cramjam.snappy.compress_raw_into(data, compressed_buffer)
-    assert n_bytes == 2563328
+    assert n_bytes == len(compressed)

-    decompressed_size = cramjam.snappy.decompress_raw_len(
-        compressed_buffer[:n_bytes].tobytes()
-    )
-    assert decompressed_size == len(data)
-    decompressed_buffer = np.zeros(decompressed_size, dtype=np.uint8)
+    decompressed_buffer = np.zeros(len(data), dtype=np.uint8)
     n_bytes = cramjam.snappy.decompress_raw_into(
         compressed_buffer[:n_bytes].tobytes(), decompressed_buffer
     )
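Similarly, a minimal round-trip sketch of the raw (unframed) snappy helpers exercised by the updated test above, again assuming a cramjam 2.1.0 build from this branch:

```python
import numpy as np
import cramjam

data = b"oh what a beautiful morning, oh what a beautiful day!!" * 1000

# Size the output with the worst-case raw compressed length, then compress into it
buf = np.zeros(cramjam.snappy.compress_raw_max_len(data), dtype=np.uint8)
n_bytes = cramjam.snappy.compress_raw_into(data, buf)
assert n_bytes == len(cramjam.snappy.compress_raw(data))

# Decompress back into a preallocated buffer of the original length
out = np.zeros(len(data), dtype=np.uint8)
cramjam.snappy.decompress_raw_into(buf[:n_bytes].tobytes(), out)
assert out.tobytes() == data
```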