Allow the saving and loading to a byte string #6

psychomario · 2023-10-12T17:34:52Z

Implement the functions:

load_bytes(cls, data: bytes, hash_func: Callable[[Any], int]) -> Bloom: ...
def save_bytes(self) -> bytes: ...

to allow for saving and loading via Python bytes, for storage other than in a file.

Apologies if this is terrible Rust, I've never written Rust before today.

Dr-Emann · 2023-10-12T18:48:12Z

I'd like to make things use a common abstraction to load from a rust io::Read and write to an io::Write, something like:

Patch

commit 0c9258a
Author: Zachary Dremann <[email protected]>
Date:   Thu Oct 12 14:08:21 2023 -0400

    save/load via generic Read/Write

diff --git a/src/lib.rs b/src/lib.rs
index 4d7f63b..994e31a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,7 +1,7 @@
 use bitline::BitLine;
 use pyo3::exceptions::{PyTypeError, PyValueError};
 use pyo3::types::PyType;
-use pyo3::{basic::CompareOp, prelude::*, types::PyTuple, types::PyBytes};
+use pyo3::{basic::CompareOp, prelude::*, types::PyBytes, types::PyTuple};
 use std::fs::File;
 use std::io::{Read, Write};
 use std::mem;
@@ -251,58 +251,13 @@ impl Bloom {
     /// Load from a file, see "Persistence" section in the README
     #[classmethod]
     fn load(_cls: &PyType, filepath: &str, hash_func: &PyAny) -> PyResult<Bloom> {
-        // check that the hash_func is callable
-        if !hash_func.is_callable() {
-            return Err(PyTypeError::new_err("hash_func must be callable"));
-        }
-        // check that the hash_func isn't the built-in hash function
-        if hash_func.is(get_builtin_hash_func(hash_func.py())?) {
-            return Err(PyValueError::new_err(
-                "Cannot load a bloom filter that uses the built-in hash function",
-            ));
-        }
-        let hash_func = Some(hash_func.to_object(hash_func.py()));
-
-        let mut file = File::open(filepath)?;
-
-        let mut k_bytes = [0; mem::size_of::<u64>()];
-        file.read_exact(&mut k_bytes)?;
-        let k = u64::from_le_bytes(k_bytes);
-
-        let filter = BitLine::load(&mut file)?;
-
-        Ok(Bloom {
-            filter,
-            k,
-            hash_func,
-        })
+        Self::load_from_read(hash_func, || File::open(filepath))
     }
 
     /// Load from a bytes(), see "Persistence" section in the README
     #[classmethod]
     fn load_bytes(_cls: &PyType, bytes: &[u8], hash_func: &PyAny) -> PyResult<Bloom> {
-        // check that the hash_func is callable
-        if !hash_func.is_callable() {
-            return Err(PyTypeError::new_err("hash_func must be callable"));
-        }
-        // check that the hash_func isn't the built-in hash function
-        if hash_func.is(get_builtin_hash_func(hash_func.py())?) {
-            return Err(PyValueError::new_err(
-                "Cannot load a bloom filter that uses the built-in hash function",
-            ));
-        }
-        let hash_func = Some(hash_func.to_object(hash_func.py()));
-
-        let k_bytes: [u8; mem::size_of::<u64>()] = bytes[0 .. mem::size_of::<u64>()].try_into().expect("slice with incorrect length");
-        let k = u64::from_le_bytes(k_bytes);
-
-        let filter = BitLine::load_bytes(&bytes[mem::size_of::<u64>()..])?;
-
-        Ok(Bloom {
-            filter,
-            k,
-            hash_func,
-        })
+        Self::load_from_read(hash_func, || Ok::<_, std::convert::Infallible>(bytes))
     }
 
     /// Save to a file, see "Persistence" section in the README
@@ -320,18 +275,10 @@ impl Bloom {
 
     /// Save to a byte(), see "Persistence" section in the README
     fn save_bytes(&self, py: Python<'_>) -> PyResult<PyObject> {
-        if self.hash_func.is_none() {
-            return Err(PyValueError::new_err(
-                "Cannot save a bloom filter that uses the built-in hash function",
-            ));
-        }
-
-        let serialized:Vec<u8> = [
-            &self.k.to_le_bytes(),
-            &self.filter.bits as &[u8]
-        ].concat();
-
-        Ok(PyBytes::new(py, &serialized).into())
+        let mut bytes =
+            Vec::with_capacity(mem::size_of_val(&self.k) + (self.filter.len() / 8) as usize);
+        self.save_to_write(|| Ok::<_, std::convert::Infallible>(&mut bytes))?;
+        Ok(PyBytes::new(py, &bytes).into())
     }
 }
 
@@ -341,6 +288,56 @@ impl Bloom {
         self.hash_func.as_ref().map(|f| f.clone_ref(py))
     }
 
+    fn load_from_read<F, R, E>(hash_func: &PyAny, get_data: F) -> PyResult<Self>
+    where
+        F: FnOnce() -> Result<R, E>,
+        R: Read,
+        PyErr: From<E>,
+    {
+        // check that the hash_func is callable
+        if !hash_func.is_callable() {
+            return Err(PyTypeError::new_err("hash_func must be callable"));
+        }
+        // check that the hash_func isn't the built-in hash function
+        if hash_func.is(get_builtin_hash_func(hash_func.py())?) {
+            return Err(PyValueError::new_err(
+                "Cannot load a bloom filter that uses the built-in hash function",
+            ));
+        }
+        let hash_func = Some(hash_func.to_object(hash_func.py()));
+
+        let mut data = get_data()?;
+
+        let mut k_bytes = [0; mem::size_of::<u64>()];
+        data.read_exact(&mut k_bytes)?;
+        let k = u64::from_le_bytes(k_bytes);
+
+        let filter = BitLine::load(data)?;
+
+        Ok(Bloom {
+            filter,
+            k,
+            hash_func,
+        })
+    }
+
+    fn save_to_write<F, W, E>(&self, get_writer: F) -> PyResult<()>
+    where
+        F: FnOnce() -> Result<W, E>,
+        W: Write,
+        PyErr: From<E>,
+    {
+        if self.hash_func.is_none() {
+            return Err(PyValueError::new_err(
+                "Cannot save a bloom filter that uses the built-in hash function",
+            ));
+        }
+        let mut writer = get_writer()?;
+        writer.write_all(&self.k.to_le_bytes())?;
+        self.filter.save(writer)?;
+        Ok(())
+    }
+
     fn zeroed_clone(&self, py: Python<'_>) -> Bloom {
         Bloom {
             filter: BitLine::new(self.filter.len()).unwrap(),
@@ -382,7 +379,6 @@ impl Bloom {
 mod bitline {
     use pyo3::exceptions::PyValueError;
     use pyo3::prelude::*;
-    use std::fs::File;
     use std::io::{Read, Write};
 
     #[inline(always)]
@@ -393,7 +389,7 @@ mod bitline {
 
     #[derive(Clone, PartialEq, Eq)]
     pub struct BitLine {
-        pub bits: Box<[u8]>,
+        bits: Box<[u8]>,
     }
 
     impl BitLine {
@@ -453,7 +449,7 @@ mod bitline {
 
         /// Reads the given file from the current position to the end and
         /// returns a BitLine containing the data.
-        pub fn load(file: &mut File) -> PyResult<Self> {
+        pub fn load<R: Read>(mut file: R) -> PyResult<Self> {
             let mut bits = Vec::new();
             file.read_to_end(&mut bits)?;
             Ok(Self {
@@ -461,18 +457,8 @@ mod bitline {
             })
         }
 
-        /// Given the provided [u8]
-        /// returns a BitLine containing the data.
-        pub fn load_bytes(bytes: &[u8]) -> PyResult<Self> {
-            let bits = bytes.to_vec();
-            Ok(Self {
-                bits: bits.into_boxed_slice(),
-            })
-        }
-
-
         /// Writes the BitLine to the given file from the current position.
-        pub fn save(&self, file: &mut File) -> PyResult<()> {
+        pub fn save<W: Write>(&self, mut file: W) -> PyResult<()> {
             file.write_all(&self.bits)?;
             Ok(())
         }

We might also want to consider doing something like allowing any of str, bytes, or any python file-like to the load function instead of adding a new method.

psychomario · 2023-10-12T20:33:20Z

I rolled in your suggestions for the common abstraction, I wasn't sure how to do that so thanks!

I've also implemented file-like objects using omerbenamram/pyo3-file, now all 3 of the following work as expected:

bf.save("./test.bin")
fnval = open("test.bin", "rb").read()

bioout = io.BytesIO()
bf.save(bioout)
bioout.seek(0)
bioval = bioout.read()

byteval = bf.save()

assert fnval == bioval
assert fnval == byteval

I'll look into doing the same for load() some time tomorrow, and tidying up the matches for the different argument types.

psychomario · 2023-10-14T11:47:18Z

I've now implemented load(), I'm not entirely sure using a Vec<u8> is the right way to do it but I was having borrower issues with an &[u8] and this seems to work.

The stub typedef is technically incorrect for save as source is fully optional and shouldn't default to None, but I don't know how to write that and seems to work file.

Dr-Emann · 2023-10-15T01:30:17Z

Hoping we can get this change I propopsed to pyo3-file which would let us derive FromPyObject like this: https://github.com/omerbenamram/pyo3-file/pull/14/files#diff-05ee6040af62295209b4d9841e78686a6aa3849102be3cc5656820d18157903aR10-R19.

Dr-Emann · 2023-10-15T01:31:38Z

rbloom.pyi

-    # save to file, see section "Persistence"
-    def save(self, filepath: str) -> None: ...
+    # save to file, file object, or return bytes, see section "Persistence"
+    def save(self, source: Optional[Union[str, BinaryIO]] = None) -> Union[bytes, None]: ...


You can type the stub more accurately with the @overload decorator:

import typing.overload @overload def save(self, dest: Union[str, BinaryIO]) -> None: ... @overload def save(self, dest: None = None) -> bytes: ...

Also, when we can update to pyo3 0.20, using PathBuf will mean we also can take os.PathLike (PyO3/pyo3#3374)

src/lib.rs

psychomario · 2023-10-15T11:07:34Z

Thanks for the additional feedback
Added the changes bar pyo3 bump (PathBuf).
Taking the pyo3_file changes from your branch for testing, until it's merged.

KenanHanke · 2023-10-15T11:46:59Z

Hey guys, just wanted to let you know that I won't be able to take a look at the suggested changes this week due to a looming deadline, but that I definitely will as soon as I can and that I am always super grateful for every contribution!!

KenanHanke · 2023-12-13T10:42:53Z

The API

Hi, I'm not 100% comfortable with the new API yet. My main issue lies with the fact that the save method currently has two completely different syntaxes, one that takes a filename and returns nothing and one that takes nothing and returns bytes. This seems like it's crying for people to forget to pass arguments and/or forget to use the return value and then fail in confusing ways without raising an error. The load method seems mostly fine, though some might debate that taking a str and bytes for the same argument with a very different significance to the function execution might also be confusing in some cases. I also thought about additional options, like implementing a __bytes__ or completely changing to to_bytes and from_bytes mimicking the respective native int methods, but right now I feel that save_bytes and load_bytes, as they were originally suggested by @psychomario , would probably integrate with the existing API the best.

The abstraction

I also believe that adding the abstraction on top, while technically elegant, would introduce an additional layer of complexity that makes the code less accessible. I appreciate @Dr-Emann 's obvious affinity to this kind of elegant complexity, but personally feel that the abstraction doesn't really improve code quality (by my subjective point of view) in a meaningful way.

Conclusion

I want to give you guys an opportunity to voice your opinions before I make my final decision, but right now I'm thinking about making some minor, inconsequential changes to @psychomario 's original two commits myself (including adding a small portion to the README) and then only committing those without the abstraction.

KenanHanke · 2024-01-19T17:13:52Z

I merged the original two commits (save_bytes & load_bytes implementation as well as tests), made some minor modifications to the code, and wrote documentation. I am therefore now closing this. Thank you for the contribution!

Rory McNamara added 2 commits October 12, 2023 18:27

implement save_bytes and load_bytes

4825611

add test for save_bytes & load_bytes

7c5f821

Rory McNamara added 2 commits October 12, 2023 20:17

add save and load abstractions

c1f067a

implement save for filepath, filelike object and bytes

429c0ff

Rory McNamara added 2 commits October 14, 2023 12:37

implement load() for path or fileobj or bytes

8e6922d

fixup comments, add fileobj test

fa05d1e

Dr-Emann reviewed Oct 15, 2023

View reviewed changes

latest changes, using pyo3_file fork temporarily

e6575b9

KenanHanke closed this Jan 19, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Allow the saving and loading to a byte string #6

Allow the saving and loading to a byte string #6

psychomario commented Oct 12, 2023

Dr-Emann commented Oct 12, 2023 •

edited

Loading

psychomario commented Oct 12, 2023

psychomario commented Oct 14, 2023

Dr-Emann commented Oct 15, 2023

Dr-Emann Oct 15, 2023

Dr-Emann Oct 15, 2023

psychomario commented Oct 15, 2023

KenanHanke commented Oct 15, 2023

KenanHanke commented Dec 13, 2023

KenanHanke commented Jan 19, 2024

Allow the saving and loading to a byte string #6

Allow the saving and loading to a byte string #6

Conversation

psychomario commented Oct 12, 2023

Dr-Emann commented Oct 12, 2023 • edited Loading

psychomario commented Oct 12, 2023

psychomario commented Oct 14, 2023

Dr-Emann commented Oct 15, 2023

Dr-Emann Oct 15, 2023

Choose a reason for hiding this comment

Dr-Emann Oct 15, 2023

Choose a reason for hiding this comment

psychomario commented Oct 15, 2023

KenanHanke commented Oct 15, 2023

KenanHanke commented Dec 13, 2023

KenanHanke commented Jan 19, 2024

Dr-Emann commented Oct 12, 2023 •

edited

Loading