Skip to content

Commit

Permalink
refactor!: change RawBytesOffsets into a validated newtype (#137)
Browse files Browse the repository at this point in the history
Closes #132
  • Loading branch information
LDeakin authored Jan 22, 2025
1 parent 091019b commit 177f701
Show file tree
Hide file tree
Showing 9 changed files with 167 additions and 26 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `ArrayToBytesCodecTraits::decode_into`
- `zarrs::array::copy_fill_value_into`
- `zarrs::array::update_array_bytes`
- **Breaking**: change `RawBytesOffsets` into a validated newtype

## [0.19.1] - 2025-01-19

Expand Down
2 changes: 1 addition & 1 deletion zarrs/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ pub use self::{
array_builder::ArrayBuilder,
array_bytes::{
copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesError, RawBytes,
RawBytesOffsets,
RawBytesOffsets, RawBytesOffsetsCreateError,
},
array_bytes_fixed_disjoint_view::{
ArrayBytesFixedDisjointView, ArrayBytesFixedDisjointViewCreateError,
Expand Down
47 changes: 29 additions & 18 deletions zarrs/src/array/array_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ use super::{
ravel_indices, ArrayBytesFixedDisjointView, ArraySize, DataType, FillValue,
};

mod raw_bytes_offsets;
pub use raw_bytes_offsets::{RawBytesOffsets, RawBytesOffsetsCreateError};

/// Array element bytes.
///
/// These can represent:
Expand All @@ -23,11 +26,6 @@ use super::{
/// - Encoded array bytes after an array to bytes or bytes to bytes codecs.
pub type RawBytes<'a> = Cow<'a, [u8]>;

/// Array element byte offsets.
///
/// These must be monotonically increasing. See [`ArrayBytes::Variable`].
pub type RawBytesOffsets<'a> = Cow<'a, [usize]>; // FIXME: Switch to a validated newtype in zarrs 0.20

/// Fixed or variable length array bytes.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ArrayBytes<'a> {
Expand Down Expand Up @@ -60,15 +58,10 @@ impl<'a> ArrayBytes<'a> {
}

/// Create a new variable length array bytes from `bytes` and `offsets`.
pub fn new_vlen(
bytes: impl Into<RawBytes<'a>>,
offsets: impl Into<RawBytesOffsets<'a>>, // FIXME: TryInto
) -> Self {
Self::Variable(bytes.into(), offsets.into())
pub fn new_vlen(bytes: impl Into<RawBytes<'a>>, offsets: RawBytesOffsets<'a>) -> Self {
Self::Variable(bytes.into(), offsets)
}

// TODO: new_vlen_unchecked

/// Create a new [`ArrayBytes`] with `num_elements` composed entirely of the `fill_value`.
///
/// # Panics
Expand All @@ -85,12 +78,14 @@ impl<'a> ArrayBytes<'a> {
}
ArraySize::Variable { num_elements } => {
let num_elements = usize::try_from(num_elements).unwrap();
Self::new_vlen(
fill_value.as_ne_bytes().repeat(num_elements),
(0..=num_elements)
.map(|i| i * fill_value.size())
.collect::<Vec<_>>(),
)
Self::new_vlen(fill_value.as_ne_bytes().repeat(num_elements), unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(
(0..=num_elements)
.map(|i| i * fill_value.size())
.collect::<Vec<_>>(),
)
})
}
}
}
Expand Down Expand Up @@ -207,6 +202,10 @@ impl<'a> ArrayBytes<'a> {
ss_bytes.extend_from_slice(&bytes[curr..next]);
}
ss_offsets.push(ss_bytes.len());
let ss_offsets = unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(ss_offsets)
};
Ok(ArrayBytes::new_vlen(ss_bytes, ss_offsets))
}
ArrayBytes::Fixed(bytes) => {
Expand Down Expand Up @@ -334,6 +333,10 @@ pub(crate) fn update_bytes_vlen<'a>(
}
}
offsets_new.push(bytes_new.len());
let offsets_new = unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(offsets_new)
};

Ok(ArrayBytes::new_vlen(bytes_new, offsets_new))
}
Expand Down Expand Up @@ -438,6 +441,10 @@ pub(crate) fn merge_chunks_vlen<'a>(
*acc += sz;
Some(*acc)
}));
let offsets = unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(offsets)
};

// Write bytes
// TODO: Go parallel
Expand Down Expand Up @@ -485,6 +492,10 @@ pub(crate) fn extract_decoded_regions_vlen<'a>(
region_bytes.extend_from_slice(&bytes[curr..next]);
}
region_offsets.push(region_bytes.len());
let region_offsets = unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(region_offsets)
};
out.push(ArrayBytes::new_vlen(region_bytes, region_offsets));
}
Ok(out)
Expand Down
108 changes: 108 additions & 0 deletions zarrs/src/array/array_bytes/raw_bytes_offsets.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
use std::{borrow::Cow, ops::Deref};

use derive_more::derive::Display;
use thiserror::Error;

/// Array element byte offsets.
///
/// These must be monotonically increasing. See [`ArrayBytes::Variable`](crate::array::ArrayBytes::Variable).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RawBytesOffsets<'a>(Cow<'a, [usize]>);

impl Deref for RawBytesOffsets<'_> {
type Target = [usize];

fn deref(&self) -> &Self::Target {
&self.0
}
}

/// An error creating [`RawBytesOffsets`].
///
/// This error occurs when the offsets are not monotonically increasing.
#[derive(Debug, Error, Display)]
pub struct RawBytesOffsetsCreateError;

impl<'a> RawBytesOffsets<'a> {
/// Creates a new `RawBytesOffsets`.
///
/// # Errors
/// Returns an error if the offsets are not monotonically increasing.
pub fn new(offsets: impl Into<Cow<'a, [usize]>>) -> Result<Self, RawBytesOffsetsCreateError> {
let offsets = offsets.into();
if offsets.windows(2).all(|w| w[1] >= w[0]) {
Ok(Self(offsets))
} else {
Err(RawBytesOffsetsCreateError)
}
}

/// Creates a new `RawBytesOffsets` without checking the offsets.
///
/// # Safety
/// The offsets must be monotonically increasing.
#[must_use]
pub unsafe fn new_unchecked(offsets: impl Into<Cow<'a, [usize]>>) -> Self {
let offsets = offsets.into();
debug_assert!(offsets.windows(2).all(|w| w[1] >= w[0]));
Self(offsets)
}

/// Clones the offsets if not already owned.
#[must_use]
pub fn into_owned(self) -> RawBytesOffsets<'static> {
RawBytesOffsets(self.0.into_owned().into())
}
}

impl<'a> TryFrom<Cow<'a, [usize]>> for RawBytesOffsets<'a> {
type Error = RawBytesOffsetsCreateError;

fn try_from(value: Cow<'a, [usize]>) -> Result<Self, Self::Error> {
Self::new(value)
}
}

impl<'a> TryFrom<&'a [usize]> for RawBytesOffsets<'a> {
type Error = RawBytesOffsetsCreateError;

fn try_from(value: &'a [usize]) -> Result<Self, Self::Error> {
Self::new(value)
}
}

impl<'a, const N: usize> TryFrom<&'a [usize; N]> for RawBytesOffsets<'a> {
type Error = RawBytesOffsetsCreateError;

fn try_from(value: &'a [usize; N]) -> Result<Self, Self::Error> {
Self::new(value)
}
}

impl TryFrom<Vec<usize>> for RawBytesOffsets<'_> {
type Error = RawBytesOffsetsCreateError;

fn try_from(value: Vec<usize>) -> Result<Self, Self::Error> {
Self::new(value)
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn raw_bytes_offsets() {
let offsets = RawBytesOffsets::new(vec![0, 1, 2, 3]).unwrap();
assert_eq!(&*offsets, &[0, 1, 2, 3]);
assert!(RawBytesOffsets::new(vec![0, 1, 1]).is_ok());
assert!(RawBytesOffsets::new(vec![0, 1, 0]).is_err());
assert!(RawBytesOffsets::try_from(vec![0, 1, 2]).is_ok());
assert!(RawBytesOffsets::try_from(vec![0, 1, 0]).is_err());
assert!(RawBytesOffsets::try_from([0, 1, 2].as_slice()).is_ok());
assert!(RawBytesOffsets::try_from([0, 1, 0].as_slice()).is_err());
assert!(RawBytesOffsets::try_from(&[0, 1, 2]).is_ok());
assert!(RawBytesOffsets::try_from(&[0, 1, 0]).is_err());
assert!(RawBytesOffsets::try_from(Cow::Owned(vec![0, 1, 0])).is_err());
}
}
9 changes: 6 additions & 3 deletions zarrs/src/array/codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ use std::borrow::Cow;
use std::sync::Arc;

use super::{
concurrency::RecommendedConcurrency, BytesRepresentation, ChunkRepresentation, ChunkShape,
DataType,
array_bytes::RawBytesOffsetsCreateError, concurrency::RecommendedConcurrency, ArrayBytes,
ArrayBytesFixedDisjointView, BytesRepresentation, ChunkRepresentation, ChunkShape, DataType,
RawBytes,
};
use super::{ArrayBytes, ArrayBytesFixedDisjointView, RawBytes};

/// A codec plugin.
pub type CodecPlugin = Plugin<Codec>;
Expand Down Expand Up @@ -1060,6 +1060,9 @@ pub enum CodecError {
/// Subset out of bounds.
#[error(transparent)]
SubsetOutOfBounds(#[from] SubsetOutOfBoundsError),
/// Invalid byte offsets for variable length data.
#[error(transparent)]
RawBytesOffsetsCreate(#[from] RawBytesOffsetsCreateError),
}

impl From<&str> for CodecError {
Expand Down
4 changes: 4 additions & 0 deletions zarrs/src/array/codec/array_to_array/transpose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ fn transpose_vlen<'a>(
bytes_new.extend_from_slice(&bytes[curr..next]);
}
offsets_new.push(bytes_new.len());
let offsets_new = unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(offsets_new)
};

ArrayBytes::new_vlen(bytes_new, offsets_new)
}
Expand Down
8 changes: 5 additions & 3 deletions zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::{
CodecOptions, CodecTraits, RecommendedConcurrency,
},
transmute_to_bytes_vec, ArrayBytes, BytesRepresentation, ChunkRepresentation, CodecChain,
DataType, DataTypeSize, Endianness, FillValue, RawBytes,
DataType, DataTypeSize, Endianness, FillValue, RawBytes, RawBytesOffsets,
},
config::global_config,
metadata::v3::{array::codec::vlen::VlenIndexDataType, MetadataV3},
Expand Down Expand Up @@ -265,14 +265,16 @@ impl ArrayToBytesCodecTraits for VlenCodec {
}
}
.unwrap();
let (data, index) = super::get_vlen_bytes_and_offsets(
let (data, offsets) = super::get_vlen_bytes_and_offsets(
&index_chunk_rep,
&bytes,
&self.index_codecs,
&self.data_codecs,
options,
)?;
Ok(ArrayBytes::new_vlen(data, index))
let offsets = RawBytesOffsets::new(offsets)?;

Ok(ArrayBytes::new_vlen(data, offsets))
}

fn partial_decoder(
Expand Down
2 changes: 2 additions & 0 deletions zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::{
RecommendedConcurrency,
},
ArrayBytes, BytesRepresentation, ChunkRepresentation, DataTypeSize, RawBytes,
RawBytesOffsets,
},
config::global_config,
metadata::v3::MetadataV3,
Expand Down Expand Up @@ -110,6 +111,7 @@ impl ArrayToBytesCodecTraits for VlenV2Codec {
) -> Result<ArrayBytes<'a>, CodecError> {
let num_elements = decoded_representation.num_elements_usize();
let (bytes, offsets) = super::get_interleaved_bytes_and_offsets(num_elements, &bytes)?;
let offsets = RawBytesOffsets::new(offsets)?;
Ok(ArrayBytes::new_vlen(bytes, offsets))
}

Expand Down
12 changes: 11 additions & 1 deletion zarrs/src/array/element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ use std::mem::ManuallyDrop;
use itertools::Itertools;
use ArrayError::IncompatibleElementType as IET;

use super::{convert_from_bytes_slice, transmute_to_bytes, ArrayBytes, ArrayError, DataType};
use super::{
convert_from_bytes_slice, transmute_to_bytes, ArrayBytes, ArrayError, DataType, RawBytesOffsets,
};

/// A trait representing an array element type.
pub trait Element: Sized + Clone {
Expand Down Expand Up @@ -184,6 +186,10 @@ macro_rules! impl_element_string {
len = len.checked_add(element.len()).unwrap();
}
offsets.push(len);
let offsets = unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(offsets)
};

// Concatenate bytes
let mut bytes = Vec::with_capacity(usize::try_from(len).unwrap());
Expand Down Expand Up @@ -238,6 +244,10 @@ macro_rules! impl_element_binary {
len = len.checked_add(element.len()).unwrap();
}
offsets.push(len);
let offsets = unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(offsets)
};

// Concatenate bytes
let bytes = elements.concat();
Expand Down

0 comments on commit 177f701

Please sign in to comment.