Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixed an issue with dimension separators #19

Merged
merged 2 commits into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/async_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ where
pos,
&cols,
self.meta.get_real_dims(pos),
self.meta.get_separators(),
self.meta.get_chunk_patterns(),
)
.await;

Expand Down
73 changes: 56 additions & 17 deletions src/async_reader/zarr_read_async.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use std::collections::HashMap;
use std::fs::{read, read_to_string};
use std::sync::Arc;

use crate::reader::metadata::ChunkSeparator;
use crate::reader::metadata::{ChunkPattern, ChunkSeparator};
use crate::reader::{ZarrError, ZarrResult};
use crate::reader::{ZarrInMemoryChunk, ZarrStoreMetadata};

Expand All @@ -40,7 +40,7 @@ pub trait ZarrReadAsync<'a> {
position: &'a [usize],
cols: &'a [String],
real_dims: Vec<usize>,
separators: HashMap<String, ChunkSeparator>,
patterns: HashMap<String, ChunkPattern>,
) -> ZarrResult<ZarrInMemoryChunk>;
}

Expand Down Expand Up @@ -99,26 +99,49 @@ impl<'a> ZarrReadAsync<'a> for ZarrPath {
position: &'a [usize],
cols: &'a [String],
real_dims: Vec<usize>,
separators: HashMap<String, ChunkSeparator>,
patterns: HashMap<String, ChunkPattern>,
) -> ZarrResult<ZarrInMemoryChunk> {
let mut chunk = ZarrInMemoryChunk::new(real_dims);
for var in cols {
let s: Vec<String> = position.iter().map(|i| i.to_string()).collect();
let separator = separators
let pattern = patterns
.get(var.as_str())
.ok_or(ZarrError::InvalidMetadata(
"Could not find separator for column".to_string(),
))?;

let p = match separator {
ChunkSeparator::Period => self.location.child(var.to_string()).child(s.join(".")),
ChunkSeparator::Slash => {
let mut partial_path = self.location.child(var.to_string()).child("c");
for idx in s {
partial_path = partial_path.child(idx);
let p = match pattern {
ChunkPattern {
separator: sep,
c_prefix: false,
} => match sep {
ChunkSeparator::Period => {
self.location.child(var.to_string()).child(s.join("."))
}
partial_path
}
ChunkSeparator::Slash => {
let mut path = self.location.child(var.to_string());
for idx in s {
path = path.child(idx);
}
path
}
},
ChunkPattern {
separator: sep,
c_prefix: true,
} => match sep {
ChunkSeparator::Period => self
.location
.child(var.to_string())
.child("c.".to_string() + &s.join(".")),
ChunkSeparator::Slash => {
let mut path = self.location.child(var.to_string()).child("c");
for idx in s {
path = path.child(idx);
}
path
}
},
};
let get_res = self.store.get(&p).await?;
let data = match get_res.payload {
Expand Down Expand Up @@ -165,7 +188,10 @@ mod zarr_read_async_tests {
&ZarrArrayMetadata::new(
2,
ZarrDataType::UInt(1),
ChunkSeparator::Period,
ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false
},
None,
vec![ZarrCodec::Bytes(Endianness::Little)],
)
Expand All @@ -175,7 +201,10 @@ mod zarr_read_async_tests {
&ZarrArrayMetadata::new(
2,
ZarrDataType::Float(8),
ChunkSeparator::Period,
ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false
},
None,
vec![ZarrCodec::Bytes(Endianness::Little)],
)
Expand All @@ -197,7 +226,7 @@ mod zarr_read_async_tests {
&pos,
meta.get_columns(),
meta.get_real_dims(&pos),
meta.get_separators(),
meta.get_chunk_patterns(),
)
.await
.unwrap();
Expand All @@ -214,7 +243,12 @@ mod zarr_read_async_tests {
let col_proj = ZarrProjection::skip(vec!["float_data".to_string()]);
let cols = col_proj.apply_selection(meta.get_columns()).unwrap();
let chunk = store
.get_zarr_chunk(&pos, &cols, meta.get_real_dims(&pos), meta.get_separators())
.get_zarr_chunk(
&pos,
&cols,
meta.get_real_dims(&pos),
meta.get_chunk_patterns(),
)
.await
.unwrap();
assert_eq!(
Expand All @@ -226,7 +260,12 @@ mod zarr_read_async_tests {
let col_proj = ZarrProjection::keep(vec!["float_data".to_string()]);
let cols = col_proj.apply_selection(meta.get_columns()).unwrap();
let chunk = store
.get_zarr_chunk(&pos, &cols, meta.get_real_dims(&pos), meta.get_separators())
.get_zarr_chunk(
&pos,
&cols,
meta.get_real_dims(&pos),
meta.get_chunk_patterns(),
)
.await
.unwrap();
assert_eq!(
Expand Down
2 changes: 1 addition & 1 deletion src/datafusion/table_factory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ use datafusion::{

use super::table_provider::{ListingZarrTableConfig, ListingZarrTableOptions, ZarrTableProvider};

struct ZarrListingTableFactory {}
pub struct ZarrListingTableFactory {}

#[async_trait]
impl TableProviderFactory for ZarrListingTableFactory {
Expand Down
83 changes: 69 additions & 14 deletions src/reader/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,20 @@ impl FromStr for ChunkSeparator {
}
}

// Struct for the chunk file pattern
#[derive(Debug, PartialEq, Clone)]
pub struct ChunkPattern {
pub(crate) separator: ChunkSeparator,
pub(crate) c_prefix: bool,
}

/// The metadata for a single zarr array, which holds various parameters
/// for the data stored in the array.
#[derive(Debug, PartialEq, Clone)]
pub struct ZarrArrayMetadata {
zarr_format: u8,
data_type: ZarrDataType,
chunk_separator: ChunkSeparator,
chunk_pattern: ChunkPattern,
sharding_options: Option<ShardingOptions>,
codecs: Vec<ZarrCodec>,
}
Expand All @@ -165,14 +172,14 @@ impl ZarrArrayMetadata {
pub(crate) fn new(
zarr_format: u8,
data_type: ZarrDataType,
chunk_separator: ChunkSeparator,
chunk_pattern: ChunkPattern,
sharding_options: Option<ShardingOptions>,
codecs: Vec<ZarrCodec>,
) -> Self {
Self {
zarr_format,
data_type,
chunk_separator,
chunk_pattern,
sharding_options,
codecs,
}
Expand All @@ -186,8 +193,8 @@ impl ZarrArrayMetadata {
self.sharding_options.clone()
}

pub(crate) fn get_separator(&self) -> ChunkSeparator {
self.chunk_separator.clone()
pub(crate) fn get_chunk_pattern(&self) -> ChunkPattern {
self.chunk_pattern.clone()
}
}

Expand Down Expand Up @@ -248,6 +255,22 @@ fn extract_string_from_json(map: &Value, key: &str, err_str: &str) -> ZarrResult
Ok(res.to_string())
}

fn extract_optional_string_from_json(
map: &Value,
key: &str,
err_str: &str,
) -> ZarrResult<Option<String>> {
let res = map.get(key);
if let Some(val) = res {
let val = val
.as_str()
.ok_or(ZarrError::InvalidMetadata(err_str.to_string()))?;
return Ok(Some(val.to_string()));
}

Ok(None)
}

fn extract_u64_from_json(map: &Value, key: &str, err_str: &str) -> ZarrResult<u64> {
let res = map
.get(key)
Expand Down Expand Up @@ -361,6 +384,12 @@ impl ZarrStoreMetadata {
let dtype = extract_string_from_json(&meta_map, "dtype", error_string)?;
let data_type = extract_type_v2(&dtype)?;

// parse dimenstion separator
let error_string = "error parsing metadata dimension separator";
let maybe_sep =
extract_optional_string_from_json(&meta_map, "dimension_separator", error_string)?;
let dim_separator = maybe_sep.unwrap_or(".".to_string());

// parse endianness
let endianness = match dtype.chars().next().unwrap() {
'<' | '|' => Endianness::Little,
Expand Down Expand Up @@ -407,7 +436,10 @@ impl ZarrStoreMetadata {
let array_meta = ZarrArrayMetadata {
zarr_format: 2,
data_type,
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::from_str(&dim_separator)?,
c_prefix: false,
},
sharding_options: None,
codecs,
};
Expand Down Expand Up @@ -623,6 +655,11 @@ impl ZarrStoreMetadata {
let (_, config) = extract_config(chunk_key_encoding)?;
let chunk_key_encoding = extract_string_from_json(config, "separator", error_string)?;
let chunk_key_encoding = ChunkSeparator::from_str(&chunk_key_encoding)?;
let c_prefix = chunk_key_encoding == ChunkSeparator::Slash;
let chunk_key_encoding = ChunkPattern {
separator: chunk_key_encoding,
c_prefix,
};

// codecs
let codec_configs = meta_map
Expand Down Expand Up @@ -766,12 +803,12 @@ impl ZarrStoreMetadata {
self.chunks.as_ref().unwrap()
}

pub(crate) fn get_separators(&self) -> HashMap<String, ChunkSeparator> {
pub(crate) fn get_chunk_patterns(&self) -> HashMap<String, ChunkPattern> {
let mut m = HashMap::new();
for col in &self.columns {
m.insert(
col.to_string(),
self.get_array_meta(col).unwrap().get_separator(),
self.get_array_meta(col).unwrap().get_chunk_pattern(),
);
}

Expand Down Expand Up @@ -857,7 +894,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 2,
data_type: ZarrDataType::Int(4),
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Bytes(Endianness::Little),
Expand All @@ -876,7 +916,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 2,
data_type: ZarrDataType::TimeStamp(8, "ms".to_string()),
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Bytes(Endianness::Little),
Expand All @@ -895,7 +938,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 2,
data_type: ZarrDataType::Bool,
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Transpose(vec![1, 0]),
Expand All @@ -909,7 +955,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 2,
data_type: ZarrDataType::FixedLengthString(112),
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Bytes(Endianness::Big),
Expand Down Expand Up @@ -1077,7 +1126,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 3,
data_type: ZarrDataType::Int(4),
chunk_separator: ChunkSeparator::Slash,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Slash,
c_prefix: true,
},
sharding_options: None,
codecs: vec![
ZarrCodec::Bytes(Endianness::Little),
Expand Down Expand Up @@ -1143,7 +1195,10 @@ mod zarr_metadata_v3_tests {
ZarrArrayMetadata {
zarr_format: 3,
data_type: ZarrDataType::Int(4),
chunk_separator: ChunkSeparator::Period,
chunk_pattern: ChunkPattern {
separator: ChunkSeparator::Period,
c_prefix: false,
},
sharding_options: Some(ShardingOptions::new(
vec![4, 4],
vec![2, 2],
Expand Down
2 changes: 1 addition & 1 deletion src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ impl<T: ZarrRead> ZarrIterator for ZarrStore<T> {
pos,
&cols,
self.meta.get_real_dims(pos),
self.meta.get_separators(),
self.meta.get_chunk_patterns(),
);
self.curr_chunk += 1;
Some(chnk)
Expand Down
Loading