Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Properly handle to_physical_repr of nested types #20413

Merged
merged 5 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions crates/polars-core/src/chunked_array/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

mod iterator;

use std::borrow::Cow;

use crate::prelude::*;

impl ArrayChunked {
Expand Down Expand Up @@ -29,6 +31,41 @@ impl ArrayChunked {
fld.coerce(DataType::Array(Box::new(inner_dtype), width))
}

/// Convert the datatype of the array into the physical datatype.
pub fn to_physical_repr(&self) -> Cow<ArrayChunked> {
let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else {
return Cow::Borrowed(self);
};

assert_eq!(self.chunks().len(), physical_repr.chunks().len());

let width = self.width();
let chunks: Vec<_> = self
.downcast_iter()
.zip(physical_repr.into_chunks())
.map(|(chunk, values)| {
FixedSizeListArray::new(
ArrowDataType::FixedSizeList(
Box::new(ArrowField::new(
PlSmallStr::from_static("item"),
values.dtype().clone(),
true,
)),
width,
),
chunk.len(),
values,
chunk.validity().cloned(),
)
.to_boxed()
})
.collect();

let name = self.name().clone();
let dtype = DataType::Array(Box::new(self.inner_dtype().to_physical()), width);
Cow::Owned(unsafe { ArrayChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) })
}

/// Convert a non-logical [`ArrayChunked`] back into a logical [`ArrayChunked`] without casting.
///
/// # Safety
Expand Down
33 changes: 33 additions & 0 deletions crates/polars-core/src/chunked_array/list/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
//! Special list utility methods
pub(super) mod iterator;

use std::borrow::Cow;

use crate::prelude::*;

impl ListChunked {
Expand Down Expand Up @@ -36,6 +38,37 @@ impl ListChunked {
fld.coerce(DataType::List(Box::new(inner_dtype)))
}

/// Convert the datatype of the list into the physical datatype.
pub fn to_physical_repr(&self) -> Cow<ListChunked> {
let Cow::Owned(physical_repr) = self.get_inner().to_physical_repr() else {
return Cow::Borrowed(self);
};

assert_eq!(self.chunks().len(), physical_repr.chunks().len());

let chunks: Vec<_> = self
.downcast_iter()
.zip(physical_repr.into_chunks())
.map(|(chunk, values)| {
LargeListArray::new(
ArrowDataType::LargeList(Box::new(ArrowField::new(
PlSmallStr::from_static("item"),
values.dtype().clone(),
true,
))),
chunk.offsets().clone(),
values,
chunk.validity().cloned(),
)
.to_boxed()
})
.collect();

let name = self.name().clone();
let dtype = DataType::List(Box::new(self.inner_dtype().to_physical()));
Cow::Owned(unsafe { ListChunked::from_chunks_and_dtype_unchecked(name, chunks, dtype) })
}

/// Convert a non-logical [`ListChunked`] back into a logical [`ListChunked`] without casting.
///
/// # Safety
Expand Down
46 changes: 25 additions & 21 deletions crates/polars-core/src/chunked_array/ops/row_encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls(
))
}

pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option<RowEncodingContext> {
/// Get the [`RowEncodingContext`] for a certain [`DataType`].
///
/// This should be given the logical type in order to communicate Polars datatype information down
/// into the row encoding / decoding.
pub fn get_row_encoding_context(dtype: &DataType) -> Option<RowEncodingContext> {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a drive-by change. This name is much better for this function.

match dtype {
DataType::Boolean
| DataType::UInt8
Expand Down Expand Up @@ -104,8 +108,8 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option<RowEncodingContex
},

#[cfg(feature = "dtype-array")]
DataType::Array(dtype, _) => get_row_encoding_dictionary(dtype),
DataType::List(dtype) => get_row_encoding_dictionary(dtype),
DataType::Array(dtype, _) => get_row_encoding_context(dtype),
DataType::List(dtype) => get_row_encoding_context(dtype),
#[cfg(feature = "dtype-categorical")]
DataType::Categorical(revmap, ordering) | DataType::Enum(revmap, ordering) => {
let revmap = revmap.as_ref().unwrap();
Expand Down Expand Up @@ -161,28 +165,28 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option<RowEncodingContex
},
#[cfg(feature = "dtype-struct")]
DataType::Struct(fs) => {
let mut out = Vec::new();
let mut ctxts = Vec::new();

for (i, f) in fs.iter().enumerate() {
if let Some(dict) = get_row_encoding_dictionary(f.dtype()) {
out.reserve(fs.len());
out.extend(std::iter::repeat_n(None, i));
out.push(Some(dict));
if let Some(ctxt) = get_row_encoding_context(f.dtype()) {
ctxts.reserve(fs.len());
ctxts.extend(std::iter::repeat_n(None, i));
ctxts.push(Some(ctxt));
break;
}
}

if out.is_empty() {
if ctxts.is_empty() {
return None;
}

out.extend(
fs[out.len()..]
ctxts.extend(
fs[ctxts.len()..]
.iter()
.map(|f| get_row_encoding_dictionary(f.dtype())),
.map(|f| get_row_encoding_context(f.dtype())),
);

Some(RowEncodingContext::Struct(out))
Some(RowEncodingContext::Struct(ctxts))
},
}
}
Expand All @@ -198,7 +202,7 @@ pub fn encode_rows_unordered(by: &[Column]) -> PolarsResult<BinaryOffsetChunked>
pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult<RowsEncoded> {
let mut cols = Vec::with_capacity(by.len());
let mut opts = Vec::with_capacity(by.len());
let mut dicts = Vec::with_capacity(by.len());
let mut ctxts = Vec::with_capacity(by.len());

// Since ZFS exists, we might not actually have any arrays and need to get the length from the
// columns.
Expand All @@ -210,13 +214,13 @@ pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult<RowsEncoded> {
let by = by.as_materialized_series();
let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed();
let opt = RowEncodingOptions::new_unsorted();
let dict = get_row_encoding_dictionary(by.dtype());
let ctxt = get_row_encoding_context(by.dtype());

cols.push(arr);
opts.push(opt);
dicts.push(dict);
ctxts.push(ctxt);
}
Ok(convert_columns(num_rows, &cols, &opts, &dicts))
Ok(convert_columns(num_rows, &cols, &opts, &ctxts))
}

pub fn _get_rows_encoded(
Expand All @@ -229,7 +233,7 @@ pub fn _get_rows_encoded(

let mut cols = Vec::with_capacity(by.len());
let mut opts = Vec::with_capacity(by.len());
let mut dicts = Vec::with_capacity(by.len());
let mut ctxts = Vec::with_capacity(by.len());

// Since ZFS exists, we might not actually have any arrays and need to get the length from the
// columns.
Expand All @@ -241,13 +245,13 @@ pub fn _get_rows_encoded(
let by = by.as_materialized_series();
let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed();
let opt = RowEncodingOptions::new_sorted(*desc, *null_last);
let dict = get_row_encoding_dictionary(by.dtype());
let ctxt = get_row_encoding_context(by.dtype());

cols.push(arr);
opts.push(opt);
dicts.push(dict);
ctxts.push(ctxt);
}
Ok(convert_columns(num_rows, &cols, &opts, &dicts))
Ok(convert_columns(num_rows, &cols, &opts, &ctxts))
}

pub fn _get_rows_encoded_ca(
Expand Down
Loading
Loading