From 44439d961e9a0a5236175702bdcc0e4fac076f00 Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Fri, 11 Oct 2024 11:52:01 +0200 Subject: [PATCH] fix: Properly handle non-nullable nested Parquet (#19192) --- .../arrow/read/deserialize/nested_utils.rs | 34 ++++++++++++++++--- py-polars/tests/unit/io/test_parquet.py | 29 ++++++++++++++++ 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs index ab769848ca92..a622153dfca8 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs @@ -156,9 +156,35 @@ impl Nested { if is_valid && self.num_invalids != 0 { debug_assert!(!is_primitive); - let validity = self.validity.as_mut().unwrap(); - validity.extend_constant(self.num_valids, true); - validity.extend_constant(self.num_invalids, false); + // @NOTE: Having invalid items might not necessarily mean that we have a validity mask. + // + // For instance, if we have a optional struct with a required list in it, that struct + // will have a validity mask and the list will not. In the arrow representation of this + // array, however, the list will still have invalid items where the struct is null. + // + // Array: + // [ + // { 'x': [1] }, + // None, + // { 'x': [1, 2] }, + // ] + // + // Arrow: + // struct = [ list[0] None list[2] ] + // list = { + // values = [ 1, 1, 2 ], + // offsets = [ 0, 1, 1, 3 ], + // } + // + // Parquet: + // [ 1, 1, 2 ] + definition + repetition levels + // + // As you can see we need to insert an invalid item into the list even though it does + // not have a validity mask. + if let Some(validity) = self.validity.as_mut() { + validity.extend_constant(self.num_valids, true); + validity.extend_constant(self.num_invalids, false); + } self.num_valids = 0; self.num_invalids = 0; @@ -174,8 +200,6 @@ impl Nested { } fn push_default(&mut self, length: i64) { - debug_assert!(self.validity.is_some()); - let is_primitive = matches!(self.content, NestedContent::Primitive); self.num_invalids += usize::from(!is_primitive); diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 8431c659cce0..850bf61d978b 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -625,6 +625,10 @@ def test_parquet_rle_non_nullable_12814() -> None: pq.write_table(table, f, data_page_size=1) f.seek(0) + print(pq.read_table(f)) + + f.seek(0) + expect = pl.DataFrame(table).tail(10) actual = pl.read_parquet(f).tail(10) @@ -1961,3 +1965,28 @@ def test_allow_missing_columns( .collect(streaming=streaming), expected, ) + + +def test_nested_nonnullable_19158() -> None: + # Bug is based on the top-level struct being nullable and the inner list + # not being nullable. + tbl = pa.table( + { + "a": [{"x": [1]}, None, {"x": [1, 2]}, None], + }, + schema=pa.schema( + [ + pa.field( + "a", + pa.struct([pa.field("x", pa.list_(pa.int8()), nullable=False)]), + nullable=True, + ) + ] + ), + ) + + f = io.BytesIO() + pq.write_table(tbl, f) + + f.seek(0) + assert_frame_equal(pl.read_parquet(f), pl.DataFrame(tbl))