From e1fb10c7bd1b3d422e11615a74059a4adadc2dbf Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 12 Nov 2024 08:40:44 +0100 Subject: [PATCH 1/2] Improve default implementation of Array::is_nullable Since is_nullable returns `false` if the array is guaranteed to not contain any logical nulls, the correct default implementation could leverage `self.logical_null_count` more appropriately than `self.null_count`. To reduce chance of negative surprises in downstream projects, we could introduce the new behavior under `is_logically_nullable` name and deprecate `is_nullable` without changing it. --- arrow-array/src/array/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 04d9883f5bd8..87577166ea3d 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -317,8 +317,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// even if the nulls present in [`DictionaryArray::values`] are not referenced by any key, /// and therefore would not appear in [`Array::logical_nulls`]. fn is_nullable(&self) -> bool { - // TODO this is not necessarily perfect default implementation, since null_count() and logical_null_count() are not always equivalent - self.null_count() != 0 + self.logical_null_count() != 0 } /// Returns the total number of bytes of memory pointed to by this array. From 9e6fbf8f434eeb5ffcf56aa2162f957b2f8ac62a Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Sat, 16 Nov 2024 22:53:36 +0100 Subject: [PATCH 2/2] Implement efficient DictionaryArray::logical_null_count --- arrow-array/src/array/dictionary_array.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 920be65fcf5f..55ecd56f987e 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -749,6 +749,26 @@ impl Array for DictionaryArray { } } + fn logical_null_count(&self) -> usize { + match (self.keys.nulls(), self.values.logical_nulls()) { + (None, None) => 0, + (Some(key_nulls), None) => key_nulls.null_count(), + (None, Some(value_nulls)) => self + .keys + .values() + .iter() + .filter(|k| value_nulls.is_null(k.as_usize())) + .count(), + (Some(key_nulls), Some(value_nulls)) => self + .keys + .values() + .iter() + .enumerate() + .filter(|(idx, k)| key_nulls.is_null(*idx) || value_nulls.is_null(k.as_usize())) + .count(), + } + } + fn is_nullable(&self) -> bool { !self.is_empty() && (self.nulls().is_some() || self.values.is_nullable()) }