From 820843ff597161c9cdacd0e79cecf20d05755081 Mon Sep 17 00:00:00 2001 From: Edmondo Porcu Date: Mon, 8 Apr 2024 06:31:10 -0400 Subject: [PATCH] Removes Bloom filter for Int8/Int16/Uint8/Uint16 (#9969) * Removing broken tests * Simplifying tests / removing support for failed tests * Revert "Simplifying tests / removing support for failed tests" This reverts commit 6e50a8064436943d9f42d313cef2c2b017d196f1. * Fixing tests for real * Apply suggestions from code review Thanks @alamb ! Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- .../physical_plan/parquet/row_groups.rs | 4 -- .../core/tests/parquet/row_group_pruning.rs | 54 +++++++++---------- 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs index 8df4925fc566..6600dd07d7fd 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs @@ -232,12 +232,8 @@ impl PruningStatistics for BloomFilterStatistics { ScalarValue::Float32(Some(v)) => sbbf.check(v), ScalarValue::Int64(Some(v)) => sbbf.check(v), ScalarValue::Int32(Some(v)) => sbbf.check(v), - ScalarValue::Int16(Some(v)) => sbbf.check(v), - ScalarValue::Int8(Some(v)) => sbbf.check(v), ScalarValue::UInt64(Some(v)) => sbbf.check(v), ScalarValue::UInt32(Some(v)) => sbbf.check(v), - ScalarValue::UInt16(Some(v)) => sbbf.check(v), - ScalarValue::UInt8(Some(v)) => sbbf.check(v), ScalarValue::Decimal128(Some(v), p, s) => match parquet_type { Type::INT32 => { //https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42 diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index b7b434d1c3d3..8fc7936552af 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -290,7 +290,7 @@ async fn prune_disabled() { // https://github.com/apache/arrow-datafusion/issues/9779 bug so that tests pass // if and only if Bloom filters on Int8 and Int16 columns are still buggy. macro_rules! int_tests { - ($bits:expr, correct_bloom_filters: $correct_bloom_filters:expr) => { + ($bits:expr) => { paste::item! { #[tokio::test] async fn []() { @@ -329,9 +329,9 @@ macro_rules! int_tests { .with_expected_errors(Some(0)) .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(3)) - .with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 })) - .with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 })) - .with_expected_rows(if $correct_bloom_filters { 1 } else { 0 }) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) .test_row_group_prune() .await; } @@ -343,9 +343,9 @@ macro_rules! int_tests { .with_expected_errors(Some(0)) .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(3)) - .with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 })) - .with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 })) - .with_expected_rows(if $correct_bloom_filters { 1 } else { 0 }) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) .test_row_group_prune() .await; } @@ -404,9 +404,9 @@ macro_rules! int_tests { .with_expected_errors(Some(0)) .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(3)) - .with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 })) - .with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 })) - .with_expected_rows(if $correct_bloom_filters { 1 } else { 0 }) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) .test_row_group_prune() .await; } @@ -447,17 +447,16 @@ macro_rules! int_tests { }; } -int_tests!(8, correct_bloom_filters: false); -int_tests!(16, correct_bloom_filters: false); -int_tests!(32, correct_bloom_filters: true); -int_tests!(64, correct_bloom_filters: true); +// int8/int16 are incorrect: https://github.com/apache/arrow-datafusion/issues/9779 +int_tests!(32); +int_tests!(64); // $bits: number of bits of the integer to test (8, 16, 32, 64) // $correct_bloom_filters: if false, replicates the // https://github.com/apache/arrow-datafusion/issues/9779 bug so that tests pass // if and only if Bloom filters on UInt8 and UInt16 columns are still buggy. macro_rules! uint_tests { - ($bits:expr, correct_bloom_filters: $correct_bloom_filters:expr) => { + ($bits:expr) => { paste::item! { #[tokio::test] async fn []() { @@ -482,9 +481,9 @@ macro_rules! uint_tests { .with_expected_errors(Some(0)) .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(3)) - .with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 })) - .with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 })) - .with_expected_rows(if $correct_bloom_filters { 1 } else { 0 }) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) .test_row_group_prune() .await; } @@ -496,9 +495,9 @@ macro_rules! uint_tests { .with_expected_errors(Some(0)) .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(3)) - .with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 })) - .with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 })) - .with_expected_rows(if $correct_bloom_filters { 1 } else { 0 }) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) .test_row_group_prune() .await; } @@ -542,9 +541,9 @@ macro_rules! uint_tests { .with_expected_errors(Some(0)) .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(3)) - .with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 })) - .with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 })) - .with_expected_rows(if $correct_bloom_filters { 1 } else { 0 }) + .with_matched_by_bloom_filter(Some(1)) + .with_pruned_by_bloom_filter(Some(0)) + .with_expected_rows(1) .test_row_group_prune() .await; } @@ -585,10 +584,9 @@ macro_rules! uint_tests { }; } -uint_tests!(8, correct_bloom_filters: false); -uint_tests!(16, correct_bloom_filters: false); -uint_tests!(32, correct_bloom_filters: true); -uint_tests!(64, correct_bloom_filters: true); +// uint8/uint16 are incorrect: https://github.com/apache/arrow-datafusion/issues/9779 +uint_tests!(32); +uint_tests!(64); #[tokio::test] async fn prune_int32_eq_large_in_list() {