apache · alamb · Nov 7, 2022 · Nov 7, 2022 · Nov 15, 2022 · Nov 16, 2022
diff --git a/benchmarks/src/bin/parquet_filter_pushdown.rs b/benchmarks/src/bin/parquet_filter_pushdown.rs
@@ -21,6 +21,7 @@ use datafusion::logical_expr::{lit, or, Expr};
 use datafusion::optimizer::utils::disjunction;
 use datafusion::physical_plan::collect;
 use datafusion::prelude::{col, SessionConfig, SessionContext};
+use parquet::file::properties::WriterProperties;
 use parquet_test_utils::{ParquetScanOptions, TestParquetFile};
 use std::path::PathBuf;
 use std::time::Instant;
@@ -73,7 +74,19 @@ async fn main() -> Result<()> {
 
     let path = opt.path.join("logs.parquet");
 
-    let test_file = gen_data(path, opt.scale_factor, opt.page_size, opt.row_group_size)?;
+    let mut props_builder = WriterProperties::builder();
+
+    if let Some(s) = opt.page_size {
+        props_builder = props_builder
+            .set_data_pagesize_limit(s)
+            .set_write_batch_size(s);
+    }
+
+    if let Some(s) = opt.row_group_size {
+        props_builder = props_builder.set_max_row_group_size(s);
+    }
+
+    let test_file = gen_data(path, opt.scale_factor, props_builder.build())?;
 
     run_benchmarks(&mut ctx, &test_file, opt.iterations, opt.debug).await?;
 
@@ -137,14 +150,9 @@ async fn run_benchmarks(
             println!("Using scan options {:?}", scan_options);
             for i in 0..iterations {
                 let start = Instant::now();
-                let rows = exec_scan(
-                    ctx,
-                    test_file,
-                    filter_expr.clone(),
-                    scan_options.clone(),
-                    debug,
-                )
-                .await?;
+                let rows =
+                    exec_scan(ctx, test_file, filter_expr.clone(), *scan_options, debug)
+                        .await?;
                 println!(
                     "Iteration {} returned {} rows in {} ms",
                     i,
@@ -179,17 +187,11 @@ async fn exec_scan(
 fn gen_data(
     path: PathBuf,
     scale_factor: f32,
-    page_size: Option<usize>,
-    row_group_size: Option<usize>,
+    props: WriterProperties,
 ) -> Result<TestParquetFile> {
     let generator = AccessLogGenerator::new();
 
     let num_batches = 100_f32 * scale_factor;
 
-    TestParquetFile::try_new(
-        path,
-        generator.take(num_batches as usize),
-        page_size,
-        row_group_size,
-    )
+    TestParquetFile::try_new(path, props, generator.take(num_batches as usize))
 }
diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -51,6 +51,7 @@ use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter};
 use datafusion_expr::utils::expr_to_columns;
 use datafusion_expr::{binary_expr, cast, try_cast, ExprSchemable};
 use datafusion_physical_expr::create_physical_expr;
+use log::trace;
 
 /// Interface to pass statistics information to [`PruningPredicate`]
 ///
@@ -415,6 +416,12 @@ fn build_statistics_record_batch<S: PruningStatistics>(
     let mut options = RecordBatchOptions::default();
     options.row_count = Some(statistics.num_containers());
 
+    trace!(
+        "Creating statistics batch for {:#?} with {:#?}",
+        required_columns,
+        arrays
+    );
+
     RecordBatch::try_new_with_options(schema, arrays, &options).map_err(|err| {
         DataFusionError::Plan(format!("Can not create statistics record batch: {}", err))
     })

diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs
@@ -643,7 +643,7 @@ struct RowGroupPruningStatistics<'a> {
 // Convert the bytes array to i128.
 // The endian of the input bytes array must be big-endian.
 // Copy from the arrow-rs
-fn from_bytes_to_i128(b: &[u8]) -> i128 {
+pub(crate) fn from_bytes_to_i128(b: &[u8]) -> i128 {
     assert!(b.len() <= 16, "Decimal128Array supports only up to size 16");
     let first_bit = b[0] & 128u8 == 128u8;
     let mut result = if first_bit { [255u8; 16] } else { [0u8; 16] };
@@ -773,7 +773,9 @@ macro_rules! get_null_count_values {
 
 // Convert parquet column schema to arrow data type, and just consider the
 // decimal data type.
-fn parquet_to_arrow_decimal_type(parquet_column: &ColumnDescriptor) -> Option<DataType> {
+pub(crate) fn parquet_to_arrow_decimal_type(
+    parquet_column: &ColumnDescriptor,
+) -> Option<DataType> {
     let type_ptr = parquet_column.self_type_ptr();
     match type_ptr.get_basic_info().logical_type() {
         Some(LogicalType::Decimal { scale, precision }) => {

diff --git a/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs b/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs
@@ -17,12 +17,17 @@
 
 //! Contains code to filter entire pages
 
-use arrow::array::{BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array};
+use arrow::array::{
+    BooleanArray, Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array,
+    StringArray,
+};
+use arrow::datatypes::DataType;
 use arrow::{array::ArrayRef, datatypes::SchemaRef, error::ArrowError};
 use datafusion_common::{Column, DataFusionError, Result};
 use datafusion_expr::utils::expr_to_columns;
 use datafusion_optimizer::utils::split_conjunction;
-use log::{debug, error};
+use log::{debug, error, trace};
+use parquet::schema::types::ColumnDescriptor;
 use parquet::{
     arrow::arrow_reader::{RowSelection, RowSelector},
     errors::ParquetError,
@@ -36,6 +41,9 @@ use std::collections::{HashSet, VecDeque};
 use std::sync::Arc;
 
 use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
+use crate::physical_plan::file_format::parquet::{
+    from_bytes_to_i128, parquet_to_arrow_decimal_type,
+};
 
 use super::metrics::ParquetFileMetrics;
 
@@ -133,6 +141,7 @@ pub(crate) fn build_page_filter(
                             &predicate,
                             rg_offset_indexes.get(col_id),
                             rg_page_indexes.get(col_id),
+                            groups[*r].column(col_id).column_descr(),
                             file_metrics,
                         )
                         .map_err(|e| {
@@ -143,15 +152,19 @@ pub(crate) fn build_page_filter(
                         }),
                     );
                 } else {
+                    trace!(
+                        "Did not have enough metadata to prune with page indexes, falling back, falling back to all rows",
+                    );
                     // fallback select all rows
                     let all_selected =
                         vec![RowSelector::select(groups[*r].num_rows() as usize)];
                     selectors.push(all_selected);
                 }
             }
             debug!(
-                "Use filter and page index create RowSelection {:?} from predicate:{:?}",
-                &selectors, predicate
+                "Use filter and page index create RowSelection {:?} from predicate: {:?}",
+                &selectors,
+                predicate.predicate_expr(),
             );
             row_selections.push_back(selectors.into_iter().flatten().collect::<Vec<_>>());
         }
@@ -303,15 +316,18 @@ fn prune_pages_in_one_row_group(
     predicate: &PruningPredicate,
     col_offset_indexes: Option<&Vec<PageLocation>>,
     col_page_indexes: Option<&Index>,
+    col_desc: &ColumnDescriptor,
     metrics: &ParquetFileMetrics,
 ) -> Result<Vec<RowSelector>> {
     let num_rows = group.num_rows() as usize;
     if let (Some(col_offset_indexes), Some(col_page_indexes)) =
         (col_offset_indexes, col_page_indexes)
     {
+        let target_type = parquet_to_arrow_decimal_type(col_desc);
         let pruning_stats = PagesPruningStatistics {
             col_page_indexes,
             col_offset_indexes,
+            target_type: &target_type,
         };
 
         match predicate.prune(&pruning_stats) {
@@ -321,7 +337,7 @@ fn prune_pages_in_one_row_group(
                 assert_eq!(row_vec.len(), values.len());
                 let mut sum_row = *row_vec.first().unwrap();
                 let mut selected = *values.first().unwrap();
-
+                trace!("Pruned to to {:?} using {:?}", values, pruning_stats);
                 for (i, &f) in values.iter().skip(1).enumerate() {
                     if f == selected {
                         sum_row += *row_vec.get(i).unwrap();
@@ -376,9 +392,13 @@ fn create_row_count_in_each_page(
 
 /// Wraps one col page_index in one rowGroup statistics in a way
 /// that implements [`PruningStatistics`]
+#[derive(Debug)]
 struct PagesPruningStatistics<'a> {
     col_page_indexes: &'a Index,
     col_offset_indexes: &'a Vec<PageLocation>,
+    // `target_type` is the logical type from the schema, e.g. `DECIMAL`, whereas the physical
+    // type stored in the parquet file may be `INT32`, `INT64` or `FIXED_LEN_BYTE_ARRAY`.
+    target_type: &'a Option<DataType>,
 }
 
 // Extract the min or max value calling `func` from page idex
@@ -387,16 +407,50 @@ macro_rules! get_min_max_values_for_page_index {
         match $self.col_page_indexes {
             Index::NONE => None,
             Index::INT32(index) => {
-                let vec = &index.indexes;
-                Some(Arc::new(Int32Array::from_iter(
-                    vec.iter().map(|x| x.$func().cloned()),
-                )))
+                match $self.target_type {
+                    // int32 to decimal with the precision and scale
+                    Some(DataType::Decimal128(precision, scale)) => {
+                        let vec = &index.indexes;
+                        if let Ok(arr) = Decimal128Array::from_iter_values(
+                            vec.iter().map(|x| *x.$func().unwrap() as i128),
+                        )
+                        .with_precision_and_scale(*precision, *scale)
+                        {
+                            return Some(Arc::new(arr));
+                        } else {
+                            return None;
+                        }
+                    }
+                    _ => {
+                        let vec = &index.indexes;
+                        Some(Arc::new(Int32Array::from_iter(
+                            vec.iter().map(|x| x.$func().cloned()),
+                        )))
+                    }
+                }
             }
             Index::INT64(index) => {
-                let vec = &index.indexes;
-                Some(Arc::new(Int64Array::from_iter(
-                    vec.iter().map(|x| x.$func().cloned()),
-                )))
+                match $self.target_type {
+                    // int64 to decimal with the precision and scale
+                    Some(DataType::Decimal128(precision, scale)) => {
+                        let vec = &index.indexes;
+                        if let Ok(arr) = Decimal128Array::from_iter_values(
+                            vec.iter().map(|x| *x.$func().unwrap() as i128),
+                        )
+                        .with_precision_and_scale(*precision, *scale)
+                        {
+                            return Some(Arc::new(arr));
+                        } else {
+                            return None;
+                        }
+                    }
+                    _ => {
+                        let vec = &index.indexes;
+                        Some(Arc::new(Int64Array::from_iter(
+                            vec.iter().map(|x| x.$func().cloned()),
+                        )))
+                    }
+                }
             }
             Index::FLOAT(index) => {
                 let vec = &index.indexes;
@@ -416,10 +470,37 @@ macro_rules! get_min_max_values_for_page_index {
                     vec.iter().map(|x| x.$func().cloned()),
                 )))
             }
-            Index::INT96(_) | Index::BYTE_ARRAY(_) | Index::FIXED_LEN_BYTE_ARRAY(_) => {
+            Index::BYTE_ARRAY(index) => {
+                let vec = &index.indexes;
+                let array: StringArray = vec
+                    .iter()
+                    .map(|x| x.$func())
+                    .map(|x| x.and_then(|x| std::str::from_utf8(x).ok()))
+                    .collect();
+                Some(Arc::new(array))
+            }
+            Index::INT96(_) => {
                 //Todo support these type
                 None
             }
+            Index::FIXED_LEN_BYTE_ARRAY(index) => {
+                match $self.target_type {
+                    // fixed-length byte array to decimal with the precision and scale
+                    Some(DataType::Decimal128(precision, scale)) => {
+                        let vec = &index.indexes;
+                        if let Ok(array) = Decimal128Array::from_iter_values(
+                            vec.iter().map(|x| from_bytes_to_i128(x.$func().unwrap())),
+                        )
+                        .with_precision_and_scale(*precision, *scale)
+                        {
+                            return Some(Arc::new(array));
+                        } else {
+                            return None;
+                        }
+                    }
+                    _ => None,
+                }
+            }
         }
     }};
 }