Skip to content

Commit

Permalink
Fix parquet pruning when column names have periods (apache#5710)
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb authored Mar 24, 2023
1 parent c825c84 commit 74c3955
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 2 deletions.
2 changes: 1 addition & 1 deletion datafusion/core/src/physical_optimizer/pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ fn build_statistics_record_batch<S: PruningStatistics>(
let mut arrays = Vec::<ArrayRef>::new();
// For each needed statistics column:
for (column, statistics_type, stat_field) in required_columns.iter() {
let column = Column::from_qualified_name(column.name());
let column = Column::from_name(column.name());
let data_type = stat_field.data_type();

let num_containers = statistics.num_containers();
Expand Down
41 changes: 40 additions & 1 deletion datafusion/core/tests/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ enum Scenario {
Float64,
Decimal,
DecimalLargePrecision,
PeriodsInColumnNames,
}

enum Unit {
Expand Down Expand Up @@ -454,6 +455,25 @@ fn make_date_batch(offset: Duration) -> RecordBatch {
.unwrap()
}

/// Builds a [`RecordBatch`] with two nullable string columns:
///
/// ```text
/// name | service.name
/// ```
///
/// Note that `service.name` is the literal name of a single column (it
/// contains a period); it is *not* a column `name` in a table `service`.
///
/// The batch has one row per entry of `service_name_values`; the `name`
/// column repeats the `name` argument on every row.
///
/// Panics if the batch cannot be constructed.
fn make_names_batch(name: &str, service_name_values: Vec<&str>) -> RecordBatch {
    let row_count = service_name_values.len();

    // `service.name` column: the caller-supplied values, all non-null.
    let service_col: StringArray = service_name_values.into_iter().map(Some).collect();
    // `name` column: the same non-null value on every row.
    let name_col: StringArray = (0..row_count).map(|_| Some(name)).collect();

    let fields = vec![
        Field::new("name", name_col.data_type().clone(), true),
        // This column name deliberately contains a period.
        Field::new("service.name", service_col.data_type().clone(), true),
    ];

    RecordBatch::try_new(
        Arc::new(Schema::new(fields)),
        vec![Arc::new(name_col), Arc::new(service_col)],
    )
    .unwrap()
}

fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
match scenario {
Scenario::Timestamps => {
Expand Down Expand Up @@ -505,10 +525,29 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
make_decimal_batch(vec![2000, 3000, 3000, 4000, 6000], 38, 2),
]
}
Scenario::PeriodsInColumnNames => {
vec![
// all frontend
make_names_batch(
"HTTP GET / DISPATCH",
vec!["frontend", "frontend", "frontend", "frontend", "frontend"],
),
// both frontend and backend
make_names_batch(
"HTTP PUT / DISPATCH",
vec!["frontend", "frontend", "backend", "backend", "backend"],
),
// all backend
make_names_batch(
"HTTP GET / DISPATCH",
vec!["backend", "backend", "backend", "backend", "backend"],
),
]
}
}
}

/// Create a test parquet file with varioud data types
/// Create a test parquet file with various data types
async fn make_test_file_rg(scenario: Scenario) -> NamedTempFile {
let mut output_file = tempfile::Builder::new()
.prefix("parquet_pruning")
Expand Down
33 changes: 33 additions & 0 deletions datafusion/core/tests/parquet/row_group_pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -483,3 +483,36 @@ async fn prune_decimal_in_list() {
)
.await;
}

#[tokio::test]
async fn prune_periods_in_column_names() {
    // The `PeriodsInColumnNames` scenario has three row groups of 5 rows each
    // (15 rows total):
    //   1. name = "HTTP GET / DISPATCH", "service.name" is all 'frontend'
    //   2. name = "HTTP PUT / DISPATCH", "service.name" is 2 x 'frontend', 3 x 'backend'
    //   3. name = "HTTP GET / DISPATCH", "service.name" is all 'backend'
    //
    // Each case is (sql, errors, row groups pruned, rows returned), passed to
    // `test_prune` in that order after the scenario.
    // NOTE(review): the second element is presumably the expected error count;
    // confirm against `test_prune`.
    //
    // Double quotes are needed in the SQL so that "service.name" is parsed as
    // one column name rather than as a qualified `service`.`name` reference.
    let cases = [
        (
            // Only row group 3 has no 'frontend' value, so it is pruned;
            // 5 rows from group 1 plus 2 rows from group 2 remain.
            "SELECT \"name\", \"service.name\" FROM t WHERE \"service.name\" = 'frontend'",
            Some(0),
            Some(1),
            7,
        ),
        (
            // Row groups 1 and 3 contain only the excluded name, so both are
            // pruned; all 5 rows of group 2 remain.
            "SELECT \"name\", \"service.name\" FROM t WHERE \"name\" != 'HTTP GET / DISPATCH'",
            Some(0),
            Some(2),
            5,
        ),
        (
            // Row group 1 is ruled out by the `name` predicate and row group 3
            // by both predicates; the 2 'frontend' rows of group 2 remain.
            "SELECT \"name\", \"service.name\" FROM t WHERE \"service.name\" = 'frontend' AND \"name\" != 'HTTP GET / DISPATCH'",
            Some(0),
            Some(2),
            2,
        ),
    ];

    for (sql, errors, pruned, rows) in cases {
        test_prune(Scenario::PeriodsInColumnNames, sql, errors, pruned, rows).await;
    }
}

0 comments on commit 74c3955

Please sign in to comment.