-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix parquet pruning when column names have periods #5710
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,6 +59,7 @@ enum Scenario { | |
Float64, | ||
Decimal, | ||
DecimalLargePrecision, | ||
PeriodsInColumnNames, | ||
} | ||
|
||
enum Unit { | ||
|
@@ -454,6 +455,25 @@ fn make_date_batch(offset: Duration) -> RecordBatch { | |
.unwrap() | ||
} | ||
|
||
/// returns a batch with two columns (note "service.name" is the name | ||
/// of the column. It is *not* a table named service.name | ||
/// | ||
/// name | service.name | ||
fn make_names_batch(name: &str, service_name_values: Vec<&str>) -> RecordBatch { | ||
let num_rows = service_name_values.len(); | ||
let name: StringArray = std::iter::repeat(Some(name)).take(num_rows).collect(); | ||
let service_name: StringArray = service_name_values.iter().map(Some).collect(); | ||
|
||
let schema = Schema::new(vec![ | ||
Field::new("name", name.data_type().clone(), true), | ||
// note the column name has a period in it! | ||
Field::new("service.name", service_name.data_type().clone(), true), | ||
Comment on lines
+469
to
+470
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
]); | ||
let schema = Arc::new(schema); | ||
|
||
RecordBatch::try_new(schema, vec![Arc::new(name), Arc::new(service_name)]).unwrap() | ||
} | ||
|
||
fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> { | ||
match scenario { | ||
Scenario::Timestamps => { | ||
|
@@ -505,10 +525,29 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> { | |
make_decimal_batch(vec![2000, 3000, 3000, 4000, 6000], 38, 2), | ||
] | ||
} | ||
Scenario::PeriodsInColumnNames => { | ||
vec![ | ||
// all frontend | ||
make_names_batch( | ||
"HTTP GET / DISPATCH", | ||
vec!["frontend", "frontend", "frontend", "frontend", "frontend"], | ||
), | ||
// both frontend and backend | ||
make_names_batch( | ||
"HTTP PUT / DISPATCH", | ||
vec!["frontend", "frontend", "backend", "backend", "backend"], | ||
), | ||
// all backend | ||
make_names_batch( | ||
"HTTP GET / DISPATCH", | ||
vec!["backend", "backend", "backend", "backend", "backend"], | ||
), | ||
] | ||
} | ||
} | ||
} | ||
|
||
/// Create a test parquet file with varioud data types | ||
/// Create a test parquet file with various data types | ||
async fn make_test_file_rg(scenario: Scenario) -> NamedTempFile { | ||
let mut output_file = tempfile::Builder::new() | ||
.prefix("parquet_pruning") | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -483,3 +483,36 @@ async fn prune_decimal_in_list() { | |||||
) | ||||||
.await; | ||||||
} | ||||||
|
||||||
#[tokio::test] | ||||||
async fn prune_periods_in_column_names() { | ||||||
// There are three row groups for "service.name", each with 5 rows = 15 rows total | ||||||
// name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'], | ||||||
// name = "HTTP PUT / DISPATCH", service.name = ['backend', 'frontend'], | ||||||
// name = "HTTP GET / DISPATCH", service.name = ['backend', 'backend' ], | ||||||
test_prune( | ||||||
Scenario::PeriodsInColumnNames, | ||||||
// use double quotes to use column named "service.name" | ||||||
"SELECT \"name\", \"service.name\" FROM t WHERE \"service.name\" = 'frontend'", | ||||||
Some(0), | ||||||
Some(1), // prune out last row group | ||||||
7, | ||||||
) | ||||||
.await; | ||||||
test_prune( | ||||||
Scenario::PeriodsInColumnNames, | ||||||
"SELECT \"name\", \"service.name\" FROM t WHERE \"name\" != 'HTTP GET / DISPATCH'", | ||||||
Some(0), | ||||||
Some(2), // prune out first and last row group | ||||||
5, | ||||||
) | ||||||
.await; | ||||||
test_prune( | ||||||
Scenario::PeriodsInColumnNames, | ||||||
"SELECT \"name\", \"service.name\" FROM t WHERE \"service.name\" = 'frontend' AND \"name\" != 'HTTP GET / DISPATCH'", | ||||||
Some(0), | ||||||
Some(2), // prune out middle and last row group | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. super nit:
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sorry -- missed that -- will fix There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Included in #5726 |
||||||
2, | ||||||
) | ||||||
.await; | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This line is the bugfix
The rest of the PR is a test