-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add parquet page stats for float{16, 32, 64} #10982
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -614,6 +614,94 @@ async fn test_int_8() { | |
.run(); | ||
} | ||
|
||
#[tokio::test] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💯 very nice tests There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this test is mostly a duplicate of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @tshauck this is one of the things that were bugging me a bit, the other one being https://github.com/apache/datafusion/pull/10982/files#diff-7110f4709c105a18ef74a212396444d62052179a735d148fb62470a8b157fb40R749-R763 -- both are very repetitive however, I didn't want to get overeager, only to realize later than the abstractions chosen was not the right one. Perhaps the best way forward would be to address #10952 first (which may also have its own "float16"-like tricky case), and then getting the correct macros for testing, index handling, etc. Speaking of that, would you like to take that one? I'd be happy to review then There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds like a good plan to me -- let's merge this PR and then work on #10952 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed #11000 to track duplication |
||
async fn test_float_16() { | ||
// This creates a parquet files of 1 column named f | ||
let reader = TestReader { | ||
scenario: Scenario::Float16, | ||
row_per_group: 5, | ||
} | ||
.build() | ||
.await; | ||
|
||
Test { | ||
reader: &reader, | ||
// mins are [-5, -4, 0, 5] | ||
expected_min: Arc::new(Float16Array::from(vec![ | ||
f16::from_f32(-5.), | ||
f16::from_f32(-4.), | ||
f16::from_f32(-0.), | ||
f16::from_f32(5.), | ||
])), | ||
// maxes are [-1, 0, 4, 9] | ||
expected_max: Arc::new(Float16Array::from(vec![ | ||
f16::from_f32(-1.), | ||
f16::from_f32(0.), | ||
f16::from_f32(4.), | ||
f16::from_f32(9.), | ||
])), | ||
// nulls are [0, 0, 0, 0] | ||
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), | ||
// row counts are [5, 5, 5, 5] | ||
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), | ||
column_name: "f", | ||
check: Check::Both, | ||
} | ||
.run(); | ||
} | ||
|
||
#[tokio::test] | ||
async fn test_float_32() { | ||
// This creates a parquet files of 1 column named f | ||
let reader = TestReader { | ||
scenario: Scenario::Float32, | ||
row_per_group: 5, | ||
} | ||
.build() | ||
.await; | ||
|
||
Test { | ||
reader: &reader, | ||
// mins are [-5, -4, 0, 5] | ||
expected_min: Arc::new(Float32Array::from(vec![-5., -4., -0., 5.0])), | ||
// maxes are [-1, 0, 4, 9] | ||
expected_max: Arc::new(Float32Array::from(vec![-1., 0., 4., 9.])), | ||
// nulls are [0, 0, 0, 0] | ||
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), | ||
// row counts are [5, 5, 5, 5] | ||
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), | ||
column_name: "f", | ||
check: Check::Both, | ||
} | ||
.run(); | ||
} | ||
|
||
#[tokio::test] | ||
async fn test_float_64() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a similar |
||
// This creates a parquet files of 1 column named f | ||
let reader = TestReader { | ||
scenario: Scenario::Float64, | ||
row_per_group: 5, | ||
} | ||
.build() | ||
.await; | ||
|
||
Test { | ||
reader: &reader, | ||
// mins are [-5, -4, 0, 5] | ||
expected_min: Arc::new(Float64Array::from(vec![-5., -4., -0., 5.0])), | ||
// maxes are [-1, 0, 4, 9] | ||
expected_max: Arc::new(Float64Array::from(vec![-1., 0., 4., 9.])), | ||
// nulls are [0, 0, 0, 0] | ||
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), | ||
// row counts are [5, 5, 5, 5] | ||
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), | ||
column_name: "f", | ||
check: Check::Both, | ||
} | ||
.run(); | ||
} | ||
|
||
// timestamp | ||
#[tokio::test] | ||
async fn test_timestamp() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cargo doc --document-private-items -p datafusion --open
I see what is going on -- I think this makes sense for now
Implies that the iterator is an iterator over Vec<FixedLengthItem
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
exactly, basically caused by there being no f16 index in https://arrow.apache.org/rust/src/parquet/file/page_index/index.rs.html#74 ... I was thinking of mapping to f16 first earlier on, to make it cleaner... but this turned out to be the simplest I came up with.
#10972 (comment) is I think the best way to make this better eventually