-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Minor: Fix incorrect indices for hashing struct #8775
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -214,22 +214,19 @@ fn hash_struct_array( | |
hashes_buffer: &mut [u64], | ||
) -> Result<()> { | ||
let nulls = array.nulls(); | ||
let num_columns = array.num_columns(); | ||
let row_len = array.len(); | ||
|
||
// Skip null columns | ||
let valid_indices: Vec<usize> = if let Some(nulls) = nulls { | ||
let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls { | ||
nulls.valid_indices().collect() | ||
} else { | ||
(0..num_columns).collect() | ||
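There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Main fix: the fallback range of valid indices must cover the rows (`row_len`), not the columns (`num_columns`). |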
||
(0..row_len).collect() | ||
}; | ||
|
||
// Create hashes for each row that combines the hashes over all the column at that row. | ||
// array.len() is the number of rows. | ||
let mut values_hashes = vec![0u64; array.len()]; | ||
let mut values_hashes = vec![0u64; row_len]; | ||
create_hashes(array.columns(), random_state, &mut values_hashes)?; | ||
|
||
// Skip the null columns, nulls should get hash value 0. | ||
for i in valid_indices { | ||
for i in valid_row_indices { | ||
let hash = &mut hashes_buffer[i]; | ||
*hash = combine_hashes(*hash, values_hashes[i]); | ||
} | ||
|
@@ -601,6 +598,39 @@ mod tests { | |
assert_eq!(hashes[4], hashes[5]); | ||
} | ||
|
||
#[test] | ||
// Tests actual values of hashes, which are different if forcing collisions | ||
// Covers a struct array that has more columns (4) than rows (2). | ||
#[cfg(not(feature = "force_hash_collisions"))] | ||
fn create_hashes_for_struct_arrays_more_column_than_row() { | ||
// Four child columns, each holding two rows with identical values. | ||
let struct_array = StructArray::from(vec![ | ||
( | ||
Arc::new(Field::new("bool", DataType::Boolean, false)), | ||
Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef, | ||
), | ||
( | ||
Arc::new(Field::new("i32-1", DataType::Int32, false)), | ||
Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, | ||
), | ||
( | ||
Arc::new(Field::new("i32-2", DataType::Int32, false)), | ||
Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, | ||
), | ||
( | ||
Arc::new(Field::new("i32-3", DataType::Int32, false)), | ||
Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, | ||
), | ||
]); | ||
|
||
// Sanity check: both rows of the struct array must be valid (non-null). | ||
assert!(struct_array.is_valid(0)); | ||
assert!(struct_array.is_valid(1)); | ||
|
||
let array = Arc::new(struct_array) as ArrayRef; | ||
let random_state = RandomState::with_seeds(0, 0, 0, 0); | ||
// One hash slot per row of the struct array. | ||
let mut hashes = vec![0; array.len()]; | ||
create_hashes(&[array], &random_state, &mut hashes).unwrap(); | ||
// Both rows hold the same values in every column, so their hashes must match. | ||
assert_eq!(hashes[0], hashes[1]); | ||
} | ||
|
||
#[test] | ||
// Tests actual values of hashes, which are different if forcing collisions | ||
#[cfg(not(feature = "force_hash_collisions"))] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -148,7 +148,7 @@ select count(*) from m1 where tag_id = '1000' and time < '2024-01-03T14:46:35+01 | |
---- | ||
10 | ||
|
||
query RRR | ||
query RRR rowsort | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `rowsort` is added because I got the result rows back in a different order, and GROUP BY output is not order sensitive.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah -- I also added the same fix in #8769, and I agree this looks good. |
||
select min(f5), max(f5), avg(f5) from m2 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00' group by type; | ||
---- | ||
100 600 350 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I forgot to correct the comment last time. The indices here all refer to rows, not columns.