-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Minor: Introduce utils::hash for StructArray #8552
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -27,7 +27,8 @@ use arrow::{downcast_dictionary_array, downcast_primitive_array}; | |||||
use arrow_buffer::i256; | ||||||
|
||||||
use crate::cast::{ | ||||||
as_boolean_array, as_generic_binary_array, as_primitive_array, as_string_array, | ||||||
as_boolean_array, as_generic_binary_array, as_large_list_array, as_list_array, | ||||||
as_primitive_array, as_string_array, as_struct_array, | ||||||
}; | ||||||
use crate::error::{DataFusionError, Result, _internal_err}; | ||||||
|
||||||
|
@@ -207,6 +208,35 @@ fn hash_dictionary<K: ArrowDictionaryKeyType>( | |||||
Ok(()) | ||||||
} | ||||||
|
||||||
fn hash_struct_array( | ||||||
array: &StructArray, | ||||||
random_state: &RandomState, | ||||||
hashes_buffer: &mut [u64], | ||||||
) -> Result<()> { | ||||||
let nulls = array.nulls(); | ||||||
let num_columns = array.num_columns(); | ||||||
|
||||||
// Skip null columns | ||||||
let valid_indices: Vec<usize> = if let Some(nulls) = nulls { | ||||||
nulls.valid_indices().collect() | ||||||
} else { | ||||||
(0..num_columns).collect() | ||||||
}; | ||||||
|
||||||
// Create hashes for each row that combines the hashes over all the column at that row. | ||||||
// array.len() is the number of rows. | ||||||
let mut values_hashes = vec![0u64; array.len()]; | ||||||
create_hashes(array.columns(), random_state, &mut values_hashes)?; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should be able to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we do that, we should be able to remove the code related to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I dont think so. values_hashes: [9258723240401068087, 9258723240401068087, 8502738074356456021, 8502738074356456021, 4222447303697976283, 9753707356376286577] We can see that |
||||||
|
||||||
// Skip the null columns, nulls should get hash value 0. | ||||||
for i in valid_indices { | ||||||
let hash = &mut hashes_buffer[i]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems not correct, as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Dandandan. Not fully got your point. I do consider each column as one value (iterate each row in one column with get one final hash
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I mistake how the null buffer works in StructArray. It seems to be row-wise, but I consider it as column-wise... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks pretty much better now :) |
||||||
*hash = combine_hashes(*hash, values_hashes[i]); | ||||||
} | ||||||
|
||||||
Ok(()) | ||||||
} | ||||||
|
||||||
fn hash_list_array<OffsetSize>( | ||||||
array: &GenericListArray<OffsetSize>, | ||||||
random_state: &RandomState, | ||||||
|
@@ -327,12 +357,16 @@ pub fn create_hashes<'a>( | |||||
array => hash_dictionary(array, random_state, hashes_buffer, rehash)?, | ||||||
_ => unreachable!() | ||||||
} | ||||||
DataType::Struct(_) => { | ||||||
let array = as_struct_array(array)?; | ||||||
hash_struct_array(array, random_state, hashes_buffer)?; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should work?
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we consider nulls, this doesn't work |
||||||
} | ||||||
DataType::List(_) => { | ||||||
let array = as_list_array(array); | ||||||
let array = as_list_array(array)?; | ||||||
waynexia marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
hash_list_array(array, random_state, hashes_buffer)?; | ||||||
} | ||||||
DataType::LargeList(_) => { | ||||||
let array = as_large_list_array(array); | ||||||
let array = as_large_list_array(array)?; | ||||||
hash_list_array(array, random_state, hashes_buffer)?; | ||||||
} | ||||||
_ => { | ||||||
|
@@ -515,6 +549,58 @@ mod tests { | |||||
assert_eq!(hashes[2], hashes[3]); | ||||||
} | ||||||
|
||||||
#[test] | ||||||
// Tests actual values of hashes, which are different if forcing collisions | ||||||
#[cfg(not(feature = "force_hash_collisions"))] | ||||||
fn create_hashes_for_struct_arrays() { | ||||||
use arrow_buffer::Buffer; | ||||||
|
||||||
let boolarr = Arc::new(BooleanArray::from(vec![ | ||||||
false, false, true, true, true, true, | ||||||
])); | ||||||
let i32arr = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31])); | ||||||
|
||||||
let struct_array = StructArray::from(( | ||||||
vec![ | ||||||
( | ||||||
Arc::new(Field::new("bool", DataType::Boolean, false)), | ||||||
boolarr.clone() as ArrayRef, | ||||||
), | ||||||
( | ||||||
Arc::new(Field::new("i32", DataType::Int32, false)), | ||||||
i32arr.clone() as ArrayRef, | ||||||
), | ||||||
( | ||||||
Arc::new(Field::new("i32", DataType::Int32, false)), | ||||||
i32arr.clone() as ArrayRef, | ||||||
), | ||||||
( | ||||||
Arc::new(Field::new("bool", DataType::Boolean, false)), | ||||||
boolarr.clone() as ArrayRef, | ||||||
), | ||||||
], | ||||||
Buffer::from(&[0b001011]), | ||||||
)); | ||||||
|
||||||
assert!(struct_array.is_valid(0)); | ||||||
assert!(struct_array.is_valid(1)); | ||||||
assert!(struct_array.is_null(2)); | ||||||
assert!(struct_array.is_valid(3)); | ||||||
assert!(struct_array.is_null(4)); | ||||||
assert!(struct_array.is_null(5)); | ||||||
|
||||||
let array = Arc::new(struct_array) as ArrayRef; | ||||||
|
||||||
let random_state = RandomState::with_seeds(0, 0, 0, 0); | ||||||
let mut hashes = vec![0; array.len()]; | ||||||
create_hashes(&[array], &random_state, &mut hashes).unwrap(); | ||||||
assert_eq!(hashes[0], hashes[1]); | ||||||
// same value but the third row ( hashes[2] ) is null | ||||||
assert_ne!(hashes[2], hashes[3]); | ||||||
// different values but both are null | ||||||
assert_eq!(hashes[4], hashes[5]); | ||||||
} | ||||||
|
||||||
#[test] | ||||||
// Tests actual values of hashes, which are different if forcing collisions | ||||||
#[cfg(not(feature = "force_hash_collisions"))] | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't yet seem correct,
valid_indices
contains the rows that are valid, not the columns.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I hash each row now, so we iterate the valid row and hash each column at that row to one hash value.