diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index fa32ab6dc43e..7dd4472f58c9 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -42,6 +42,7 @@ arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } arrow-data = { version = "29.0.0", path = "../arrow-data" } arrow-schema = { version = "29.0.0", path = "../arrow-schema" } arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-select = { version = "29.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index c8a4d37cd7cc..2e0356e73dbe 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -21,6 +21,7 @@ use arrow_array::*; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; use arrow_schema::*; +use arrow_select::take::take; use regex::Regex; use std::collections::HashMap; @@ -214,7 +215,10 @@ pub fn like_utf8_scalar_dyn( DataType::Dictionary(_, _) => { downcast_dictionary_array!( left => { - like_dict_scalar(left, right) + let dict_comparison = like_utf8_scalar_dyn(left.values().as_ref(), right)?; + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, left.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t @@ -240,31 +244,6 @@ pub fn like_utf8_scalar( like_scalar(left, right) } -/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn like_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - like_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - like_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: /// /// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` @@ -431,7 +410,10 @@ pub fn nlike_utf8_scalar_dyn( DataType::Dictionary(_, _) => { downcast_dictionary_array!( left => { - nlike_dict_scalar(left, right) + let dict_comparison = nlike_utf8_scalar_dyn(left.values().as_ref(), right)?; + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, left.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t @@ -457,31 +439,6 @@ pub fn nlike_utf8_scalar( nlike_scalar(left, right) } -/// Perform SQL `left NOT LIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn nlike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - nlike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - nlike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Perform SQL `left ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`]. /// @@ -663,7 +620,10 @@ pub fn ilike_utf8_scalar_dyn( DataType::Dictionary(_, _) => { downcast_dictionary_array!( left => { - ilike_dict_scalar(left, right) + let dict_comparison = ilike_utf8_scalar_dyn(left.values().as_ref(), right)?; + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, left.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t @@ -689,31 +649,6 @@ pub fn ilike_utf8_scalar( ilike_scalar(left, right) } -/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn ilike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - ilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - ilike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`]. /// @@ -843,7 +778,10 @@ pub fn nilike_utf8_scalar_dyn( DataType::Dictionary(_, _) => { downcast_dictionary_array!( left => { - nilike_dict_scalar(left, right) + let dict_comparison = nilike_utf8_scalar_dyn(left.values().as_ref(), right)?; + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, left.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t @@ -869,31 +807,6 @@ pub fn nilike_utf8_scalar( nilike_scalar(left, right) } -/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn nilike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - nilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - nilike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - fn is_like_pattern(c: char) -> bool { c == '%' || c == '_' } diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 99229ed0b37b..7b3b935bcf3a 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -314,12 +314,30 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$")) }); - let dict_arr_a = create_string_dict_array::(size, 0.0, 4); - let dict_arr_b = create_string_dict_array::(size, 0.0, 4); + let strings = create_string_array::(20, 0.); + let dict_arr_a = create_dict_from_values::(size, 0., &strings); + let dict_arr_b = create_dict_from_values::(size, 0., &strings); - c.bench_function("dict eq string", |b| { + c.bench_function("eq dictionary[10] string[4])", |b| { b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b)) }); + + c.bench_function("eq_dyn_utf8_scalar dictionary[10] string[4])", |b| { + b.iter(|| eq_dyn_utf8_scalar(&dict_arr_a, "test")) + }); + + c.bench_function( + "gt_eq_dyn_utf8_scalar scalar dictionary[10] string[4])", + |b| b.iter(|| gt_eq_dyn_utf8_scalar(&dict_arr_a, "test")), + ); + + c.bench_function("like_utf8_scalar_dyn dictionary[10] string[4])", |b| { + b.iter(|| like_utf8_scalar_dyn(&dict_arr_a, "test")) + }); + + c.bench_function("ilike_utf8_scalar_dyn dictionary[10] string[4])", |b| { + b.iter(|| ilike_utf8_scalar_dyn(&dict_arr_a, "test")) + }); } criterion_group!(benches, add_benchmark);