Skip to content

Commit

Permalink
Use take for dictionary like comparisons (#3313)
Browse files Browse the repository at this point in the history
* Use take for like comparisons

* Fix benchmark name

* Format
  • Loading branch information
tustvold authored Dec 9, 2022
1 parent a92804e commit f078aed
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 107 deletions.
1 change: 1 addition & 0 deletions arrow-string/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" }
arrow-data = { version = "29.0.0", path = "../arrow-data" }
arrow-schema = { version = "29.0.0", path = "../arrow-schema" }
arrow-array = { version = "29.0.0", path = "../arrow-array" }
arrow-select = { version = "29.0.0", path = "../arrow-select" }
regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] }

Expand Down
121 changes: 17 additions & 104 deletions arrow-string/src/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use arrow_array::*;
use arrow_data::bit_mask::combine_option_bitmap;
use arrow_data::ArrayData;
use arrow_schema::*;
use arrow_select::take::take;
use regex::Regex;
use std::collections::HashMap;

Expand Down Expand Up @@ -214,7 +215,10 @@ pub fn like_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
like_dict_scalar(left, right)
let dict_comparison = like_utf8_scalar_dyn(left.values().as_ref(), right)?;
// TODO: Use take_boolean (#2967)
let array = take(&dict_comparison, left.keys(), None)?;
Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
Expand All @@ -240,31 +244,6 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
like_scalar(left, right)
}

/// Evaluates SQL `left LIKE right` for a [`DictionaryArray`] whose values are a
/// [`StringArray`] or [`LargeStringArray`], comparing against the scalar
/// pattern `right`.
///
/// See the documentation on [`like_utf8`] for more details.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] when the dictionary value type is
/// neither `Utf8` nor `LargeUtf8`.
fn like_dict_scalar<K: ArrowPrimitiveType>(
    left: &DictionaryArray<K>,
    right: &str,
) -> Result<BooleanArray, ArrowError> {
    let values_type = left.value_type();

    // The value type is inspected first, so the matching downcast below is
    // expected to succeed.
    if matches!(values_type, DataType::Utf8) {
        let typed = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
        return like_scalar(typed, right);
    }

    if matches!(values_type, DataType::LargeUtf8) {
        let typed = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
        return like_scalar(typed, right);
    }

    Err(ArrowError::ComputeError(
        "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
    ))
}

/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does:
///
/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.`
Expand Down Expand Up @@ -431,7 +410,10 @@ pub fn nlike_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
nlike_dict_scalar(left, right)
let dict_comparison = nlike_utf8_scalar_dyn(left.values().as_ref(), right)?;
// TODO: Use take_boolean (#2967)
let array = take(&dict_comparison, left.keys(), None)?;
Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
Expand All @@ -457,31 +439,6 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
nlike_scalar(left, right)
}

/// Evaluates SQL `left NOT LIKE right` for a [`DictionaryArray`] whose values
/// are a [`StringArray`] or [`LargeStringArray`], comparing against the scalar
/// pattern `right`.
///
/// See the documentation on [`like_utf8`] for more details.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] when the dictionary value type is
/// neither `Utf8` nor `LargeUtf8`.
fn nlike_dict_scalar<K: ArrowPrimitiveType>(
    left: &DictionaryArray<K>,
    right: &str,
) -> Result<BooleanArray, ArrowError> {
    let values_type = left.value_type();

    // The value type is inspected first, so the matching downcast below is
    // expected to succeed.
    if matches!(values_type, DataType::Utf8) {
        let typed = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
        return nlike_scalar(typed, right);
    }

    if matches!(values_type, DataType::LargeUtf8) {
        let typed = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
        return nlike_scalar(typed, right);
    }

    Err(ArrowError::ComputeError(
        "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
    ))
}

/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
Expand Down Expand Up @@ -663,7 +620,10 @@ pub fn ilike_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
ilike_dict_scalar(left, right)
let dict_comparison = ilike_utf8_scalar_dyn(left.values().as_ref(), right)?;
// TODO: Use take_boolean (#2967)
let array = take(&dict_comparison, left.keys(), None)?;
Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
Expand All @@ -689,31 +649,6 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
ilike_scalar(left, right)
}

/// Evaluates SQL `left ILIKE right` for a [`DictionaryArray`] whose values are
/// a [`StringArray`] or [`LargeStringArray`], comparing against the scalar
/// pattern `right`.
///
/// See the documentation on [`like_utf8`] for more details.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] when the dictionary value type is
/// neither `Utf8` nor `LargeUtf8`.
fn ilike_dict_scalar<K: ArrowPrimitiveType>(
    left: &DictionaryArray<K>,
    right: &str,
) -> Result<BooleanArray, ArrowError> {
    let values_type = left.value_type();

    // The value type is inspected first, so the matching downcast below is
    // expected to succeed.
    if matches!(values_type, DataType::Utf8) {
        let typed = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
        return ilike_scalar(typed, right);
    }

    if matches!(values_type, DataType::LargeUtf8) {
        let typed = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
        return ilike_scalar(typed, right);
    }

    Err(ArrowError::ComputeError(
        "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
    ))
}

/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
Expand Down Expand Up @@ -843,7 +778,10 @@ pub fn nilike_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
nilike_dict_scalar(left, right)
let dict_comparison = nilike_utf8_scalar_dyn(left.values().as_ref(), right)?;
// TODO: Use take_boolean (#2967)
let array = take(&dict_comparison, left.keys(), None)?;
Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
Expand All @@ -869,31 +807,6 @@ pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
nilike_scalar(left, right)
}

/// Evaluates SQL `left NOT ILIKE right` for a [`DictionaryArray`] whose values
/// are a [`StringArray`] or [`LargeStringArray`], comparing against the scalar
/// pattern `right`.
///
/// See the documentation on [`like_utf8`] for more details.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] when the dictionary value type is
/// neither `Utf8` nor `LargeUtf8`.
fn nilike_dict_scalar<K: ArrowPrimitiveType>(
    left: &DictionaryArray<K>,
    right: &str,
) -> Result<BooleanArray, ArrowError> {
    let values_type = left.value_type();

    // The value type is inspected first, so the matching downcast below is
    // expected to succeed.
    if matches!(values_type, DataType::Utf8) {
        let typed = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
        return nilike_scalar(typed, right);
    }

    if matches!(values_type, DataType::LargeUtf8) {
        let typed = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
        return nilike_scalar(typed, right);
    }

    Err(ArrowError::ComputeError(
        "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
    ))
}

/// Returns `true` when `c` is one of the two LIKE wildcard characters:
/// `%` or `_`.
fn is_like_pattern(c: char) -> bool {
    matches!(c, '%' | '_')
}
Expand Down
24 changes: 21 additions & 3 deletions arrow/benches/comparison_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -314,12 +314,30 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
});

let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0, 4);
let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0, 4);
let strings = create_string_array::<i32>(20, 0.);
let dict_arr_a = create_dict_from_values::<Int32Type>(size, 0., &strings);
let dict_arr_b = create_dict_from_values::<Int32Type>(size, 0., &strings);

c.bench_function("dict eq string", |b| {
c.bench_function("eq dictionary[10] string[4])", |b| {
b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b))
});

c.bench_function("eq_dyn_utf8_scalar dictionary[10] string[4])", |b| {
b.iter(|| eq_dyn_utf8_scalar(&dict_arr_a, "test"))
});

c.bench_function(
"gt_eq_dyn_utf8_scalar scalar dictionary[10] string[4])",
|b| b.iter(|| gt_eq_dyn_utf8_scalar(&dict_arr_a, "test")),
);

c.bench_function("like_utf8_scalar_dyn dictionary[10] string[4])", |b| {
b.iter(|| like_utf8_scalar_dyn(&dict_arr_a, "test"))
});

c.bench_function("ilike_utf8_scalar_dyn dictionary[10] string[4])", |b| {
b.iter(|| ilike_utf8_scalar_dyn(&dict_arr_a, "test"))
});
}

criterion_group!(benches, add_benchmark);
Expand Down

0 comments on commit f078aed

Please sign in to comment.