From 736eb1109dc531c85c15088199bada85d12df0e4 Mon Sep 17 00:00:00 2001 From: kamille Date: Mon, 9 Sep 2024 21:49:57 +0800 Subject: [PATCH 01/20] draft. --- datafusion/functions/src/string/common.rs | 122 +++++++++++++++++---- datafusion/functions/src/string/rtrim.rs | 40 +++++++ datafusion/functions/src/unicode/substr.rs | 2 +- 3 files changed, 140 insertions(+), 24 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 9365a6d83331..e2b69b58ff01 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -27,11 +27,14 @@ use arrow::array::{ }; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; +use arrow_buffer::{NullBufferBuilder, ScalarBuffer}; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::Result; use datafusion_common::{exec_err, ScalarValue}; use datafusion_expr::ColumnarValue; +use crate::unicode::substr::make_and_append_view; + pub(crate) enum TrimType { Left, Right, @@ -83,16 +86,33 @@ fn string_view_trim<'a, T: OffsetSizeTrait>( func: fn(&'a str, &'a str) -> &'a str, args: &'a [ArrayRef], ) -> Result { - let string_array = as_string_view_array(&args[0])?; + let string_view_array = as_string_view_array(&args[0])?; + let mut views_buf = Vec::with_capacity(string_view_array.len()); + let mut null_builder = NullBufferBuilder::new(string_view_array.len()); match args.len() { 1 => { - let result = string_array - .iter() - .map(|string| string.map(|string: &str| func(string, " "))) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) + for (idx, raw) in string_view_array.views().iter().enumerate() { + unsafe { + // Safety: + // idx is always smaller or equal to string_view_array.views.len() + let origin_str = string_view_array.value_unchecked(idx); + let trim_str = func(origin_str, " "); + + // Safety: + // `trim_str` is computed from `str::trim_xxx_matches`, + // 
and its addr is ensured to be >= `origin_str`'s + let start = trim_str.as_ptr().offset_from(origin_str.as_ptr()) as u32; + + make_and_append_view( + &mut views_buf, + &mut null_builder, + raw, + trim_str, + start, + ); + } + } } 2 => { let characters_array = as_string_view_array(&args[1])?; @@ -102,35 +122,91 @@ fn string_view_trim<'a, T: OffsetSizeTrait>( return Ok(new_null_array( // The schema is expecting utf8 as null &DataType::Utf8, - string_array.len(), + string_view_array.len(), )); } let characters = characters_array.value(0); - let result = string_array - .iter() - .map(|item| item.map(|string| func(string, characters))) - .collect::>(); - return Ok(Arc::new(result) as ArrayRef); + + for (idx, raw) in string_view_array.views().iter().enumerate() { + unsafe { + // Safety: + // idx is always smaller or equal to string_view_array.views.len() + let origin_str = string_view_array.value_unchecked(idx); + let trim_str = func(origin_str, characters); + + // Safety: + // `trim_str` is computed from `str::trim_xxx_matches`, + // and its addr is ensured to be >= `origin_str`'s + let start = + trim_str.as_ptr().offset_from(origin_str.as_ptr()) as u32; + + make_and_append_view( + &mut views_buf, + &mut null_builder, + raw, + trim_str, + start, + ); + } + } } - let result = string_array + for (idx, (raw, characters_opt)) in string_view_array + .views() .iter() .zip(characters_array.iter()) - .map(|(string, characters)| match (string, characters) { - (Some(string), Some(characters)) => Some(func(string, characters)), - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) + .enumerate() + { + if let Some(characters) = characters_opt { + unsafe { + // Safety: + // idx is always smaller or equal to string_view_array.views.len() + let origin_str = string_view_array.value_unchecked(idx); + let trim_str = func(origin_str, characters); + + // Safety: + // `trim_str` is computed from `str::trim_xxx_matches`, + // and its addr is ensured to be >= `origin_str`'s + 
let start = + trim_str.as_ptr().offset_from(origin_str.as_ptr()) as u32; + + make_and_append_view( + &mut views_buf, + &mut null_builder, + raw, + trim_str, + start, + ); + } + } else { + null_builder.append_null(); + views_buf.push(0); + } + } } other => { - exec_err!( + return exec_err!( "Function TRIM was called with {other} arguments. It requires at least 1 and at most 2." - ) + ); } } + + let views_buf = ScalarBuffer::from(views_buf); + let nulls_buf = null_builder.finish(); + + // Safety: + // (1) The blocks of the given views are all provided + // (2) Each of the range `view.offset+start..end` of view in views_buf is within + // the bounds of each of the blocks + unsafe { + let array = StringViewArray::new_unchecked( + views_buf, + string_view_array.data_buffers().to_vec(), + nulls_buf, + ); + Ok(Arc::new(array) as ArrayRef) + } } fn string_trim<'a, T: OffsetSizeTrait>( diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index ec53f3ed7430..52d0826137fa 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -101,3 +101,43 @@ impl ScalarUDFImpl for RtrimFunc { } } } + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray, StringViewArray}; + use arrow::datatypes::DataType::{Utf8, Utf8View}; + + use datafusion_common::{exec_err, Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::substr::SubstrFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() { + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(None)), + ColumnarValue::Scalar(ScalarValue::from(1i64)), + ], + Ok(None), + &str, + Utf8View, + StringViewArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabet" + )))), + ColumnarValue::Scalar(ScalarValue::from(0i64)), + ], + Ok(Some("alphabet")), + &str, + Utf8View, + 
StringViewArray + ); + } +} diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 40d3a4d13e97..1833971603ce 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -167,7 +167,7 @@ fn get_true_start_end(input: &str, start: i64, count: Option) -> (usize, us /// Make a `u128` based on the given substr, start(offset to view.offset), and /// push into to the given buffers -fn make_and_append_view( +pub fn make_and_append_view( views_buffer: &mut Vec, null_builder: &mut NullBufferBuilder, raw: &u128, From a0da2d0b31495ab9736935464b1287e4e12bfa6d Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 10 Sep 2024 00:08:23 +0800 Subject: [PATCH 02/20] add unit tests for xTrim. --- datafusion/functions/src/string/btrim.rs | 137 +++++++++++++++++++++- datafusion/functions/src/string/common.rs | 2 +- datafusion/functions/src/string/ltrim.rs | 137 +++++++++++++++++++++- datafusion/functions/src/string/rtrim.rs | 116 ++++++++++++++++-- 4 files changed, 379 insertions(+), 13 deletions(-) diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 371a11c82c54..4dcfe0f3aca0 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -82,7 +82,11 @@ impl ScalarUDFImpl for BTrimFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - utf8_to_str_type(&arg_types[0], "btrim") + if arg_types[0] == DataType::Utf8View { + Ok(DataType::Utf8View) + } else { + utf8_to_str_type(&arg_types[0], "btrim") + } } fn invoke(&self, args: &[ColumnarValue]) -> Result { @@ -106,3 +110,134 @@ impl ScalarUDFImpl for BTrimFunc { &self.aliases } } + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray, StringViewArray}; + use arrow::datatypes::DataType::{Utf8View, Utf8}; + + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use 
crate::string::btrim::BTrimFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() { + test_function!( + BTrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::from("alphabet ") + ))),], + Ok(Some("alphabet")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + BTrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::from(" alphabet ") + ))),], + Ok(Some("alphabet")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + BTrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabet" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("t")))), + ], + Ok(Some("alphabe")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + BTrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabet" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabe" + )))), + ], + Ok(Some("t")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + BTrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabet" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(None)), + ], + Ok(None), + &str, + Utf8View, + StringViewArray + ); + test_function!( + BTrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + String::from("alphabet ") + ))),], + Ok(Some("alphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + BTrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + String::from("alphabet ") + ))),], + Ok(Some("alphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + BTrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("t")))), + ], + Ok(Some("alphabe")), + &str, + Utf8, + StringArray + ); + test_function!( + BTrimFunc::new(), + &[ + 
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabe")))), + ], + Ok(Some("t")), + &str, + Utf8, + StringArray + ); + test_function!( + BTrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + } +} \ No newline at end of file diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index e2b69b58ff01..dd40f785c153 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -121,7 +121,7 @@ fn string_view_trim<'a, T: OffsetSizeTrait>( if characters_array.is_null(0) { return Ok(new_null_array( // The schema is expecting utf8 as null - &DataType::Utf8, + &DataType::Utf8View, string_view_array.len(), )); } diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index b7b27afcee1f..6e8482966122 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -81,7 +81,11 @@ impl ScalarUDFImpl for LtrimFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - utf8_to_str_type(&arg_types[0], "ltrim") + if arg_types[0] == DataType::Utf8View { + Ok(DataType::Utf8View) + } else { + utf8_to_str_type(&arg_types[0], "ltrim") + } } fn invoke(&self, args: &[ColumnarValue]) -> Result { @@ -101,3 +105,134 @@ impl ScalarUDFImpl for LtrimFunc { } } } + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray, StringViewArray}; + use arrow::datatypes::DataType::{Utf8View, Utf8}; + + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::string::ltrim::LtrimFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() { + test_function!( + LtrimFunc::new(), + 
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::from("alphabet ") + ))),], + Ok(Some("alphabet ")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + LtrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::from(" alphabet ") + ))),], + Ok(Some("alphabet ")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + LtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabet" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("t")))), + ], + Ok(Some("alphabet")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + LtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabet" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabe" + )))), + ], + Ok(Some("t")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + LtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabet" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(None)), + ], + Ok(None), + &str, + Utf8View, + StringViewArray + ); + test_function!( + LtrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + String::from("alphabet ") + ))),], + Ok(Some("alphabet ")), + &str, + Utf8, + StringArray + ); + test_function!( + LtrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + String::from("alphabet ") + ))),], + Ok(Some("alphabet ")), + &str, + Utf8, + StringArray + ); + test_function!( + LtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("t")))), + ], + Ok(Some("alphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + LtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabe")))), + ], + Ok(Some("t")), + &str, + 
Utf8, + StringArray + ); + test_function!( + LtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); + } +} diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index 52d0826137fa..7aeb12b99e28 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -81,7 +81,11 @@ impl ScalarUDFImpl for RtrimFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - utf8_to_str_type(&arg_types[0], "rtrim") + if arg_types[0] == DataType::Utf8View { + Ok(DataType::Utf8View) + } else { + utf8_to_str_type(&arg_types[0], "rtrim") + } } fn invoke(&self, args: &[ColumnarValue]) -> Result { @@ -105,39 +109,131 @@ impl ScalarUDFImpl for RtrimFunc { #[cfg(test)] mod tests { use arrow::array::{Array, StringArray, StringViewArray}; - use arrow::datatypes::DataType::{Utf8, Utf8View}; + use arrow::datatypes::DataType::{Utf8View, Utf8}; + - use datafusion_common::{exec_err, Result, ScalarValue}; + use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; - use crate::unicode::substr::SubstrFunc; + use crate::string::rtrim::RtrimFunc; use crate::utils::test::test_function; #[test] fn test_functions() { test_function!( - SubstrFunc::new(), + RtrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::from("alphabet ") + ))),], + Ok(Some("alphabet")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + RtrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + String::from(" alphabet ") + ))),], + Ok(Some(" alphabet")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + RtrimFunc::new(), &[ - ColumnarValue::Scalar(ScalarValue::Utf8View(None)), - ColumnarValue::Scalar(ScalarValue::from(1i64)), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( 
+ "alphabet" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("t ")))), ], - Ok(None), + Ok(Some("alphabe")), &str, Utf8View, StringViewArray ); test_function!( - SubstrFunc::new(), + RtrimFunc::new(), &[ ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( "alphabet" )))), - ColumnarValue::Scalar(ScalarValue::from(0i64)), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabe" + )))), ], Ok(Some("alphabet")), &str, Utf8View, StringViewArray ); + test_function!( + RtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabet" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(None)), + ], + Ok(None), + &str, + Utf8View, + StringViewArray + ); + + test_function!( + RtrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + String::from("alphabet ") + ))),], + Ok(Some("alphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + RtrimFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + String::from(" alphabet ") + ))),], + Ok(Some(" alphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + RtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("t ")))), + ], + Ok(Some("alphabe")), + &str, + Utf8, + StringArray + ); + test_function!( + RtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabe")))), + ], + Ok(Some("alphabet")), + &str, + Utf8, + StringArray + ); + test_function!( + RtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ], + Ok(None), + &str, + Utf8, + StringArray + ); } } From 3c8b035bddad045f441d62cb142fc8162ce4484d Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 10 Sep 2024 01:01:36 +0800 Subject: [PATCH 03/20] 
fix fmt. --- datafusion/functions/src/string/btrim.rs | 5 ++--- datafusion/functions/src/string/ltrim.rs | 3 +-- datafusion/functions/src/string/rtrim.rs | 3 +-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 4dcfe0f3aca0..05f9b227fb5d 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -114,8 +114,7 @@ impl ScalarUDFImpl for BTrimFunc { #[cfg(test)] mod tests { use arrow::array::{Array, StringArray, StringViewArray}; - use arrow::datatypes::DataType::{Utf8View, Utf8}; - + use arrow::datatypes::DataType::{Utf8, Utf8View}; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; @@ -240,4 +239,4 @@ mod tests { StringArray ); } -} \ No newline at end of file +} diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 6e8482966122..123ecc33ce2f 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -109,8 +109,7 @@ impl ScalarUDFImpl for LtrimFunc { #[cfg(test)] mod tests { use arrow::array::{Array, StringArray, StringViewArray}; - use arrow::datatypes::DataType::{Utf8View, Utf8}; - + use arrow::datatypes::DataType::{Utf8, Utf8View}; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index 7aeb12b99e28..51df77e402b0 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -109,8 +109,7 @@ impl ScalarUDFImpl for RtrimFunc { #[cfg(test)] mod tests { use arrow::array::{Array, StringArray, StringViewArray}; - use arrow::datatypes::DataType::{Utf8View, Utf8}; - + use arrow::datatypes::DataType::{Utf8, Utf8View}; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ColumnarValue, 
ScalarUDFImpl}; From 06d104dbe54ea064c64fed1c053eb8b471fad729 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 10 Sep 2024 01:17:55 +0800 Subject: [PATCH 04/20] tmp copy for ci. --- datafusion/functions/src/string/common.rs | 28 +++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index dd40f785c153..3770fd5b96de 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -21,9 +21,9 @@ use std::fmt::{Display, Formatter}; use std::sync::Arc; use arrow::array::{ - new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, - GenericStringArray, GenericStringBuilder, LargeStringArray, OffsetSizeTrait, - StringArray, StringBuilder, StringViewArray, StringViewBuilder, + make_view, new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, + ArrayRef, ByteView, GenericStringArray, GenericStringBuilder, LargeStringArray, + OffsetSizeTrait, StringArray, StringBuilder, StringViewArray, StringViewBuilder, }; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; @@ -33,7 +33,27 @@ use datafusion_common::Result; use datafusion_common::{exec_err, ScalarValue}; use datafusion_expr::ColumnarValue; -use crate::unicode::substr::make_and_append_view; +/// Make a `u128` based on the given substr, start(offset to view.offset), and +/// push into to the given buffers +// TODO: tmp copy, remove after #12383 is merged +fn make_and_append_view( + views_buffer: &mut Vec, + null_builder: &mut NullBufferBuilder, + raw: &u128, + substr: &str, + start: u32, +) { + let substr_len = substr.len(); + let sub_view = if substr_len > 12 { + let view = ByteView::from(*raw); + make_view(substr.as_bytes(), view.buffer_index, view.offset + start) + } else { + // inline value does not need block id or offset + make_view(substr.as_bytes(), 0, 0) + }; + views_buffer.push(sub_view); + 
null_builder.append_non_null(); +} pub(crate) enum TrimType { Left, From 48cb4db7902a84ab42dd75e13e84f1a0e47a5d18 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 11 Sep 2024 09:37:09 +0800 Subject: [PATCH 05/20] move `make_and_append_view` to common. --- datafusion/functions/src/string/common.rs | 3 +-- datafusion/functions/src/unicode/substr.rs | 26 +++------------------- 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 3770fd5b96de..2585822d14ba 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -35,8 +35,7 @@ use datafusion_expr::ColumnarValue; /// Make a `u128` based on the given substr, start(offset to view.offset), and /// push into to the given buffers -// TODO: tmp copy, remove after #12383 is merged -fn make_and_append_view( +pub(crate) fn make_and_append_view( views_buffer: &mut Vec, null_builder: &mut NullBufferBuilder, raw: &u128, diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 1833971603ce..8376b15a13f4 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -19,10 +19,11 @@ use std::any::Any; use std::cmp::max; use std::sync::Arc; +use crate::string::common::make_and_append_view; use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ - make_view, Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, ByteView, - GenericStringArray, OffsetSizeTrait, StringViewArray, + Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, + OffsetSizeTrait, StringViewArray, }; use arrow::datatypes::DataType; use arrow_buffer::{NullBufferBuilder, ScalarBuffer}; @@ -165,27 +166,6 @@ fn get_true_start_end(input: &str, start: i64, count: Option) -> (usize, us (st, ed) } -/// Make a `u128` based on the given substr, start(offset to view.offset), and -/// push into to 
the given buffers -pub fn make_and_append_view( - views_buffer: &mut Vec, - null_builder: &mut NullBufferBuilder, - raw: &u128, - substr: &str, - start: u32, -) { - let substr_len = substr.len(); - let sub_view = if substr_len > 12 { - let view = ByteView::from(*raw); - make_view(substr.as_bytes(), view.buffer_index, view.offset + start) - } else { - // inline value does not need block id or offset - make_view(substr.as_bytes(), 0, 0) - }; - views_buffer.push(sub_view); - null_builder.append_non_null(); -} - // The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44 // From for ByteView fn string_view_substr( From 863e9b77ff035f6f8cbfbbcfbdf02a21e6ca1a52 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 11 Sep 2024 21:08:51 +0800 Subject: [PATCH 06/20] fix sting view trim about the process of empty string. --- datafusion/functions/src/string/common.rs | 132 +++++++++++----------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 2585822d14ba..11fb1c685996 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -38,13 +38,13 @@ use datafusion_expr::ColumnarValue; pub(crate) fn make_and_append_view( views_buffer: &mut Vec, null_builder: &mut NullBufferBuilder, - raw: &u128, + raw_view: &u128, substr: &str, start: u32, ) { let substr_len = substr.len(); let sub_view = if substr_len > 12 { - let view = ByteView::from(*raw); + let view = ByteView::from(*raw_view); make_view(substr.as_bytes(), view.buffer_index, view.offset + start) } else { // inline value does not need block id or offset @@ -101,8 +101,8 @@ pub(crate) fn general_trim( } // removing 'a will cause compiler complaining lifetime of `func` -fn string_view_trim<'a, T: OffsetSizeTrait>( - func: fn(&'a str, &'a str) -> &'a str, +fn string_view_trim<'a>( + trim_func: fn(&'a str, &'a str) -> &'a str, args: &'a [ArrayRef], ) -> Result { let 
string_view_array = as_string_view_array(&args[0])?; @@ -111,25 +111,28 @@ fn string_view_trim<'a, T: OffsetSizeTrait>( match args.len() { 1 => { - for (idx, raw) in string_view_array.views().iter().enumerate() { - unsafe { - // Safety: - // idx is always smaller or equal to string_view_array.views.len() - let origin_str = string_view_array.value_unchecked(idx); - let trim_str = func(origin_str, " "); + let array_iter = string_view_array.iter(); + let views_iter = string_view_array.views().iter(); + for (src_str_opt, raw_view) in array_iter.zip(views_iter) { + if let Some(src_str) = src_str_opt { + let trim_str = trim_func(src_str, " "); // Safety: // `trim_str` is computed from `str::trim_xxx_matches`, // and its addr is ensured to be >= `origin_str`'s - let start = trim_str.as_ptr().offset_from(origin_str.as_ptr()) as u32; + let start = + unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 }; make_and_append_view( &mut views_buf, &mut null_builder, - raw, + raw_view, trim_str, start, ); + } else { + null_builder.append_null(); + views_buf.push(0); } } } @@ -137,6 +140,7 @@ fn string_view_trim<'a, T: OffsetSizeTrait>( let characters_array = as_string_view_array(&args[1])?; if characters_array.len() == 1 { + // Only one `trim characters` exist if characters_array.is_null(0) { return Ok(new_null_array( // The schema is expecting utf8 as null @@ -146,61 +150,34 @@ fn string_view_trim<'a, T: OffsetSizeTrait>( } let characters = characters_array.value(0); - - for (idx, raw) in string_view_array.views().iter().enumerate() { - unsafe { - // Safety: - // idx is always smaller or equal to string_view_array.views.len() - let origin_str = string_view_array.value_unchecked(idx); - let trim_str = func(origin_str, characters); - - // Safety: - // `trim_str` is computed from `str::trim_xxx_matches`, - // and its addr is ensured to be >= `origin_str`'s - let start = - trim_str.as_ptr().offset_from(origin_str.as_ptr()) as u32; - - make_and_append_view( - &mut 
views_buf, - &mut null_builder, - raw, - trim_str, - start, - ); - } + let array_iter = string_view_array.iter(); + let views_iter = string_view_array.views().iter(); + for (src_str_opt, raw_view) in array_iter.zip(views_iter) { + trim_and_append_str( + src_str_opt, + Some(characters), + trim_func, + &mut views_buf, + &mut null_builder, + raw_view, + ); } - } - - for (idx, (raw, characters_opt)) in string_view_array - .views() - .iter() - .zip(characters_array.iter()) - .enumerate() - { - if let Some(characters) = characters_opt { - unsafe { - // Safety: - // idx is always smaller or equal to string_view_array.views.len() - let origin_str = string_view_array.value_unchecked(idx); - let trim_str = func(origin_str, characters); - - // Safety: - // `trim_str` is computed from `str::trim_xxx_matches`, - // and its addr is ensured to be >= `origin_str`'s - let start = - trim_str.as_ptr().offset_from(origin_str.as_ptr()) as u32; - - make_and_append_view( - &mut views_buf, - &mut null_builder, - raw, - trim_str, - start, - ); - } - } else { - null_builder.append_null(); - views_buf.push(0); + } else { + // A specific `trim characters` for a row in the string view array + let characters_iter = characters_array.iter(); + let array_iter = string_view_array.iter(); + let views_iter = string_view_array.views().iter(); + for ((src_str_opt, raw_view), characters_opt) in + array_iter.zip(views_iter).zip(characters_iter) + { + trim_and_append_str( + src_str_opt, + characters_opt, + trim_func, + &mut views_buf, + &mut null_builder, + raw_view, + ); } } } @@ -228,6 +205,29 @@ fn string_view_trim<'a, T: OffsetSizeTrait>( } } +fn trim_and_append_str<'a>( + src_str_opt: Option<&'a str>, + trim_characters_opt: Option<&'a str>, + trim_func: fn(&'a str, &'a str) -> &'a str, + views_buf: &mut Vec, + null_builder: &mut NullBufferBuilder, + raw: &u128, +) { + if let (Some(src_str), Some(characters)) = (src_str_opt, trim_characters_opt) { + let trim_str = trim_func(src_str, characters); + + 
// Safety: + // `trim_str` is computed from `str::trim_xxx_matches`, + // and its addr is ensured to be >= `origin_str`'s + let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 }; + + make_and_append_view(views_buf, null_builder, raw, trim_str, start); + } else { + null_builder.append_null(); + views_buf.push(0); + } +} + fn string_trim<'a, T: OffsetSizeTrait>( func: fn(&'a str, &'a str) -> &'a str, args: &'a [ArrayRef], From 36a812512f34a54bc8536c51ef7ae183a3bb5c85 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 11 Sep 2024 21:12:09 +0800 Subject: [PATCH 07/20] fix compile. --- datafusion/functions/src/string/common.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 11fb1c685996..e84da48dc5b2 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -94,7 +94,7 @@ pub(crate) fn general_trim( }; if use_string_view { - string_view_trim::(func, args) + string_view_trim(func, args) } else { string_trim::(func, args) } From aa2c131ad319d21870c202b3996f0768c7cf2319 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 11 Sep 2024 21:21:58 +0800 Subject: [PATCH 08/20] eliminate some repeated codes. 
--- datafusion/functions/src/string/common.rs | 28 +++++++---------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index e84da48dc5b2..4f70374b74e3 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -114,26 +114,14 @@ fn string_view_trim<'a>( let array_iter = string_view_array.iter(); let views_iter = string_view_array.views().iter(); for (src_str_opt, raw_view) in array_iter.zip(views_iter) { - if let Some(src_str) = src_str_opt { - let trim_str = trim_func(src_str, " "); - - // Safety: - // `trim_str` is computed from `str::trim_xxx_matches`, - // and its addr is ensured to be >= `origin_str`'s - let start = - unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 }; - - make_and_append_view( - &mut views_buf, - &mut null_builder, - raw_view, - trim_str, - start, - ); - } else { - null_builder.append_null(); - views_buf.push(0); - } + trim_and_append_str( + src_str_opt, + Some(" "), + trim_func, + &mut views_buf, + &mut null_builder, + raw_view, + ); } } 2 => { From e3e9b5307c96d054d2405ebf531b678e82c0796e Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 17 Sep 2024 20:26:10 +0800 Subject: [PATCH 09/20] add sql test case about string view trim. 
--- .../sqllogictest/test_files/string_view.slt | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 171b8ec6c1d1..413baaac6f01 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -664,6 +664,19 @@ Xiangpeng Xiangpeng Xiangpeng NULL Raphael Raphael Raphael NULL NULL NULL NULL NULL +query TTTT +SELECT + arrow_typeof(BTRIM(column1_utf8view, 'foo')) AS t1, + arrow_typeof(BTRIM(column1_utf8view, 'A')) AS t2, + arrow_typeof(BTRIM(column1_utf8view)) AS t3, + arrow_typeof(BTRIM(column1_utf8view, NULL)) AS t4 +FROM test; +---- +Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View + ## Ensure no casts for LTRIM # Test LTRIM with Utf8View input query TT @@ -710,6 +723,20 @@ Xiangpeng (empty) Xiangpeng NULL peng Raphael aphael Raphael NULL Raphael NULL NULL NULL NULL NULL +query TTTTT +SELECT + arrow_typeof(LTRIM(column1_utf8view, 'foo')) AS t1, + arrow_typeof(LTRIM(column1_utf8view, column2_utf8view)) AS t2, + arrow_typeof(LTRIM(column1_utf8view)) AS t3, + arrow_typeof(LTRIM(column1_utf8view, NULL)) AS t4, + arrow_typeof(LTRIM(column1_utf8view, 'Xiang')) AS t5 +FROM test; +---- +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View + ## ensure no casts for RTRIM # Test RTRIM with Utf8View input query TT @@ -756,6 +783,19 @@ Xiangpeng (empty) Xiangpeng NULL Xia Raphael Raphael Raphael NULL Raphael NULL NULL NULL NULL NULL +query TTTTT +SELECT + arrow_typeof(RTRIM(column1_utf8view, 'foo')) AS t1, + arrow_typeof(RTRIM(column1_utf8view, column2_utf8view)) AS t2, + arrow_typeof(RTRIM(column1_utf8view)) AS t3, + arrow_typeof(RTRIM(column1_utf8view, NULL)) AS t4, + 
arrow_typeof(RTRIM(column1_utf8view, 'peng')) As t5 +FROM test; +---- +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View ## Ensure no casts for CHARACTER_LENGTH query TT From 6d5660ffbd895304fa52868ec715006664fbfcf7 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 03:07:13 +0800 Subject: [PATCH 10/20] remove unused imports. --- datafusion/functions/src/unicode/substr.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index fe1af53452b0..934fa4bfe135 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -21,8 +21,8 @@ use std::sync::Arc; use crate::string::common::{make_and_append_view, StringArrayType}; use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ - make_view, Array, ArrayIter, ArrayRef, AsArray, ByteView, GenericStringArray, - Int64Array, OffsetSizeTrait, StringViewArray, + Array, ArrayIter, ArrayRef, AsArray, GenericStringArray, Int64Array, OffsetSizeTrait, + StringViewArray, }; use arrow::datatypes::DataType; use arrow_buffer::{NullBufferBuilder, ScalarBuffer}; From 522c87f771789a75589b0deb4e708c5f38178706 Mon Sep 17 00:00:00 2001 From: kamille Date: Mon, 23 Sep 2024 02:44:02 +0800 Subject: [PATCH 11/20] fix tests. 
--- .../test_files/string/string_query.slt.part | 41 --------- .../test_files/string/string_view.slt | 88 +++++++++++++++++++ 2 files changed, 88 insertions(+), 41 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 8f9e123c1fba..f22edae01de9 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -375,19 +375,6 @@ Xiangpeng Xiangpeng NULL datafusion数据融合 datafusion数据融合 NULL Raphael Raphael NULL datafusionДатаФусион datafusionДатаФусион NULL NULL NULL NULL NULL NULL NULL -query TTTT -SELECT - arrow_typeof(BTRIM(column1_utf8view, 'foo')) AS t1, - arrow_typeof(BTRIM(column1_utf8view, 'A')) AS t2, - arrow_typeof(BTRIM(column1_utf8view)) AS t3, - arrow_typeof(BTRIM(column1_utf8view, NULL)) AS t4 -FROM test; ----- -Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View - # -------------------------------------- # Test LTRIM # -------------------------------------- @@ -408,20 +395,6 @@ Xiangpeng (empty) NULL datafusion数据融合 NULL datafusion数据融合 Raphael aphael NULL datafusionДатаФусион NULL datafusionДатаФусион NULL NULL NULL NULL NULL NULL -query TTTTT -SELECT - arrow_typeof(LTRIM(column1_utf8view, 'foo')) AS t1, - arrow_typeof(LTRIM(column1_utf8view, column2_utf8view)) AS t2, - arrow_typeof(LTRIM(column1_utf8view)) AS t3, - arrow_typeof(LTRIM(column1_utf8view, NULL)) AS t4, - arrow_typeof(LTRIM(column1_utf8view, 'Xiang')) AS t5 -FROM test; ----- -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View - # -------------------------------------- # Test RTRIM # -------------------------------------- @@ -441,20 +414,6 @@ Xiangpeng (empty) Xiangpeng NULL 
datafusion数据融合 Raphael Raphael Raphael NULL datafusionДатаФусион NULL NULL NULL NULL NULL -query TTTTT -SELECT - arrow_typeof(RTRIM(column1_utf8view, 'foo')) AS t1, - arrow_typeof(RTRIM(column1_utf8view, column2_utf8view)) AS t2, - arrow_typeof(RTRIM(column1_utf8view)) AS t3, - arrow_typeof(RTRIM(column1_utf8view, NULL)) AS t4, - arrow_typeof(RTRIM(column1_utf8view, 'peng')) As t5 -FROM test; ----- -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View - # -------------------------------------- # Test CONTAINS # -------------------------------------- diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index da7d99cc311b..d9413fa99884 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -998,5 +998,93 @@ Xiangpengfoo fooXiangpeng Raphaelfoo fooRaphael NULL NULL +################################################ +# Test for string view trim +################################################ +# Test BTRIM outputs +query TTTT +SELECT + BTRIM(column1_utf8view, 'foo') AS l1, + BTRIM(column1_utf8view, 'A') AS l2, + BTRIM(column1_utf8view) AS l3, + BTRIM(column1_utf8view, NULL) AS l4 +FROM test; +---- +Andrew ndrew Andrew NULL +Xiangpeng Xiangpeng Xiangpeng NULL +Raphael Raphael Raphael NULL +NULL NULL NULL NULL + +query TTTT +SELECT + arrow_typeof(BTRIM(column1_utf8view, 'foo')) AS t1, + arrow_typeof(BTRIM(column1_utf8view, 'A')) AS t2, + arrow_typeof(BTRIM(column1_utf8view)) AS t3, + arrow_typeof(BTRIM(column1_utf8view, NULL)) AS t4 +FROM test; +---- +Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View + +# Test LTRIM outputs +query TTTTT +SELECT + LTRIM(column1_utf8view, 'foo') AS l1, + 
LTRIM(column1_utf8view, column2_utf8view) AS l2, + LTRIM(column1_utf8view) AS l3, + LTRIM(column1_utf8view, NULL) AS l4, + LTRIM(column1_utf8view, 'Xiang') AS l5 +FROM test; +---- +Andrew Andrew Andrew NULL Andrew +Xiangpeng (empty) Xiangpeng NULL peng +Raphael aphael Raphael NULL Raphael +NULL NULL NULL NULL NULL + +query TTTTT +SELECT + arrow_typeof(LTRIM(column1_utf8view, 'foo')) AS t1, + arrow_typeof(LTRIM(column1_utf8view, column2_utf8view)) AS t2, + arrow_typeof(LTRIM(column1_utf8view)) AS t3, + arrow_typeof(LTRIM(column1_utf8view, NULL)) AS t4, + arrow_typeof(LTRIM(column1_utf8view, 'Xiang')) AS t5 +FROM test; +---- +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View + +# Test RTRIM outputs +query TTTTT +SELECT + RTRIM(column1_utf8view, 'foo') AS l1, + RTRIM(column1_utf8view, column2_utf8view) AS l2, + RTRIM(column1_utf8view) AS l3, + RTRIM(column1_utf8view, NULL) AS l4, + RTRIM(column1_utf8view, 'peng') As l5 +FROM test; +---- +Andrew Andrew Andrew NULL Andrew +Xiangpeng (empty) Xiangpeng NULL Xia +Raphael Raphael Raphael NULL Raphael +NULL NULL NULL NULL NULL + +query TTTTT +SELECT + arrow_typeof(RTRIM(column1_utf8view, 'foo')) AS t1, + arrow_typeof(RTRIM(column1_utf8view, column2_utf8view)) AS t2, + arrow_typeof(RTRIM(column1_utf8view)) AS t3, + arrow_typeof(RTRIM(column1_utf8view, NULL)) AS t4, + arrow_typeof(RTRIM(column1_utf8view, 'peng')) As t5 +FROM test; +---- +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View +Utf8View Utf8View Utf8View Utf8View Utf8View + statement ok drop table test From 840ec46ef00d2ffb2accd3977c768d0059f6b5ca Mon Sep 17 00:00:00 2001 From: kamille Date: Mon, 23 Sep 2024 02:46:00 +0800 Subject: [PATCH 12/20] remove stale file. 
--- .../sqllogictest/test_files/string_view.slt | 1133 ----------------- 1 file changed, 1133 deletions(-) delete mode 100644 datafusion/sqllogictest/test_files/string_view.slt diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt deleted file mode 100644 index 7ac4d8fb7bb8..000000000000 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ /dev/null @@ -1,1133 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -######## -## Test setup -######## - -statement ok -create table test_source as values - ('Andrew', 'X'), - ('Xiangpeng', 'Xiangpeng'), - ('Raphael', 'R'), - (NULL, 'R'); - -# Table with the different combination of column types -statement ok -create table test as -SELECT - arrow_cast(column1, 'Utf8') as column1_utf8, - arrow_cast(column2, 'Utf8') as column2_utf8, - arrow_cast(column1, 'LargeUtf8') as column1_large_utf8, - arrow_cast(column2, 'LargeUtf8') as column2_large_utf8, - arrow_cast(column1, 'Utf8View') as column1_utf8view, - arrow_cast(column2, 'Utf8View') as column2_utf8view, - arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1_dict, - arrow_cast(column2, 'Dictionary(Int32, Utf8)') as column2_dict -FROM test_source; - -statement ok -drop table test_source - -statement ok -set datafusion.explain.logical_plan_only = true; - -# Ensure string functions use native StringView implementation -# and do not fall back to Utf8 or LargeUtf8 -# Should see no casts to Utf8 in the plans below - -## Ensure no casts for LIKE/ILIKE -query TT -EXPLAIN SELECT - column1_utf8view like 'foo' as "like", - column1_utf8view ilike 'foo' as "ilike" -FROM test; ----- -logical_plan -01)Projection: test.column1_utf8view LIKE Utf8View("foo") AS like, test.column1_utf8view ILIKE Utf8View("foo") AS ilike -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for SUBSTR - -query TT -EXPLAIN SELECT - SUBSTR(column1_utf8view, 1, 3) as c1, - SUBSTR(column2_utf8, 1, 3) as c2, - SUBSTR(column2_large_utf8, 1, 3) as c3 -FROM test; ----- -logical_plan -01)Projection: substr(test.column1_utf8view, Int64(1), Int64(3)) AS c1, substr(test.column2_utf8, Int64(1), Int64(3)) AS c2, substr(test.column2_large_utf8, Int64(1), Int64(3)) AS c3 -02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view] - -query TTT -SELECT - SUBSTR(column1_utf8view, 1, 3) as c1, - SUBSTR(column2_utf8, 1, 3) as c2, - SUBSTR(column2_large_utf8, 1, 3) as c3 -FROM test; ----- 
-And X X -Xia Xia Xia -Rap R R -NULL R R - -## Ensure no casts for ASCII - -### ASCII - -# Test ASCII with utf8view against utf8view, utf8, and largeutf8 -# (should be no casts) -query TT -EXPLAIN SELECT - ASCII(column1_utf8view) as c1, - ASCII(column2_utf8) as c2, - ASCII(column2_large_utf8) as c3 -FROM test; ----- -logical_plan -01)Projection: ascii(test.column1_utf8view) AS c1, ascii(test.column2_utf8) AS c2, ascii(test.column2_large_utf8) AS c3 -02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view] - -query III -SELECT - ASCII(column1_utf8view) as c1, - ASCII(column2_utf8) as c2, - ASCII(column2_large_utf8) as c3 -FROM test; ----- -65 88 88 -88 88 88 -82 82 82 -NULL 82 82 - -query TT -EXPLAIN SELECT - ASCII(column1_utf8) as c1, - ASCII(column1_large_utf8) as c2, - ASCII(column2_utf8view) as c3, - ASCII('hello') as c4, - ASCII(arrow_cast('world', 'Utf8View')) as c5 -FROM test; ----- -logical_plan -01)Projection: ascii(test.column1_utf8) AS c1, ascii(test.column1_large_utf8) AS c2, ascii(test.column2_utf8view) AS c3, Int32(104) AS c4, Int32(119) AS c5 -02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column2_utf8view] - -query IIIII -SELECT - ASCII(column1_utf8) as c1, - ASCII(column1_large_utf8) as c2, - ASCII(column2_utf8view) as c3, - ASCII('hello') as c4, - ASCII(arrow_cast('world', 'Utf8View')) as c5 -FROM test; ----- -65 65 88 104 119 -88 88 88 104 119 -82 82 82 104 119 -NULL NULL 82 104 119 - -# Test ASCII with literals cast to Utf8View -query TT -EXPLAIN SELECT - ASCII(arrow_cast('äöüß', 'Utf8View')) as c1, - ASCII(arrow_cast('', 'Utf8View')) as c2, - ASCII(arrow_cast(NULL, 'Utf8View')) as c3 -FROM test; ----- -logical_plan -01)Projection: Int32(228) AS c1, Int32(0) AS c2, Int32(NULL) AS c3 -02)--TableScan: test projection=[] - -query III -SELECT - ASCII(arrow_cast('äöüß', 'Utf8View')) as c1, - ASCII(arrow_cast('', 'Utf8View')) as c2, - ASCII(arrow_cast(NULL, 'Utf8View')) as c3 ----- -228 0 NULL - -## 
Ensure no casts for BTRIM -# Test BTRIM with Utf8View input -query TT -EXPLAIN SELECT - BTRIM(column1_utf8view) AS l -FROM test; ----- -logical_plan -01)Projection: btrim(test.column1_utf8view) AS l -02)--TableScan: test projection=[column1_utf8view] - -# Test BTRIM with Utf8View input and Utf8View pattern -query TT -EXPLAIN SELECT - BTRIM(column1_utf8view, 'foo') AS l -FROM test; ----- -logical_plan -01)Projection: btrim(test.column1_utf8view, Utf8View("foo")) AS l -02)--TableScan: test projection=[column1_utf8view] - -# Test BTRIM with Utf8View bytes longer than 12 -query TT -EXPLAIN SELECT - BTRIM(column1_utf8view, 'this is longer than 12') AS l -FROM test; ----- -logical_plan -01)Projection: btrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l -02)--TableScan: test projection=[column1_utf8view] - -# Test BTRIM outputs -query TTTT -SELECT - BTRIM(column1_utf8view, 'foo') AS l1, - BTRIM(column1_utf8view, 'A') AS l2, - BTRIM(column1_utf8view) AS l3, - BTRIM(column1_utf8view, NULL) AS l4 -FROM test; ----- -Andrew ndrew Andrew NULL -Xiangpeng Xiangpeng Xiangpeng NULL -Raphael Raphael Raphael NULL -NULL NULL NULL NULL - -query TTTT -SELECT - arrow_typeof(BTRIM(column1_utf8view, 'foo')) AS t1, - arrow_typeof(BTRIM(column1_utf8view, 'A')) AS t2, - arrow_typeof(BTRIM(column1_utf8view)) AS t3, - arrow_typeof(BTRIM(column1_utf8view, NULL)) AS t4 -FROM test; ----- -Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View - -## Ensure no casts for LTRIM -# Test LTRIM with Utf8View input -query TT -EXPLAIN SELECT - LTRIM(column1_utf8view) AS l -FROM test; ----- -logical_plan -01)Projection: ltrim(test.column1_utf8view) AS l -02)--TableScan: test projection=[column1_utf8view] - -# Test LTRIM with Utf8View input and Utf8View pattern -query TT -EXPLAIN SELECT - LTRIM(column1_utf8view, 'foo') AS l -FROM test; ----- -logical_plan -01)Projection: ltrim(test.column1_utf8view, 
Utf8View("foo")) AS l -02)--TableScan: test projection=[column1_utf8view] - -# Test LTRIM with Utf8View bytes longer than 12 -query TT -EXPLAIN SELECT - LTRIM(column1_utf8view, 'this is longer than 12') AS l -FROM test; ----- -logical_plan -01)Projection: ltrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l -02)--TableScan: test projection=[column1_utf8view] - -# Test LTRIM outputs -query TTTTT -SELECT - LTRIM(column1_utf8view, 'foo') AS l1, - LTRIM(column1_utf8view, column2_utf8view) AS l2, - LTRIM(column1_utf8view) AS l3, - LTRIM(column1_utf8view, NULL) AS l4, - LTRIM(column1_utf8view, 'Xiang') AS l5 -FROM test; ----- -Andrew Andrew Andrew NULL Andrew -Xiangpeng (empty) Xiangpeng NULL peng -Raphael aphael Raphael NULL Raphael -NULL NULL NULL NULL NULL - -query TTTTT -SELECT - arrow_typeof(LTRIM(column1_utf8view, 'foo')) AS t1, - arrow_typeof(LTRIM(column1_utf8view, column2_utf8view)) AS t2, - arrow_typeof(LTRIM(column1_utf8view)) AS t3, - arrow_typeof(LTRIM(column1_utf8view, NULL)) AS t4, - arrow_typeof(LTRIM(column1_utf8view, 'Xiang')) AS t5 -FROM test; ----- -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View - -## ensure no casts for RTRIM -# Test RTRIM with Utf8View input -query TT -EXPLAIN SELECT - RTRIM(column1_utf8view) AS l -FROM test; ----- -logical_plan -01)Projection: rtrim(test.column1_utf8view) AS l -02)--TableScan: test projection=[column1_utf8view] - -# Test RTRIM with Utf8View input and Utf8View pattern -query TT -EXPLAIN SELECT - RTRIM(column1_utf8view, 'foo') AS l -FROM test; ----- -logical_plan -01)Projection: rtrim(test.column1_utf8view, Utf8View("foo")) AS l -02)--TableScan: test projection=[column1_utf8view] - -# Test RTRIM with Utf8View bytes longer than 12 -query TT -EXPLAIN SELECT - RTRIM(column1_utf8view, 'this is longer than 12') AS l -FROM test; ----- -logical_plan -01)Projection: 
rtrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l -02)--TableScan: test projection=[column1_utf8view] - -# Test RTRIM outputs -query TTTTT -SELECT - RTRIM(column1_utf8view, 'foo') AS l1, - RTRIM(column1_utf8view, column2_utf8view) AS l2, - RTRIM(column1_utf8view) AS l3, - RTRIM(column1_utf8view, NULL) AS l4, - RTRIM(column1_utf8view, 'peng') As l5 -FROM test; ----- -Andrew Andrew Andrew NULL Andrew -Xiangpeng (empty) Xiangpeng NULL Xia -Raphael Raphael Raphael NULL Raphael -NULL NULL NULL NULL NULL - -query TTTTT -SELECT - arrow_typeof(RTRIM(column1_utf8view, 'foo')) AS t1, - arrow_typeof(RTRIM(column1_utf8view, column2_utf8view)) AS t2, - arrow_typeof(RTRIM(column1_utf8view)) AS t3, - arrow_typeof(RTRIM(column1_utf8view, NULL)) AS t4, - arrow_typeof(RTRIM(column1_utf8view, 'peng')) As t5 -FROM test; ----- -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View - -## Ensure no casts for CHARACTER_LENGTH -query TT -EXPLAIN SELECT - CHARACTER_LENGTH(column1_utf8view) AS l -FROM test; ----- -logical_plan -01)Projection: character_length(test.column1_utf8view) AS l -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for CONCAT Utf8View -query TT -EXPLAIN SELECT - concat(column1_utf8view, column2_utf8view) as c -FROM test; ----- -logical_plan -01)Projection: concat(test.column1_utf8view, test.column2_utf8view) AS c -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -## Ensure no casts for CONCAT LargeUtf8 -query TT -EXPLAIN SELECT - concat(column1_large_utf8, column2_large_utf8) as c -FROM test; ----- -logical_plan -01)Projection: concat(test.column1_large_utf8, test.column2_large_utf8) AS c -02)--TableScan: test projection=[column1_large_utf8, column2_large_utf8] - -## Ensure no casts for CONCAT_WS -## TODO https://github.com/apache/datafusion/issues/11837 -query TT -EXPLAIN SELECT 
- concat_ws(', ', column1_utf8view, column2_utf8view) as c -FROM test; ----- -logical_plan -01)Projection: concat_ws(Utf8(", "), test.column1_utf8view, test.column2_utf8view) AS c -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -## Ensure no casts for CONTAINS -query TT -EXPLAIN SELECT - CONTAINS(column1_utf8view, 'foo') as c1, - CONTAINS(column1_utf8view, column2_utf8view) as c2, - CONTAINS(column1_utf8view, column2_large_utf8) as c3, - CONTAINS(column1_utf8, column2_utf8view) as c4, - CONTAINS(column1_utf8, column2_utf8) as c5, - CONTAINS(column1_utf8, column2_large_utf8) as c6, - CONTAINS(column1_large_utf8, column1_utf8view) as c7, - CONTAINS(column1_large_utf8, column2_utf8) as c8, - CONTAINS(column1_large_utf8, column2_large_utf8) as c9 -FROM test; ----- -logical_plan -01)Projection: contains(test.column1_utf8view, Utf8("foo")) AS c1, contains(test.column1_utf8view, test.column2_utf8view) AS c2, contains(test.column1_utf8view, test.column2_large_utf8) AS c3, contains(test.column1_utf8, test.column2_utf8view) AS c4, contains(test.column1_utf8, test.column2_utf8) AS c5, contains(test.column1_utf8, test.column2_large_utf8) AS c6, contains(test.column1_large_utf8, test.column1_utf8view) AS c7, contains(test.column1_large_utf8, test.column2_utf8) AS c8, contains(test.column1_large_utf8, test.column2_large_utf8) AS c9 -02)--TableScan: test projection=[column1_utf8, column2_utf8, column1_large_utf8, column2_large_utf8, column1_utf8view, column2_utf8view] - -query BBBBBBBBB -SELECT - CONTAINS(column1_utf8view, 'foo') as c1, - CONTAINS(column1_utf8view, column2_utf8view) as c2, - CONTAINS(column1_utf8view, column2_large_utf8) as c3, - CONTAINS(column1_utf8, column2_utf8view) as c4, - CONTAINS(column1_utf8, column2_utf8) as c5, - CONTAINS(column1_utf8, column2_large_utf8) as c6, - CONTAINS(column1_large_utf8, column1_utf8view) as c7, - CONTAINS(column1_large_utf8, column2_utf8) as c8, - CONTAINS(column1_large_utf8, column2_large_utf8) as c9 
-FROM test; ----- -false false false false false false true false false -false true true true true true true true true -false true true true true true true true true -NULL NULL NULL NULL NULL NULL NULL NULL NULL - -## Ensure no casts for ENDS_WITH -query TT -EXPLAIN SELECT - ENDS_WITH(column1_utf8view, 'foo') as c1, - ENDS_WITH(column2_utf8view, column2_utf8view) as c2 -FROM test; ----- -logical_plan -01)Projection: ends_with(test.column1_utf8view, Utf8View("foo")) AS c1, ends_with(test.column2_utf8view, test.column2_utf8view) AS c2 -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -## Ensure no casts for LEVENSHTEIN -query TT -EXPLAIN SELECT - levenshtein(column1_utf8view, 'foo') as c1, - levenshtein(column1_utf8view, column2_utf8view) as c2 -FROM test; ----- -logical_plan -01)Projection: levenshtein(test.column1_utf8view, Utf8View("foo")) AS c1, levenshtein(test.column1_utf8view, test.column2_utf8view) AS c2 -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -## Ensure no casts for LOWER -query TT -EXPLAIN SELECT - LOWER(column1_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: lower(test.column1_utf8view) AS c1 -02)--TableScan: test projection=[column1_utf8view] - -query T -SELECT LOWER(column1_utf8view) as c1 -FROM test; ----- -andrew -xiangpeng -raphael -NULL - -## Ensure no casts for UPPER -query TT -EXPLAIN SELECT - UPPER(column1_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: upper(test.column1_utf8view) AS c1 -02)--TableScan: test projection=[column1_utf8view] - -query T -SELECT UPPER(column1_utf8view) as c1 -FROM test; ----- -ANDREW -XIANGPENG -RAPHAEL -NULL - -## Should run CONCAT successfully with utf8view -query T -SELECT - concat(column1_utf8view, column2_utf8view) as c -FROM test; ----- -AndrewX -XiangpengXiangpeng -RaphaelR -R - -## Should run CONCAT successfully with utf8 -query T -SELECT - concat(column1_utf8, column2_utf8) as c -FROM test; ----- -AndrewX -XiangpengXiangpeng 
-RaphaelR -R - -## Should run CONCAT successfully with utf8 and utf8view -query T -SELECT - concat(column1_utf8view, column2_utf8) as c -FROM test; ----- -AndrewX -XiangpengXiangpeng -RaphaelR -R - -## Should run CONCAT successfully with utf8 utf8view and largeutf8 -query T -SELECT - concat(column1_utf8view, column2_utf8, column2_large_utf8) as c -FROM test; ----- -AndrewXX -XiangpengXiangpengXiangpeng -RaphaelRR -RR - -## Should run CONCAT successfully with utf8large -query T -SELECT - concat(column1_large_utf8, column2_large_utf8) as c -FROM test; ----- -AndrewX -XiangpengXiangpeng -RaphaelR -R - -## Should run CONCAT successfully with utf8view -query T -SELECT - concat(column1_utf8view, column2_utf8view) as c -FROM test; ----- -AndrewX -XiangpengXiangpeng -RaphaelR -R - -## Should run CONCAT_WS successfully with utf8 -query T -SELECT - concat_ws(',', column1_utf8, column2_utf8) as c -FROM test; ----- -Andrew,X -Xiangpeng,Xiangpeng -Raphael,R -R - -## Should run CONCAT_WS successfully with utf8view -query T -SELECT - concat_ws(',', column1_utf8view, column2_utf8view) as c -FROM test; ----- -Andrew,X -Xiangpeng,Xiangpeng -Raphael,R -R - -## Should run CONCAT_WS successfully with largeutf8 -query T -SELECT - concat_ws(',', column1_large_utf8, column2_large_utf8) as c -FROM test; ----- -Andrew,X -Xiangpeng,Xiangpeng -Raphael,R -R - -## Should run CONCAT_WS successfully with utf8 and largeutf8 -query T -SELECT - concat_ws(',', column1_utf8, column2_large_utf8) as c -FROM test; ----- -Andrew,X -Xiangpeng,Xiangpeng -Raphael,R -R - -## Should run CONCAT_WS successfully with utf8 and utf8view -query T -SELECT - concat_ws(',', column1_utf8view, column2_utf8) as c -FROM test; ----- -Andrew,X -Xiangpeng,Xiangpeng -Raphael,R -R - -## Should run CONCAT_WS successfully with largeutf8 and utf8view -query T -SELECT - concat_ws(',', column1_utf8view, column2_large_utf8) as c -FROM test; ----- -Andrew,X -Xiangpeng,Xiangpeng -Raphael,R -R - -## Ensure no casts for LPAD -query TT 
-EXPLAIN SELECT - LPAD(column1_utf8view, 12, ' ') as c1 -FROM test; ----- -logical_plan -01)Projection: lpad(test.column1_utf8view, Int64(12), Utf8(" ")) AS c1 -02)--TableScan: test projection=[column1_utf8view] - -query TT -EXPLAIN SELECT - LPAD(column1_utf8view, 12, column2_large_utf8) as c1 -FROM test; ----- -logical_plan -01)Projection: lpad(test.column1_utf8view, Int64(12), test.column2_large_utf8) AS c1 -02)--TableScan: test projection=[column2_large_utf8, column1_utf8view] - -query TT -EXPLAIN SELECT - LPAD(column1_utf8view, 12, column2_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: lpad(test.column1_utf8view, Int64(12), test.column2_utf8view) AS c1 -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -## Ensure no casts for OCTET_LENGTH -query TT -EXPLAIN SELECT - OCTET_LENGTH(column1_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: octet_length(test.column1_utf8view) AS c1 -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for OVERLAY -query TT -EXPLAIN SELECT - OVERLAY(column1_utf8view PLACING 'foo' FROM 2 ) as c1 -FROM test; ----- -logical_plan -01)Projection: overlay(test.column1_utf8view, Utf8View("foo"), Int64(2)) AS c1 -02)--TableScan: test projection=[column1_utf8view] - -query T -SELECT OVERLAY(column1_utf8view PLACING 'foo' FROM 2 ) as c1 FROM test; ----- -Afooew -Xfoogpeng -Rfooael -NULL - -## Ensure no casts for REGEXP_LIKE -query TT -EXPLAIN SELECT - REGEXP_LIKE(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k -FROM test; ----- -logical_plan -01)Projection: regexp_like(CAST(test.column1_utf8view AS Utf8), Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for REGEXP_MATCH -query TT -EXPLAIN SELECT - REGEXP_MATCH(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k -FROM test; ----- -logical_plan -01)Projection: regexp_match(CAST(test.column1_utf8view AS Utf8), 
Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for REGEXP_REPLACE -query TT -EXPLAIN SELECT - REGEXP_REPLACE(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k -FROM test; ----- -logical_plan -01)Projection: regexp_replace(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$"), Utf8("\1")) AS k -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for REPEAT -query TT -EXPLAIN SELECT - REPEAT(column1_utf8view, 2) as c1 -FROM test; ----- -logical_plan -01)Projection: repeat(test.column1_utf8view, Int64(2)) AS c1 -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for REPLACE -query TT -EXPLAIN SELECT - REPLACE(column1_utf8view, 'foo', 'bar') as c1, - REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2 -FROM test; ----- -logical_plan -01)Projection: replace(test.column1_utf8view, Utf8View("foo"), Utf8View("bar")) AS c1, replace(test.column1_utf8view, test.column2_utf8view, Utf8View("bar")) AS c2 -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -query TT -SELECT - REPLACE(column1_utf8view, 'foo', 'bar') as c1, - REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2 -FROM test; ----- -Andrew Andrew -Xiangpeng bar -Raphael baraphael -NULL NULL - - -## Ensure no casts for REVERSE -query TT -EXPLAIN SELECT - REVERSE(column1_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: reverse(test.column1_utf8view) AS c1 -02)--TableScan: test projection=[column1_utf8view] - - -## Ensure no casts for RIGHT -query TT -EXPLAIN SELECT - RIGHT(column1_utf8view, 3) as c2 -FROM test; ----- -logical_plan -01)Projection: right(test.column1_utf8view, Int64(3)) AS c2 -02)--TableScan: test projection=[column1_utf8view] - -# Test outputs of RIGHT -query TTT -SELECT - RIGHT(column1_utf8view, 3) as c1, - RIGHT(column1_utf8view, 0) as c2, - RIGHT(column1_utf8view, -3) as c3 -FROM test; ----- -rew (empty) rew -eng (empty) 
ngpeng -ael (empty) hael -NULL NULL NULL - -## Ensure no casts for LEFT -query TT -EXPLAIN SELECT - LEFT(column1_utf8view, 3) as c2 -FROM test; ----- -logical_plan -01)Projection: left(test.column1_utf8view, Int64(3)) AS c2 -02)--TableScan: test projection=[column1_utf8view] - -# Test outputs of LEFT -query TTT -SELECT - LEFT(column1_utf8view, 3) as c1, - LEFT(column1_utf8view, 0) as c2, - LEFT(column1_utf8view, -3) as c3 -FROM test; ----- -And (empty) And -Xia (empty) Xiangp -Rap (empty) Raph -NULL NULL NULL - -## Ensure no casts for RPAD -query TT -EXPLAIN SELECT - RPAD(column1_utf8view, 1) as c1, - RPAD(column1_utf8view, 2, column2_utf8view) as c2 -FROM test; ----- -logical_plan -01)Projection: rpad(test.column1_utf8view, Int64(1)) AS c1, rpad(test.column1_utf8view, Int64(2), test.column2_utf8view) AS c2 -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -query TT -EXPLAIN SELECT - RPAD(column1_utf8view, 12, column2_large_utf8) as c1 -FROM test; ----- -logical_plan -01)Projection: rpad(test.column1_utf8view, Int64(12), test.column2_large_utf8) AS c1 -02)--TableScan: test projection=[column2_large_utf8, column1_utf8view] - -query TT -EXPLAIN SELECT - RPAD(column1_utf8view, 12, column2_utf8view) as c1 -FROM test; ----- -logical_plan -01)Projection: rpad(test.column1_utf8view, Int64(12), test.column2_utf8view) AS c1 -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -## Ensure no casts for SPLIT_PART -query TT -EXPLAIN SELECT - SPLIT_PART(column1_utf8view, 'f', 1) as c1, - SPLIT_PART('testtesttest',column1_utf8view, 1) as c2 -FROM test; ----- -logical_plan -01)Projection: split_part(test.column1_utf8view, Utf8("f"), Int64(1)) AS c1, split_part(Utf8("testtesttest"), test.column1_utf8view, Int64(1)) AS c2 -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for STRPOS -query TT -EXPLAIN SELECT - STRPOS(column1_utf8view, 'f') as c, - STRPOS(column1_utf8view, column2_utf8view) as c2 -FROM test; ----- 
-logical_plan -01)Projection: strpos(test.column1_utf8view, Utf8("f")) AS c, strpos(test.column1_utf8view, test.column2_utf8view) AS c2 -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -## Ensure no casts for SUBSTR -query TT -EXPLAIN SELECT - SUBSTR(column1_utf8view, 1) as c, - SUBSTR(column1_utf8view, 1 ,2) as c2 -FROM test; ----- -logical_plan -01)Projection: substr(test.column1_utf8view, Int64(1)) AS c, substr(test.column1_utf8view, Int64(1), Int64(2)) AS c2 -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for SUBSTRINDEX -query TT -EXPLAIN SELECT - SUBSTR_INDEX(column1_utf8view, 'a', 1) as c, - SUBSTR_INDEX(column1_utf8view, 'a', 2) as c2 -FROM test; ----- -logical_plan -01)Projection: substr_index(test.column1_utf8view, Utf8View("a"), Int64(1)) AS c, substr_index(test.column1_utf8view, Utf8View("a"), Int64(2)) AS c2 -02)--TableScan: test projection=[column1_utf8view] - -query TT -SELECT - SUBSTR_INDEX(column1_utf8view, 'a', 1) as c, - SUBSTR_INDEX(column1_utf8view, 'a', 2) as c2 -FROM test; ----- -Andrew Andrew -Xi Xiangpeng -R Raph -NULL NULL - -## Ensure no casts on columns for STARTS_WITH -query TT -EXPLAIN SELECT - STARTS_WITH(column1_utf8view, 'foo') as c, - STARTS_WITH(column1_utf8view, column2_utf8view) as c2 -FROM test; ----- -logical_plan -01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] - -## Ensure no casts for TRANSLATE -query TT -EXPLAIN SELECT - TRANSLATE(column1_utf8view, 'foo', 'bar') as c -FROM test; ----- -logical_plan -01)Projection: translate(test.column1_utf8view, Utf8("foo"), Utf8("bar")) AS c -02)--TableScan: test projection=[column1_utf8view] - -## Ensure no casts for FIND_IN_SET -query TT -EXPLAIN SELECT - FIND_IN_SET(column1_utf8view, 'a,b,c,d') as c -FROM test; ----- -logical_plan -01)Projection: find_in_set(test.column1_utf8view, 
Utf8View("a,b,c,d")) AS c -02)--TableScan: test projection=[column1_utf8view] - -query I -SELECT - FIND_IN_SET(column1_utf8view, 'a,b,c,d') as c -FROM test; ----- -0 -0 -0 -NULL - -# || mixed types -# expect all results to be the same for each row as they all have the same values -query TTTTTTTT -SELECT - column1_utf8view || column2_utf8view, - column1_utf8 || column2_utf8view, - column1_large_utf8 || column2_utf8view, - column1_dict || column2_utf8view, - -- reverse argument order - column2_utf8view || column1_utf8view, - column2_utf8view || column1_utf8, - column2_utf8view || column1_large_utf8, - column2_utf8view || column1_dict -FROM test; ----- -AndrewX AndrewX AndrewX AndrewX XAndrew XAndrew XAndrew XAndrew -XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng -RaphaelR RaphaelR RaphaelR RaphaelR RRaphael RRaphael RRaphael RRaphael -NULL NULL NULL NULL NULL NULL NULL NULL - -# || constants -# expect all results to be the same for each row as they all have the same values -query TTTTTTTT -SELECT - column1_utf8view || 'foo', - column1_utf8 || 'foo', - column1_large_utf8 || 'foo', - column1_dict || 'foo', - -- reverse argument order - 'foo' || column1_utf8view, - 'foo' || column1_utf8, - 'foo' || column1_large_utf8, - 'foo' || column1_dict -FROM test; ----- -Andrewfoo Andrewfoo Andrewfoo Andrewfoo fooAndrew fooAndrew fooAndrew fooAndrew -Xiangpengfoo Xiangpengfoo Xiangpengfoo Xiangpengfoo fooXiangpeng fooXiangpeng fooXiangpeng fooXiangpeng -Raphaelfoo Raphaelfoo Raphaelfoo Raphaelfoo fooRaphael fooRaphael fooRaphael fooRaphael -NULL NULL NULL NULL NULL NULL NULL NULL - -# || same type (column1 has null, so also tests NULL || NULL) -# expect all results to be the same for each row as they all have the same values -query TTT -SELECT - column1_utf8view || column1_utf8view, - column1_utf8 || column1_utf8, - column1_large_utf8 || column1_large_utf8 - -- 
Dictionary/Dictionary coercion doesn't work - -- https://github.com/apache/datafusion/issues/12101 - --column1_dict || column1_dict -FROM test; ----- -AndrewAndrew AndrewAndrew AndrewAndrew -XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng -RaphaelRaphael RaphaelRaphael RaphaelRaphael -NULL NULL NULL - -## Ensure no casts for binary operators -## TODO: https://github.com/apache/datafusion/issues/12180 -# `~` operator (regex match) -query TT -EXPLAIN SELECT - column1_utf8view ~ 'an' AS c1 -FROM test; ----- -logical_plan -01)Projection: CAST(test.column1_utf8view AS Utf8) LIKE Utf8("%an%") AS c1 -02)--TableScan: test projection=[column1_utf8view] - -query B -SELECT - column1_utf8view ~ 'an' AS c1 -FROM test; ----- -false -true -false -NULL - -# `~*` operator (regex match case-insensitive) -query TT -EXPLAIN SELECT - column1_utf8view ~* '^a.{3}e' AS c1 -FROM test; ----- -logical_plan -01)Projection: CAST(test.column1_utf8view AS Utf8) ~* Utf8("^a.{3}e") AS c1 -02)--TableScan: test projection=[column1_utf8view] - -query B -SELECT - column1_utf8view ~* '^a.{3}e' AS c1 -FROM test; ----- -true -false -false -NULL - -# `!~~` operator (not like match) -query TT -EXPLAIN SELECT - column1_utf8view !~~ 'xia_g%g' AS c1 -FROM test; ----- -logical_plan -01)Projection: CAST(test.column1_utf8view AS Utf8) !~~ Utf8("xia_g%g") AS c1 -02)--TableScan: test projection=[column1_utf8view] - -query B -SELECT - column1_utf8view !~~ 'xia_g%g' AS c1 -FROM test; ----- -true -true -true -NULL - -# `!~~*` operator (not like match case-insensitive) -query TT -EXPLAIN SELECT - column1_utf8view !~~* 'xia_g%g' AS c1 -FROM test; ----- -logical_plan -01)Projection: CAST(test.column1_utf8view AS Utf8) !~~* Utf8("xia_g%g") AS c1 -02)--TableScan: test projection=[column1_utf8view] - -query B -SELECT - column1_utf8view !~~* 'xia_g%g' AS c1 -FROM test; ----- -true -false -true -NULL - -statement ok -drop table test; - -# coercion from stringview to integer, as input to make_date -query D -select 
make_date(arrow_cast('2024', 'Utf8View'), arrow_cast('01', 'Utf8View'), arrow_cast('23', 'Utf8View')) ----- -2024-01-23 - -# coercions between stringview and date types -statement ok -create table dates (dt date) as values - (date '2024-01-23'), - (date '2023-11-30'); - -query D -select t.dt from dates t where arrow_cast('2024-01-01', 'Utf8View') < t.dt; ----- -2024-01-23 - -statement ok -drop table dates; - -### Tests for `||` with Utf8View specifically - -statement ok -create table temp as values -('value1', arrow_cast('rust', 'Utf8View'), arrow_cast('fast', 'Utf8View')), -('value2', arrow_cast('datafusion', 'Utf8View'), arrow_cast('cool', 'Utf8View')); - -query TTT -select arrow_typeof(column1), arrow_typeof(column2), arrow_typeof(column3) from temp; ----- -Utf8 Utf8View Utf8View -Utf8 Utf8View Utf8View - -query T -select column2||' is fast' from temp; ----- -rust is fast -datafusion is fast - -query T -select column2 || ' is ' || column3 from temp; ----- -rust is fast -datafusion is cool - -query TT -explain select column2 || 'is' || column3 from temp; ----- -logical_plan -01)Projection: temp.column2 || Utf8View("is") || temp.column3 AS temp.column2 || Utf8("is") || temp.column3 -02)--TableScan: temp projection=[column2, column3] - -# should not cast the column2 to utf8 -query TT -explain select column2||' is fast' from temp; ----- -logical_plan -01)Projection: temp.column2 || Utf8View(" is fast") AS temp.column2 || Utf8(" is fast") -02)--TableScan: temp projection=[column2] - - -query T -select column2||column3 from temp; ----- -rustfast -datafusioncool - -query TT -explain select column2||column3 from temp; ----- -logical_plan -01)Projection: temp.column2 || temp.column3 -02)--TableScan: temp projection=[column2, column3] - -query T -select column2|| ' ' ||column3 from temp; ----- -rust fast -datafusion cool From 064450fd883778752f30c12772b2be41c2b91897 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 23 Sep 2024 13:50:10 -0400 Subject: [PATCH 13/20] 
Avoid unnecessary unsafe --- datafusion/functions/src/string/common.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 4f70374b74e3..f796d10c26fa 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -204,10 +204,7 @@ fn trim_and_append_str<'a>( if let (Some(src_str), Some(characters)) = (src_str_opt, trim_characters_opt) { let trim_str = trim_func(src_str, characters); - // Safety: - // `trim_str` is computed from `str::trim_xxx_matches`, - // and its addr is ensured to be >= `origin_str`'s - let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 }; + let start = (src_str.as_bytes().len() - trim_str.as_bytes().len()) as u32; make_and_append_view(views_buf, null_builder, raw, trim_str, start); } else { From c2510de79192176abcb56ccf2d946fe2c5a1e369 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 24 Sep 2024 02:23:38 +0800 Subject: [PATCH 14/20] add unit test cases with a non-inlined string view output.
--- datafusion/functions/src/string/btrim.rs | 16 ++++++++++++++++ datafusion/functions/src/string/ltrim.rs | 16 ++++++++++++++++ datafusion/functions/src/string/rtrim.rs | 17 ++++++++++++++++- 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 05f9b227fb5d..0e992ff27fd3 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -124,6 +124,7 @@ mod tests { #[test] fn test_functions() { + // String view cases for checking normal logic test_function!( BTrimFunc::new(), &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( @@ -185,6 +186,21 @@ mod tests { Utf8View, StringViewArray ); + // Special string view case for checking non-inlined output (len > 12) + test_function!( + BTrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "xxxalphabetalphabetxxx" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("x")))), + ], + Ok(Some("alphabetalphabet")), + &str, + Utf8View, + StringViewArray + ); + // String cases test_function!( BTrimFunc::new(), &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 123ecc33ce2f..0ddb5a205bac 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -119,6 +119,7 @@ mod tests { #[test] fn test_functions() { + // String view cases for checking normal logic test_function!( LtrimFunc::new(), &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( @@ -180,6 +181,21 @@ mod tests { Utf8View, StringViewArray ); + // Special string view case for checking non-inlined output (len > 12) + test_function!( + LtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "xxxalphabetalphabet" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("x")))), + ], 
+ Ok(Some("alphabetalphabet")), + &str, + Utf8View, + StringViewArray + ); + // String cases test_function!( LtrimFunc::new(), &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index 51df77e402b0..a1aa5568babb 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -119,6 +119,7 @@ mod tests { #[test] fn test_functions() { + // String view cases for checking normal logic test_function!( RtrimFunc::new(), &[ColumnarValue::Scalar(ScalarValue::Utf8View(Some( @@ -180,7 +181,21 @@ mod tests { Utf8View, StringViewArray ); - + // Special string view case for checking non-inlined output (len > 12) + test_function!( + RtrimFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "alphabetalphabetxxx" + )))), + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("x")))), + ], + Ok(Some("alphabetalphabet")), + &str, + Utf8View, + StringViewArray + ); + // String cases test_function!( RtrimFunc::new(), &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( From 38790b21cebdc7fab402e4d7d240cf92d87edd21 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 24 Sep 2024 02:36:33 +0800 Subject: [PATCH 15/20] fix tests. 
--- datafusion/functions/src/string/common.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index f796d10c26fa..4f70374b74e3 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -204,7 +204,10 @@ fn trim_and_append_str<'a>( if let (Some(src_str), Some(characters)) = (src_str_opt, trim_characters_opt) { let trim_str = trim_func(src_str, characters); - let start = (src_str.as_bytes().len() - trim_str.as_bytes().len()) as u32; + // Safety: + // `trim_str` is computed from `str::trim_xxx_matches`, + // and its addr is ensured to be >= `origin_str`'s + let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 }; make_and_append_view(views_buf, null_builder, raw, trim_str, start); } else { From 20197d90b70af00c804f4a947fb00efb9a18d499 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 24 Sep 2024 02:39:13 +0800 Subject: [PATCH 16/20] improve comments. 
--- datafusion/functions/src/string/common.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 4f70374b74e3..817a8ebbeeb7 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -33,8 +33,11 @@ use datafusion_common::Result; use datafusion_common::{exec_err, ScalarValue}; use datafusion_expr::ColumnarValue; -/// Make a `u128` based on the given substr, start(offset to view.offset), and -/// push into to the given buffers +/// Append a new view to the views buffer with the given substr +/// +/// raw must be a valid view +/// substr must be a valid substring of raw +/// start must be less than or equal to the length of the string data pub(crate) fn make_and_append_view( views_buffer: &mut Vec, null_builder: &mut NullBufferBuilder, From 2112bc53575e4f90b9352b2f8cc8b4fed62999c3 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 24 Sep 2024 02:48:13 +0800 Subject: [PATCH 17/20] add todo and the related issue. 
--- datafusion/functions/src/string/common.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 817a8ebbeeb7..0e1dfcd94376 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -210,6 +210,10 @@ fn trim_and_append_str<'a>( // Safety: // `trim_str` is computed from `str::trim_xxx_matches`, // and its addr is ensured to be >= `origin_str`'s + // + // TODO: remove the unsafe codes once `Pattern` get stable, related issue: + // https://github.com/apache/datafusion/issues/12597 + // let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 }; make_and_append_view(views_buf, null_builder, raw, trim_str, start); From 790f7a93bd4b2c51b1119f849d0e086aa73c2168 Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 26 Sep 2024 00:44:05 +0800 Subject: [PATCH 18/20] use the safe way to get `start_offset` after trimming. --- datafusion/functions/src/string/common.rs | 67 ++++++++++++------- .../src/aggregates/group_values/row.rs | 1 + 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 0e1dfcd94376..46f92fcbdf3c 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -43,12 +43,16 @@ pub(crate) fn make_and_append_view( null_builder: &mut NullBufferBuilder, raw_view: &u128, substr: &str, - start: u32, + start_offset: u32, ) { let substr_len = substr.len(); let sub_view = if substr_len > 12 { let view = ByteView::from(*raw_view); - make_view(substr.as_bytes(), view.buffer_index, view.offset + start) + make_view( + substr.as_bytes(), + view.buffer_index, + view.offset + start_offset, + ) } else { // inline value does not need block id or offset make_view(substr.as_bytes(), 0, 0) @@ -78,21 +82,42 @@ pub(crate) fn general_trim( trim_type: TrimType, use_string_view: bool, ) 
-> Result { + // This is the function used to trim each string row, and it will return: + // - trimmed str + // e.g. ltrim("  abc") -> "abc" + // + // - start offset, needed in `string_view_trim` + // e.g. "abc" actually is "  abc"[2..], and the start offset here should be 2 + // let func = match trim_type { TrimType::Left => |input, pattern: &str| { let pattern = pattern.chars().collect::>(); - str::trim_start_matches::<&[char]>(input, pattern.as_ref()) + let ltrimmed_str = + str::trim_start_matches::<&[char]>(input, pattern.as_ref()); + // `ltrimmed_str` is actually `input`[start_offset..], so `start_offset`, + // so `start_offset` = len(`input`) - len(`ltrimmed_str`) + let start_offset = input.as_bytes().len() - ltrimmed_str.as_bytes().len(); + + (ltrimmed_str, start_offset as u32) }, TrimType::Right => |input, pattern: &str| { let pattern = pattern.chars().collect::>(); - str::trim_end_matches::<&[char]>(input, pattern.as_ref()) + let rtrimmed_str = str::trim_end_matches::<&[char]>(input, pattern.as_ref()); + + // `rtrimmed_str` is actually `input`[0..new_len], so `start_offset` is 0 + (rtrimmed_str, 0) }, TrimType::Both => |input, pattern: &str| { let pattern = pattern.chars().collect::>(); - str::trim_end_matches::<&[char]>( - str::trim_start_matches::<&[char]>(input, pattern.as_ref()), - pattern.as_ref(), - ) + let ltrimmed_str = + str::trim_start_matches::<&[char]>(input, pattern.as_ref()); + // `btrimmed_str` is actually rtrim(ltrim(`input`)), + // so its `start_offset` can be computed as ltrim one + let start_offset = input.as_bytes().len() - ltrimmed_str.as_bytes().len(); + let btrimmed_str = + str::trim_end_matches::<&[char]>(ltrimmed_str, pattern.as_ref()); + + (btrimmed_str, start_offset as u32) }, }; @@ -105,7 +130,7 @@ pub(crate) fn general_trim( // removing 'a will cause compiler complaining lifetime of `func` fn string_view_trim<'a>( - trim_func: fn(&'a str, &'a str) -> &'a str, + trim_func: fn(&'a str, &'a str) -> (&'a str, u32), args: &'a 
[ArrayRef], ) -> Result { let string_view_array = as_string_view_array(&args[0])?; @@ -199,24 +224,14 @@ fn string_view_trim<'a>( fn trim_and_append_str<'a>( src_str_opt: Option<&'a str>, trim_characters_opt: Option<&'a str>, - trim_func: fn(&'a str, &'a str) -> &'a str, + trim_func: fn(&'a str, &'a str) -> (&'a str, u32), views_buf: &mut Vec, null_builder: &mut NullBufferBuilder, raw: &u128, ) { if let (Some(src_str), Some(characters)) = (src_str_opt, trim_characters_opt) { - let trim_str = trim_func(src_str, characters); - - // Safety: - // `trim_str` is computed from `str::trim_xxx_matches`, - // and its addr is ensured to be >= `origin_str`'s - // - // TODO: remove the unsafe codes once `Pattern` get stable, related issue: - // https://github.com/apache/datafusion/issues/12597 - // - let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 }; - - make_and_append_view(views_buf, null_builder, raw, trim_str, start); + let (trim_str, start_offset) = trim_func(src_str, characters); + make_and_append_view(views_buf, null_builder, raw, trim_str, start_offset); } else { null_builder.append_null(); views_buf.push(0); @@ -224,7 +239,7 @@ fn trim_and_append_str<'a>( } fn string_trim<'a, T: OffsetSizeTrait>( - func: fn(&'a str, &'a str) -> &'a str, + func: fn(&'a str, &'a str) -> (&'a str, u32), args: &'a [ArrayRef], ) -> Result { let string_array = as_generic_string_array::(&args[0])?; @@ -233,7 +248,7 @@ fn string_trim<'a, T: OffsetSizeTrait>( 1 => { let result = string_array .iter() - .map(|string| string.map(|string: &str| func(string, " "))) + .map(|string| string.map(|string: &str| func(string, " ").0)) .collect::>(); Ok(Arc::new(result) as ArrayRef) @@ -252,7 +267,7 @@ fn string_trim<'a, T: OffsetSizeTrait>( let characters = characters_array.value(0); let result = string_array .iter() - .map(|item| item.map(|string| func(string, characters))) + .map(|item| item.map(|string| func(string, characters).0)) .collect::>(); return Ok(Arc::new(result) as 
ArrayRef); } @@ -261,7 +276,7 @@ fn string_trim<'a, T: OffsetSizeTrait>( .iter() .zip(characters_array.iter()) .map(|(string, characters)| match (string, characters) { - (Some(string), Some(characters)) => Some(func(string, characters)), + (Some(string), Some(characters)) => Some(func(string, characters).0), _ => None, }) .collect::>(); diff --git a/datafusion/physical-plan/src/aggregates/group_values/row.rs b/datafusion/physical-plan/src/aggregates/group_values/row.rs index dc948e28bb2d..6e8b12660aa0 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/row.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/row.rs @@ -82,6 +82,7 @@ impl GroupValuesRows { let map = RawTable::with_capacity(0); let starting_rows_capacity = 1000; + let starting_data_capacity = 64 * starting_rows_capacity; let rows_buffer = row_converter.empty_rows(starting_rows_capacity, starting_data_capacity); From f8174626e47d147e90c6715f5052ccfa269f0493 Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 26 Sep 2024 00:46:40 +0800 Subject: [PATCH 19/20] fix comments. 
--- datafusion/functions/src/string/common.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 46f92fcbdf3c..5c413b9c9afb 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -94,7 +94,7 @@ pub(crate) fn general_trim( let pattern = pattern.chars().collect::>(); let ltrimmed_str = str::trim_start_matches::<&[char]>(input, pattern.as_ref()); - // `ltrimmed_str` is actually `input`[start_offset..], so `start_offset`, + // `ltrimmed_str` is actually `input`[start_offset..], // so `start_offset` = len(`input`) - len(`ltrimmed_str`) let start_offset = input.as_bytes().len() - ltrimmed_str.as_bytes().len(); @@ -111,8 +111,8 @@ pub(crate) fn general_trim( let pattern = pattern.chars().collect::>(); let ltrimmed_str = str::trim_start_matches::<&[char]>(input, pattern.as_ref()); - // `btrimmed_str` is actually rtrim(ltrim(`input`)), - // so its `start_offset` can be computed as ltrim one + // `btrimmed_str` can be got by rtrim(ltrim(`input`)), + // so its `start_offset` should be same as ltrim situation above let start_offset = input.as_bytes().len() - ltrimmed_str.as_bytes().len(); let btrimmed_str = str::trim_end_matches::<&[char]>(ltrimmed_str, pattern.as_ref()); From f9c1543602073a242de01f3c5f5048bdcb3de1f7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 25 Sep 2024 14:54:24 -0400 Subject: [PATCH 20/20] Remove redundant test --- .../test_files/string/string_view.slt | 88 ------------------- 1 file changed, 88 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 6303caac60d3..e7b55c9c1c8c 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -982,93 +982,5 @@ logical_plan 01)Projection: temp.column2 || temp.column3 
02)--TableScan: temp projection=[column2, column3] -################################################ -# Test for string view trim -################################################ -# Test BTRIM outputs -query TTTT -SELECT - BTRIM(column1_utf8view, 'foo') AS l1, - BTRIM(column1_utf8view, 'A') AS l2, - BTRIM(column1_utf8view) AS l3, - BTRIM(column1_utf8view, NULL) AS l4 -FROM test; ----- -Andrew ndrew Andrew NULL -Xiangpeng Xiangpeng Xiangpeng NULL -Raphael Raphael Raphael NULL -NULL NULL NULL NULL - -query TTTT -SELECT - arrow_typeof(BTRIM(column1_utf8view, 'foo')) AS t1, - arrow_typeof(BTRIM(column1_utf8view, 'A')) AS t2, - arrow_typeof(BTRIM(column1_utf8view)) AS t3, - arrow_typeof(BTRIM(column1_utf8view, NULL)) AS t4 -FROM test; ----- -Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View - -# Test LTRIM outputs -query TTTTT -SELECT - LTRIM(column1_utf8view, 'foo') AS l1, - LTRIM(column1_utf8view, column2_utf8view) AS l2, - LTRIM(column1_utf8view) AS l3, - LTRIM(column1_utf8view, NULL) AS l4, - LTRIM(column1_utf8view, 'Xiang') AS l5 -FROM test; ----- -Andrew Andrew Andrew NULL Andrew -Xiangpeng (empty) Xiangpeng NULL peng -Raphael aphael Raphael NULL Raphael -NULL NULL NULL NULL NULL - -query TTTTT -SELECT - arrow_typeof(LTRIM(column1_utf8view, 'foo')) AS t1, - arrow_typeof(LTRIM(column1_utf8view, column2_utf8view)) AS t2, - arrow_typeof(LTRIM(column1_utf8view)) AS t3, - arrow_typeof(LTRIM(column1_utf8view, NULL)) AS t4, - arrow_typeof(LTRIM(column1_utf8view, 'Xiang')) AS t5 -FROM test; ----- -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View - -# Test RTRIM outputs -query TTTTT -SELECT - RTRIM(column1_utf8view, 'foo') AS l1, - RTRIM(column1_utf8view, column2_utf8view) AS l2, - RTRIM(column1_utf8view) AS l3, - RTRIM(column1_utf8view, NULL) 
AS l4, - RTRIM(column1_utf8view, 'peng') As l5 -FROM test; ----- -Andrew Andrew Andrew NULL Andrew -Xiangpeng (empty) Xiangpeng NULL Xia -Raphael Raphael Raphael NULL Raphael -NULL NULL NULL NULL NULL - -query TTTTT -SELECT - arrow_typeof(RTRIM(column1_utf8view, 'foo')) AS t1, - arrow_typeof(RTRIM(column1_utf8view, column2_utf8view)) AS t2, - arrow_typeof(RTRIM(column1_utf8view)) AS t3, - arrow_typeof(RTRIM(column1_utf8view, NULL)) AS t4, - arrow_typeof(RTRIM(column1_utf8view, 'peng')) As t5 -FROM test; ----- -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View -Utf8View Utf8View Utf8View Utf8View Utf8View - statement ok drop table test