-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve performance of trim
for string view (10%)
#12395
Changes from 15 commits
736eb11
a0da2d0
3c8b035
06d104d
48cb4db
863e9b7
36a8125
aa2c131
e3e9b53
dbd0f25
6d5660f
65ef988
4129a43
522c87f
840ec46
307850a
064450f
c2510de
38790b2
20197d9
2112bc5
790f7a9
f817462
148a991
f9c1543
e39f916
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -21,17 +21,39 @@ use std::fmt::{Display, Formatter}; | |||||
use std::sync::Arc; | ||||||
|
||||||
use arrow::array::{ | ||||||
new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, | ||||||
GenericStringArray, GenericStringBuilder, LargeStringArray, OffsetSizeTrait, | ||||||
StringArray, StringBuilder, StringViewArray, StringViewBuilder, | ||||||
make_view, new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, | ||||||
ArrayRef, ByteView, GenericStringArray, GenericStringBuilder, LargeStringArray, | ||||||
OffsetSizeTrait, StringArray, StringBuilder, StringViewArray, StringViewBuilder, | ||||||
}; | ||||||
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; | ||||||
use arrow::datatypes::DataType; | ||||||
use arrow_buffer::{NullBufferBuilder, ScalarBuffer}; | ||||||
use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; | ||||||
use datafusion_common::Result; | ||||||
use datafusion_common::{exec_err, ScalarValue}; | ||||||
use datafusion_expr::ColumnarValue; | ||||||
|
||||||
/// Make a `u128` based on the given substr, start(offset to view.offset), and | ||||||
/// push it into the given buffers | ||||||
Rachelint marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
pub(crate) fn make_and_append_view( | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤔 I wonder if we should (as a follow on PR) propose adding this upstream to arrow-rs as it seems valuable for any trim related kernels on stringview There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It sounds great! and #12383 (comment) can be solved if it is function in arrow-rs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like what would be really useful is a I started to write a ticket in arrow-rs but I didn't know exactly what API to suggest. I think we would have to try it out |
||||||
views_buffer: &mut Vec<u128>, | ||||||
null_builder: &mut NullBufferBuilder, | ||||||
raw_view: &u128, | ||||||
substr: &str, | ||||||
start: u32, | ||||||
) { | ||||||
let substr_len = substr.len(); | ||||||
let sub_view = if substr_len > 12 { | ||||||
let view = ByteView::from(*raw_view); | ||||||
make_view(substr.as_bytes(), view.buffer_index, view.offset + start) | ||||||
} else { | ||||||
// inline value does not need block id or offset | ||||||
make_view(substr.as_bytes(), 0, 0) | ||||||
}; | ||||||
views_buffer.push(sub_view); | ||||||
null_builder.append_non_null(); | ||||||
} | ||||||
|
||||||
pub(crate) enum TrimType { | ||||||
Left, | ||||||
Right, | ||||||
|
@@ -72,65 +94,126 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>( | |||||
}; | ||||||
|
||||||
if use_string_view { | ||||||
string_view_trim::<T>(func, args) | ||||||
string_view_trim(func, args) | ||||||
} else { | ||||||
string_trim::<T>(func, args) | ||||||
} | ||||||
} | ||||||
|
||||||
// removing 'a will cause the compiler to complain about the lifetime of `func` | ||||||
fn string_view_trim<'a, T: OffsetSizeTrait>( | ||||||
func: fn(&'a str, &'a str) -> &'a str, | ||||||
fn string_view_trim<'a>( | ||||||
trim_func: fn(&'a str, &'a str) -> &'a str, | ||||||
args: &'a [ArrayRef], | ||||||
) -> Result<ArrayRef> { | ||||||
let string_array = as_string_view_array(&args[0])?; | ||||||
let string_view_array = as_string_view_array(&args[0])?; | ||||||
let mut views_buf = Vec::with_capacity(string_view_array.len()); | ||||||
let mut null_builder = NullBufferBuilder::new(string_view_array.len()); | ||||||
alamb marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
match args.len() { | ||||||
1 => { | ||||||
let result = string_array | ||||||
.iter() | ||||||
.map(|string| string.map(|string: &str| func(string, " "))) | ||||||
.collect::<GenericStringArray<T>>(); | ||||||
|
||||||
Ok(Arc::new(result) as ArrayRef) | ||||||
let array_iter = string_view_array.iter(); | ||||||
let views_iter = string_view_array.views().iter(); | ||||||
for (src_str_opt, raw_view) in array_iter.zip(views_iter) { | ||||||
trim_and_append_str( | ||||||
src_str_opt, | ||||||
Some(" "), | ||||||
trim_func, | ||||||
&mut views_buf, | ||||||
&mut null_builder, | ||||||
raw_view, | ||||||
); | ||||||
} | ||||||
} | ||||||
2 => { | ||||||
let characters_array = as_string_view_array(&args[1])?; | ||||||
|
||||||
if characters_array.len() == 1 { | ||||||
// Only one `trim characters` exist | ||||||
if characters_array.is_null(0) { | ||||||
return Ok(new_null_array( | ||||||
// The schema is expecting utf8 as null | ||||||
&DataType::Utf8, | ||||||
string_array.len(), | ||||||
&DataType::Utf8View, | ||||||
string_view_array.len(), | ||||||
)); | ||||||
} | ||||||
|
||||||
let characters = characters_array.value(0); | ||||||
let result = string_array | ||||||
.iter() | ||||||
.map(|item| item.map(|string| func(string, characters))) | ||||||
.collect::<GenericStringArray<T>>(); | ||||||
return Ok(Arc::new(result) as ArrayRef); | ||||||
let array_iter = string_view_array.iter(); | ||||||
let views_iter = string_view_array.views().iter(); | ||||||
for (src_str_opt, raw_view) in array_iter.zip(views_iter) { | ||||||
trim_and_append_str( | ||||||
src_str_opt, | ||||||
Some(characters), | ||||||
trim_func, | ||||||
&mut views_buf, | ||||||
&mut null_builder, | ||||||
raw_view, | ||||||
); | ||||||
} | ||||||
} else { | ||||||
// A specific `trim characters` for a row in the string view array | ||||||
let characters_iter = characters_array.iter(); | ||||||
let array_iter = string_view_array.iter(); | ||||||
let views_iter = string_view_array.views().iter(); | ||||||
for ((src_str_opt, raw_view), characters_opt) in | ||||||
array_iter.zip(views_iter).zip(characters_iter) | ||||||
{ | ||||||
trim_and_append_str( | ||||||
src_str_opt, | ||||||
characters_opt, | ||||||
trim_func, | ||||||
&mut views_buf, | ||||||
&mut null_builder, | ||||||
raw_view, | ||||||
); | ||||||
} | ||||||
} | ||||||
|
||||||
let result = string_array | ||||||
.iter() | ||||||
.zip(characters_array.iter()) | ||||||
.map(|(string, characters)| match (string, characters) { | ||||||
(Some(string), Some(characters)) => Some(func(string, characters)), | ||||||
_ => None, | ||||||
}) | ||||||
.collect::<GenericStringArray<T>>(); | ||||||
|
||||||
Ok(Arc::new(result) as ArrayRef) | ||||||
} | ||||||
other => { | ||||||
exec_err!( | ||||||
return exec_err!( | ||||||
"Function TRIM was called with {other} arguments. It requires at least 1 and at most 2." | ||||||
) | ||||||
); | ||||||
} | ||||||
} | ||||||
|
||||||
let views_buf = ScalarBuffer::from(views_buf); | ||||||
let nulls_buf = null_builder.finish(); | ||||||
|
||||||
// Safety: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again I think using StringBuilder here might improve performance There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I am seeking way to make It is really meaningful to support this ability for more There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Related discussion: apache/arrow-rs#6430 |
||||||
// (1) The blocks of the given views are all provided | ||||||
// (2) Each of the range `view.offset+start..end` of view in views_buf is within | ||||||
// the bounds of each of the blocks | ||||||
unsafe { | ||||||
let array = StringViewArray::new_unchecked( | ||||||
views_buf, | ||||||
string_view_array.data_buffers().to_vec(), | ||||||
nulls_buf, | ||||||
); | ||||||
Ok(Arc::new(array) as ArrayRef) | ||||||
} | ||||||
} | ||||||
|
||||||
fn trim_and_append_str<'a>( | ||||||
src_str_opt: Option<&'a str>, | ||||||
trim_characters_opt: Option<&'a str>, | ||||||
trim_func: fn(&'a str, &'a str) -> &'a str, | ||||||
views_buf: &mut Vec<u128>, | ||||||
null_builder: &mut NullBufferBuilder, | ||||||
raw: &u128, | ||||||
) { | ||||||
if let (Some(src_str), Some(characters)) = (src_str_opt, trim_characters_opt) { | ||||||
let trim_str = trim_func(src_str, characters); | ||||||
|
||||||
// Safety: | ||||||
// `trim_str` is computed from `str::trim_xxx_matches`, | ||||||
// and its addr is ensured to be >= `src_str`'s | ||||||
let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 }; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you need the length of this in bytes, I don't think you need unsafe here How about this:
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I want to get the
Maybe we can only make it using unsafe temporarily #12387 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I ran this diff: diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs
index 4f70374b7..f796d10c2 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -204,10 +204,7 @@ fn trim_and_append_str<'a>(
if let (Some(src_str), Some(characters)) = (src_str_opt, trim_characters_opt) {
let trim_str = trim_func(src_str, characters);
- // Safety:
- // `trim_str` is computed from `str::trim_xxx_matches`,
- // and its addr is ensured to be >= `origin_str`'s
- let start = unsafe { trim_str.as_ptr().offset_from(src_str.as_ptr()) as u32 };
+ let start = (src_str.as_bytes().len() - trim_str.as_bytes().len()) as u32;
make_and_append_view(views_buf, null_builder, raw, trim_str, start);
     } else { And all tests passed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry for my carelessness, I have added new unit test cases with the non-inlined string view output (len > 12). I guess maybe we can't remove the unsafe code until the feature I will file an issue to check it. |
||||||
|
||||||
make_and_append_view(views_buf, null_builder, raw, trim_str, start); | ||||||
} else { | ||||||
null_builder.append_null(); | ||||||
views_buf.push(0); | ||||||
} | ||||||
} | ||||||
|
||||||
fn string_trim<'a, T: OffsetSizeTrait>( | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍
Also eventually it would also be possible to return
Utf8View
when the input was Utf8
and save a copy as well