Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement string_to_array #7577

Merged
merged 3 commits into from
Sep 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ pub enum BuiltinScalarFunction {
SHA512,
/// split_part
SplitPart,
/// string_to_array
StringToArray,
/// starts_with
StartsWith,
/// strpos
Expand Down Expand Up @@ -426,6 +428,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::SHA512 => Volatility::Immutable,
BuiltinScalarFunction::Digest => Volatility::Immutable,
BuiltinScalarFunction::SplitPart => Volatility::Immutable,
BuiltinScalarFunction::StringToArray => Volatility::Immutable,
BuiltinScalarFunction::StartsWith => Volatility::Immutable,
BuiltinScalarFunction::Strpos => Volatility::Immutable,
BuiltinScalarFunction::Substr => Volatility::Immutable,
Expand Down Expand Up @@ -711,6 +714,11 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::SplitPart => {
utf8_to_str_type(&input_expr_types[0], "split_part")
}
BuiltinScalarFunction::StringToArray => Ok(List(Arc::new(Field::new(
"item",
input_expr_types[0].clone(),
true,
)))),
BuiltinScalarFunction::StartsWith => Ok(Boolean),
BuiltinScalarFunction::Strpos => {
utf8_to_int_type(&input_expr_types[0], "strpos")
Expand Down Expand Up @@ -1067,7 +1075,13 @@ impl BuiltinScalarFunction {
],
self.volatility(),
),

BuiltinScalarFunction::StringToArray => Signature::one_of(
vec![
TypeSignature::Uniform(2, vec![Utf8, LargeUtf8]),
TypeSignature::Uniform(3, vec![Utf8, LargeUtf8]),
],
self.volatility(),
),
BuiltinScalarFunction::Strpos | BuiltinScalarFunction::StartsWith => {
Signature::one_of(
vec![
Expand Down Expand Up @@ -1277,6 +1291,7 @@ fn aliases(func: &BuiltinScalarFunction) -> &'static [&'static str] {
BuiltinScalarFunction::Rpad => &["rpad"],
BuiltinScalarFunction::Rtrim => &["rtrim"],
BuiltinScalarFunction::SplitPart => &["split_part"],
BuiltinScalarFunction::StringToArray => &["string_to_array", "string_to_list"],
BuiltinScalarFunction::StartsWith => &["starts_with"],
BuiltinScalarFunction::Strpos => &["strpos"],
BuiltinScalarFunction::Substr => &["substr"],
Expand Down
2 changes: 2 additions & 0 deletions datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,7 @@ scalar_expr!(SHA256, sha256, string, "SHA-256 hash");
scalar_expr!(SHA384, sha384, string, "SHA-384 hash");
scalar_expr!(SHA512, sha512, string, "SHA-512 hash");
scalar_expr!(SplitPart, split_part, string delimiter index, "splits a string based on a delimiter and picks out the desired field based on the index.");
scalar_expr!(StringToArray, string_to_array, string delimiter null_string, "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`");
scalar_expr!(StartsWith, starts_with, string prefix, "whether the `string` starts with the `prefix`");
scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`");
scalar_expr!(Substr, substr, string position, "substring from the `position` to the end");
Expand Down Expand Up @@ -1080,6 +1081,7 @@ mod test {
test_scalar_expr!(SHA384, sha384, string);
test_scalar_expr!(SHA512, sha512, string);
test_scalar_expr!(SplitPart, split_part, expr, delimiter, index);
test_scalar_expr!(StringToArray, string_to_array, expr, delimiter, null_value);
test_scalar_expr!(StartsWith, starts_with, string, characters);
test_scalar_expr!(Strpos, strpos, string, substring);
test_scalar_expr!(Substr, substr, string, position);
Expand Down
89 changes: 89 additions & 0 deletions datafusion/physical-expr/src/array_expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1864,6 +1864,95 @@ pub fn array_has_all(args: &[ArrayRef]) -> Result<ArrayRef> {
Ok(Arc::new(boolean_builder.finish()))
}

/// Splits string at occurrences of delimiter and returns an array of parts
/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]'
pub fn string_to_array<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_array = as_generic_string_array::<T>(&args[0])?;
let delimiter_array = as_generic_string_array::<T>(&args[1])?;

let mut list_builder = ListBuilder::new(StringBuilder::with_capacity(
string_array.len(),
string_array.get_buffer_memory_size(),
));

match args.len() {
2 => {
string_array.iter().zip(delimiter_array.iter()).for_each(
|(string, delimiter)| {
match (string, delimiter) {
(Some(string), Some("")) => {
list_builder.values().append_value(string);
list_builder.append(true);
}
(Some(string), Some(delimiter)) => {
string.split(delimiter).for_each(|s| {
list_builder.values().append_value(s);
});
list_builder.append(true);
}
(Some(string), None) => {
string.chars().map(|c| c.to_string()).for_each(|c| {
list_builder.values().append_value(c);
});
list_builder.append(true);
}
_ => list_builder.append(false), // null value
}
},
);
}

3 => {
let null_value_array = as_generic_string_array::<T>(&args[2])?;
string_array
.iter()
.zip(delimiter_array.iter())
.zip(null_value_array.iter())
.for_each(|((string, delimiter), null_value)| {
match (string, delimiter) {
(Some(string), Some("")) => {
if Some(string) == null_value {
list_builder.values().append_null();
} else {
list_builder.values().append_value(string);
}
list_builder.append(true);
}
(Some(string), Some(delimiter)) => {
string.split(delimiter).for_each(|s| {
if Some(s) == null_value {
list_builder.values().append_null();
} else {
list_builder.values().append_value(s);
}
});
list_builder.append(true);
}
(Some(string), None) => {
string.chars().map(|c| c.to_string()).for_each(|c| {
if Some(c.as_str()) == null_value {
list_builder.values().append_null();
} else {
list_builder.values().append_value(c);
}
});
list_builder.append(true);
}
_ => list_builder.append(false), // null value
}
});
}
_ => {
return internal_err!(
"Expect string_to_array function to take two or three parameters"
)
}
}

let list_array = list_builder.finish();
Ok(Arc::new(list_array) as ArrayRef)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
15 changes: 15 additions & 0 deletions datafusion/physical-expr/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,21 @@ pub fn create_physical_fun(
internal_err!("Unsupported data type {other:?} for function split_part")
}
}),
BuiltinScalarFunction::StringToArray => {
Arc::new(|args| match args[0].data_type() {
DataType::Utf8 => {
make_scalar_function(array_expressions::string_to_array::<i32>)(args)
}
DataType::LargeUtf8 => {
make_scalar_function(array_expressions::string_to_array::<i64>)(args)
}
other => {
internal_err!(
"Unsupported data type {other:?} for function string_to_array"
)
}
})
}
BuiltinScalarFunction::StartsWith => Arc::new(|args| match args[0].data_type() {
DataType::Utf8 => {
make_scalar_function(string_expressions::starts_with::<i32>)(args)
Expand Down
1 change: 1 addition & 0 deletions datafusion/proto/proto/datafusion.proto
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,7 @@ enum ScalarFunction {
Iszero = 114;
ArrayEmpty = 115;
ArrayPopBack = 116;
StringToArray = 117;
}

message ScalarFunctionNode {
Expand Down
3 changes: 3 additions & 0 deletions datafusion/proto/src/generated/pbjson.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions datafusion/proto/src/generated/prost.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/proto/src/logical_plan/from_proto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction {
ScalarFunction::Right => Self::Right,
ScalarFunction::Rpad => Self::Rpad,
ScalarFunction::SplitPart => Self::SplitPart,
ScalarFunction::StringToArray => Self::StringToArray,
ScalarFunction::StartsWith => Self::StartsWith,
ScalarFunction::Strpos => Self::Strpos,
ScalarFunction::Substr => Self::Substr,
Expand Down
1 change: 1 addition & 0 deletions datafusion/proto/src/logical_plan/to_proto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1518,6 +1518,7 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction {
BuiltinScalarFunction::Right => Self::Right,
BuiltinScalarFunction::Rpad => Self::Rpad,
BuiltinScalarFunction::SplitPart => Self::SplitPart,
BuiltinScalarFunction::StringToArray => Self::StringToArray,
BuiltinScalarFunction::StartsWith => Self::StartsWith,
BuiltinScalarFunction::Strpos => Self::Strpos,
BuiltinScalarFunction::Substr => Self::Substr,
Expand Down
46 changes: 46 additions & 0 deletions datafusion/sqllogictest/test_files/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -2485,6 +2485,52 @@ NULL
false
false

query ?
SELECT string_to_array('abcxxxdef', 'xxx')
----
[abc, def]

query ?
SELECT string_to_array('abc', '')
----
[abc]

query ?
SELECT string_to_array('abc', NULL)
----
[a, b, c]

query ?
SELECT string_to_array('abc def', ' ', 'def')
----
[abc, ]

query ?
select string_to_array(e, ',') from values;
----
[Lorem]
[ipsum]
[dolor]
[sit]
[amet]
[, ]
[consectetur]
[adipiscing]
NULL

query ?
select string_to_list(e, 'm') from values;
----
[Lore, ]
[ipsu, ]
[dolor]
[sit]
[a, et]
[,]
[consectetur]
[adipiscing]
NULL

### Delete tables

statement ok
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/functions.slt
Original file line number Diff line number Diff line change
Expand Up @@ -810,4 +810,4 @@ SELECT products.* REPLACE (price*2 AS price, product_id+1000 AS product_id) FROM
1001 OldBrand Product 1 39.98
1002 OldBrand Product 2 59.98
1003 OldBrand Product 3 79.98
1004 OldBrand Product 4 99.98
1004 OldBrand Product 4 99.98
24 changes: 24 additions & 0 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -1523,6 +1523,8 @@ from_unixtime(expression)
- [list_to_string](#list_to_string)
- [make_array](#make_array)
- [make_list](#make_list)
- [string_to_array](#string_to_array)
- [string_to_list](#string_to_list)
- [trim_array](#trim_array)

### `array_append`
Expand Down Expand Up @@ -2369,6 +2371,28 @@ make_array(expression1[, ..., expression_n])

_Alias of [make_array](#make_array)._

### `string_to_array`

Splits a string in to an array of substrings based on a delimiter. Any substrings matching the optional `null_str` argument are replaced with NULL.

```
starts_with(str, delimiter[, null_str])
```

#### Arguments

- **str**: String expression to split.
- **delimiter**: Delimiter string to split on.
- **null_str**: Substring values to be replaced with `NULL`

#### Aliases

- string_to_list

### `string_to_list`

_Alias of [string_to_array](#string_to_array)._

### `trim_array`

Removes the last n elements from the array.
Expand Down