Skip to content

Commit

Permalink
Speed up chr UDF (~4x faster) (#14700)
Browse files Browse the repository at this point in the history
* add chr bench

* speed up chr

* 1 byte assumption
  • Loading branch information
simonvandel authored Feb 19, 2025
1 parent 481515e commit 6a036ae
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 20 deletions.
5 changes: 5 additions & 0 deletions datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ harness = false
name = "encoding"
required-features = ["encoding_expressions"]

[[bench]]
harness = false
name = "chr"
required-features = ["string_expressions"]

[[bench]]
harness = false
name = "uuid"
Expand Down
52 changes: 52 additions & 0 deletions datafusion/functions/benches/chr.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

extern crate criterion;

use arrow::{array::PrimitiveArray, datatypes::Int64Type, util::test_util::seedable_rng};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::ColumnarValue;
use datafusion_functions::string::chr;
use rand::Rng;

use std::sync::Arc;

/// Benchmarks the `chr` UDF on a single Int64 array argument.
///
/// The input holds 1024 rows produced from the seedable test RNG: each row is
/// null with probability 0.2, otherwise an integer drawn from `1..10_000`.
fn criterion_benchmark(c: &mut Criterion) {
    const NULL_DENSITY: f32 = 0.2;

    let chr_fn = chr();
    let size = 1024;

    let mut rng = seedable_rng();
    let input: PrimitiveArray<Int64Type> = (0..size)
        .map(|_| {
            // Draw the null decision first, then (only for non-null rows) the
            // value, so the sequence of RNG calls stays the same.
            if rng.gen::<f32>() < NULL_DENSITY {
                return None;
            }
            Some(rng.gen_range::<i64, _>(1i64..10_000))
        })
        .collect();

    let args = vec![ColumnarValue::Array(Arc::new(input))];

    c.bench_function("chr", |bencher| {
        bencher.iter(|| {
            let output = chr_fn.invoke_batch(&args, size).unwrap();
            black_box(output)
        })
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
53 changes: 33 additions & 20 deletions datafusion/functions/src/string/chr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use std::any::Any;
use std::sync::Arc;

use arrow::array::ArrayRef;
use arrow::array::StringArray;
use arrow::array::GenericStringBuilder;
use arrow::datatypes::DataType;
use arrow::datatypes::DataType::Int64;
use arrow::datatypes::DataType::Utf8;
Expand All @@ -36,26 +36,39 @@ use datafusion_macros::user_doc;
pub fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
let integer_array = as_int64_array(&args[0])?;

// first map is the iterator, second is for the `Option<_>`
let result = integer_array
.iter()
.map(|integer: Option<i64>| {
integer
.map(|integer| {
if integer == 0 {
exec_err!("null character not permitted.")
} else {
match core::char::from_u32(integer as u32) {
Some(integer) => Ok(integer.to_string()),
None => {
exec_err!("requested character too large for encoding.")
}
let mut builder = GenericStringBuilder::<i32>::with_capacity(
integer_array.len(),
// 1 byte per character, assuming that is the common case
integer_array.len(),
);

let mut buf = [0u8; 4];

for integer in integer_array {
match integer {
Some(integer) => {
if integer == 0 {
return exec_err!("null character not permitted.");
} else {
match core::char::from_u32(integer as u32) {
Some(c) => {
builder.append_value(c.encode_utf8(&mut buf));
}
None => {
return exec_err!(
"requested character too large for encoding."
);
}
}
})
.transpose()
})
.collect::<Result<StringArray>>()?;
}
}
None => {
builder.append_null();
}
}
}

let result = builder.finish();

Ok(Arc::new(result) as ArrayRef)
}
Expand All @@ -70,7 +83,7 @@ pub fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
| chr(Int64(128640)) |
+--------------------+
| 🚀 |
+--------------------+
+--------------------+
```"#,
standard_argument(name = "expression", prefix = "String"),
related_udf(name = "ascii")
Expand Down

0 comments on commit 6a036ae

Please sign in to comment.