From 8ba78427ef2fea52ffabe91104b74b17906b3772 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 24 Nov 2022 09:47:21 +0000 Subject: [PATCH] Faster BinaryArray to StringArray conversion (#3168) * Faster ByteArray to StringArray conversion * Add benchmark * Fix logical conflict --- arrow-array/src/array/string_array.rs | 16 +++++++++++++++- arrow/benches/array_data_validate.rs | 6 ++++++ arrow/src/row/mod.rs | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 8d92093f5ce8..fb3bb23179b5 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -216,8 +216,22 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>> for GenericStringArray<OffsetSize> { fn from(v: GenericBinaryArray<OffsetSize>) -> Self { + let offsets = v.value_offsets(); + let values = v.value_data(); + + // We only need to validate that all values are valid UTF-8 + let validated = std::str::from_utf8(values).expect("Invalid UTF-8 sequence"); + for offset in offsets.iter() { + assert!( + validated.is_char_boundary(offset.as_usize()), + "Invalid UTF-8 sequence" + ) + } + let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE); - Self::from(builder.build().unwrap()) + // SAFETY: + // Validated UTF-8 above + Self::from(unsafe { builder.build_unchecked() }) } } diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs index 3cd13c09c58a..3b0fdbe63c97 100644 --- a/arrow/benches/array_data_validate.rs +++ b/arrow/benches/array_data_validate.rs @@ -52,6 +52,12 @@ fn validate_benchmark(c: &mut Criterion) { c.bench_function("validate_utf8_array_data 20000", |b| { b.iter(|| validate_utf8_array(&str_arr)) }); + + let byte_array = + BinaryArray::from_iter_values(std::iter::repeat(b"test").take(20000)); + c.bench_function("byte_array_to_string_array 20000", |b| { + b.iter(|| StringArray::from(BinaryArray::from(byte_array.data().clone()))) +
}); } criterion_group!(benches, validate_benchmark); diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 058c35869d20..6ce9f2b12c25 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -1425,7 +1425,7 @@ mod tests { } #[test] - #[should_panic(expected = "Invalid UTF8 sequence at string")] + #[should_panic(expected = "Invalid UTF-8 sequence")] fn test_invalid_utf8() { let mut converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap();