15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
- use std:: { cmp:: min, sync:: Arc } ;
19
-
20
18
use arrow:: {
21
19
array:: {
22
- ArrayRef , AsArray , Decimal128Builder , Float32Array , Float64Array , GenericStringArray ,
23
- Int16Array , Int32Array , Int64Array , Int64Builder , Int8Array , OffsetSizeTrait ,
20
+ ArrayRef , AsArray , Decimal128Builder , Float32Array , Float64Array , Int16Array , Int32Array ,
21
+ Int64Array , Int64Builder , Int8Array , OffsetSizeTrait ,
24
22
} ,
25
23
datatypes:: { validate_decimal_precision, Decimal128Type , Int64Type } ,
26
24
} ;
25
+ use arrow_array:: builder:: GenericStringBuilder ;
27
26
use arrow_array:: { Array , ArrowNativeTypeOp , BooleanArray , Decimal128Array } ;
28
27
use arrow_schema:: { DataType , DECIMAL128_MAX_PRECISION } ;
29
28
use datafusion:: { functions:: math:: round:: round, physical_plan:: ColumnarValue } ;
@@ -35,7 +34,8 @@ use num::{
35
34
integer:: { div_ceil, div_floor} ,
36
35
BigInt , Signed , ToPrimitive ,
37
36
} ;
38
- use unicode_segmentation:: UnicodeSegmentation ;
37
+ use std:: fmt:: Write ;
38
+ use std:: { cmp:: min, sync:: Arc } ;
39
39
40
40
mod unhex;
41
41
pub use unhex:: spark_unhex;
@@ -387,52 +387,54 @@ pub fn spark_round(
387
387
}
388
388
389
389
/// Similar to DataFusion `rpad`, but not to truncate when the string is already longer than length
390
- pub fn spark_rpad ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue , DataFusionError > {
390
+ pub fn spark_read_side_padding ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue , DataFusionError > {
391
391
match args {
392
392
[ ColumnarValue :: Array ( array) , ColumnarValue :: Scalar ( ScalarValue :: Int32 ( Some ( length) ) ) ] => {
393
- match args [ 0 ] . data_type ( ) {
394
- DataType :: Utf8 => spark_rpad_internal :: < i32 > ( array, * length) ,
395
- DataType :: LargeUtf8 => spark_rpad_internal :: < i64 > ( array, * length) ,
393
+ match array . data_type ( ) {
394
+ DataType :: Utf8 => spark_read_side_padding_internal :: < i32 > ( array, * length) ,
395
+ DataType :: LargeUtf8 => spark_read_side_padding_internal :: < i64 > ( array, * length) ,
396
396
// TODO: handle Dictionary types
397
397
other => Err ( DataFusionError :: Internal ( format ! (
398
- "Unsupported data type {other:?} for function rpad " ,
398
+ "Unsupported data type {other:?} for function read_side_padding " ,
399
399
) ) ) ,
400
400
}
401
401
}
402
402
other => Err ( DataFusionError :: Internal ( format ! (
403
- "Unsupported arguments {other:?} for function rpad " ,
403
+ "Unsupported arguments {other:?} for function read_side_padding " ,
404
404
) ) ) ,
405
405
}
406
406
}
407
407
408
- fn spark_rpad_internal < T : OffsetSizeTrait > (
408
+ fn spark_read_side_padding_internal < T : OffsetSizeTrait > (
409
409
array : & ArrayRef ,
410
410
length : i32 ,
411
411
) -> Result < ColumnarValue , DataFusionError > {
412
412
let string_array = as_generic_string_array :: < T > ( array) ?;
413
+ let length = 0 . max ( length) as usize ;
414
+ let space_string = " " . repeat ( length) ;
415
+
416
+ let mut builder =
417
+ GenericStringBuilder :: < T > :: with_capacity ( string_array. len ( ) , string_array. len ( ) * length) ;
413
418
414
- let result = string_array
415
- . iter ( )
416
- . map ( |string| match string {
419
+ for string in string_array. iter ( ) {
420
+ match string {
417
421
Some ( string) => {
418
- let length = if length < 0 { 0 } else { length as usize } ;
419
- if length == 0 {
420
- Ok ( Some ( "" . to_string ( ) ) )
422
+ // It looks Spark's UTF8String is closer to chars rather than graphemes
423
+ // https://stackoverflow.com/a/46290728
424
+ let char_len = string. chars ( ) . count ( ) ;
425
+ if length <= char_len {
426
+ builder. append_value ( string) ;
421
427
} else {
422
- let graphemes = string. graphemes ( true ) . collect :: < Vec < & str > > ( ) ;
423
- if length < graphemes. len ( ) {
424
- Ok ( Some ( string. to_string ( ) ) )
425
- } else {
426
- let mut s = string. to_string ( ) ;
427
- s. push_str ( " " . repeat ( length - graphemes. len ( ) ) . as_str ( ) ) ;
428
- Ok ( Some ( s) )
429
- }
428
+ // write_str updates only the value buffer, not null nor offset buffer
429
+ // This is convenient for concatenating str(s)
430
+ builder. write_str ( string) ?;
431
+ builder. append_value ( & space_string[ char_len..] ) ;
430
432
}
431
433
}
432
- _ => Ok ( None ) ,
433
- } )
434
- . collect :: < Result < GenericStringArray < T > , DataFusionError > > ( ) ? ;
435
- Ok ( ColumnarValue :: Array ( Arc :: new ( result ) ) )
434
+ _ => builder . append_null ( ) ,
435
+ }
436
+ }
437
+ Ok ( ColumnarValue :: Array ( Arc :: new ( builder . finish ( ) ) ) )
436
438
}
437
439
438
440
// Let Decimal(p3, s3) as return type i.e. Decimal(p1, s1) / Decimal(p2, s2) = Decimal(p3, s3).
0 commit comments