From cd7504933c48cc009a887c8d6ea4fb6a837dad30 Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Mon, 24 Jan 2022 16:13:35 +0300 Subject: [PATCH] fix: substr - correct behaivour with negative start pos --- datafusion/src/physical_plan/functions.rs | 67 +++++++++++++++++++ .../src/physical_plan/unicode_expressions.rs | 28 +++++--- 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index ccd355d39b7e4..47cda3e5f05dd 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -3582,6 +3582,18 @@ mod tests { StringArray ); #[cfg(feature = "unicode_expressions")] + test_function!( + Substr, + &[ + lit(ScalarValue::Utf8(Some("joséésoj".to_string()))), + lit(ScalarValue::Int64(Some(-5))), + ], + Ok(Some("joséésoj")), + &str, + Utf8, + StringArray + ); + #[cfg(feature = "unicode_expressions")] test_function!( Substr, &[ @@ -3680,6 +3692,61 @@ mod tests { StringArray ); #[cfg(feature = "unicode_expressions")] + test_function!( + Substr, + &[ + lit(ScalarValue::Utf8(Some("alphabet".to_string()))), + lit(ScalarValue::Int64(Some(0))), + lit(ScalarValue::Int64(Some(5))), + ], + Ok(Some("alph")), + &str, + Utf8, + StringArray + ); + // starting from 5 (10 + -5) + #[cfg(feature = "unicode_expressions")] + test_function!( + Substr, + &[ + lit(ScalarValue::Utf8(Some("alphabet".to_string()))), + lit(ScalarValue::Int64(Some(-5))), + lit(ScalarValue::Int64(Some(10))), + ], + Ok(Some("alph")), + &str, + Utf8, + StringArray + ); + // starting from -1 (4 + -5) + #[cfg(feature = "unicode_expressions")] + test_function!( + Substr, + &[ + lit(ScalarValue::Utf8(Some("alphabet".to_string()))), + lit(ScalarValue::Int64(Some(-5))), + lit(ScalarValue::Int64(Some(4))), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + // starting from 0 (5 + -5) + #[cfg(feature = "unicode_expressions")] + test_function!( + Substr, + &[ + lit(ScalarValue::Utf8(Some("alphabet".to_string()))), + lit(ScalarValue::Int64(Some(-5))), + lit(ScalarValue::Int64(Some(5))), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); + #[cfg(feature = "unicode_expressions")] test_function!( Substr, &[ diff --git a/datafusion/src/physical_plan/unicode_expressions.rs b/datafusion/src/physical_plan/unicode_expressions.rs index 3852fd7c931fa..3982f3ea0e1d6 100644 --- a/datafusion/src/physical_plan/unicode_expressions.rs +++ b/datafusion/src/physical_plan/unicode_expressions.rs @@ -455,21 +455,29 @@ pub fn substr(args: &[ArrayRef]) -> Result { Err(DataFusionError::Execution( "negative substring length not allowed".to_string(), )) - } else if start <= 0 { - Ok(Some(string.to_string())) } else { let graphemes = string.graphemes(true).collect::>(); - let start_pos = start as usize - 1; - let count_usize = count as usize; - if graphemes.len() < start_pos { + let (start_pos, end_pos) = if start <= 0 { + let end_pos = start + count - 1; + ( + 0 as usize, + if end_pos < 0 { + // we use 0 as workaround for usize to return empty string + 0 + } else { + end_pos as usize + }, + ) + } else { + ((start - 1) as usize, (start + count - 1) as usize) + }; + + if end_pos == 0 || graphemes.len() < start_pos { Ok(Some("".to_string())) - } else if graphemes.len() < start_pos + count_usize { + } else if graphemes.len() < end_pos { Ok(Some(graphemes[start_pos..].concat())) } else { - Ok(Some( - graphemes[start_pos..start_pos + count_usize] - .concat(), - )) + Ok(Some(graphemes[start_pos..end_pos].concat())) } } }