From 9d7955ffe955ff89c61abf8703b1dc8495e11697 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 24 Jan 2025 01:18:10 +0100 Subject: [PATCH 1/5] refactor(rust): Remove last instances of itoa --- Cargo.lock | 12 +----------- Cargo.toml | 1 - crates/polars-arrow/Cargo.toml | 3 +-- crates/polars-arrow/src/compute/decimal.rs | 4 +--- crates/polars-time/Cargo.toml | 2 +- .../polars-time/src/chunkedarray/string/strptime.rs | 7 +++---- 6 files changed, 7 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5d1542a9fb99..23f5306f8a4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -220,15 +220,6 @@ dependencies = [ "syn 2.0.94", ] -[[package]] -name = "atoi" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" -dependencies = [ - "num-traits", -] - [[package]] name = "atoi_simd" version = "0.16.0" @@ -2939,7 +2930,6 @@ version = "0.45.1" dependencies = [ "ahash", "async-stream", - "atoi", "atoi_simd", "avro-schema", "bytemuck", @@ -3505,7 +3495,7 @@ dependencies = [ name = "polars-time" version = "0.45.1" dependencies = [ - "atoi", + "atoi_simd", "bytemuck", "chrono", "chrono-tz", diff --git a/Cargo.toml b/Cargo.toml index 9ba205ecee31..41ff9b368a91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,6 @@ repository = "https://github.com/pola-rs/polars" ahash = ">=0.8.5" aho-corasick = "1.1" arboard = { version = "3.4.0", default-features = false } -atoi = "2" atoi_simd = "0.16" atomic-waker = "1" avro-schema = { version = "0.3" } diff --git a/crates/polars-arrow/Cargo.toml b/crates/polars-arrow/Cargo.toml index 8fac057bd48f..8d1678ca5081 100644 --- a/crates/polars-arrow/Cargo.toml +++ b/crates/polars-arrow/Cargo.toml @@ -13,7 +13,6 @@ repository = { workspace = true } description = "Minimal implementation of the Arrow specification forked from arrow2" [dependencies] -atoi = { workspace = true, optional = true } bytemuck = { workspace = true, features = ["must_cast"] } chrono = { workspace = true } # for timezone support @@ -143,7 +142,7 @@ timezones = [ "chrono-tz", ] dtype-array = [] -dtype-decimal = ["atoi", "itoa"] +dtype-decimal = ["atoi_simd", "itoa"] bigidx = ["polars-utils/bigidx"] nightly = [] performant = [] diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index e5dc464f203b..85fcda7a1908 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -1,6 +1,5 @@ use std::sync::atomic::{AtomicBool, Ordering}; -use atoi::FromRadix10SignedChecked; use num_traits::Euclid; static TRIM_DECIMAL_ZEROS: AtomicBool = AtomicBool::new(false); @@ -26,8 +25,7 @@ fn split_decimal_bytes(bytes: &[u8]) -> (Option<&[u8]>, Option<&[u8]>) { /// Parse a single i128 from bytes, ensuring the entire slice is read. fn parse_integer_checked(bytes: &[u8]) -> Option { - let (n, len) = i128::from_radix_10_signed_checked(bytes); - n.filter(|_| len == bytes.len()) + atoi_simd::parse_skipped(bytes).ok() } /// Assuming bytes are a well-formed decimal number (with or without a separator), diff --git a/crates/polars-time/Cargo.toml b/crates/polars-time/Cargo.toml index d144b177174b..62577bdb41cf 100644 --- a/crates/polars-time/Cargo.toml +++ b/crates/polars-time/Cargo.toml @@ -16,7 +16,7 @@ polars-error = { workspace = true } polars-ops = { workspace = true } polars-utils = { workspace = true } -atoi = { workspace = true } +atoi_simd = { workspace = true } bytemuck = { workspace = true } chrono = { workspace = true } chrono-tz = { workspace = true, optional = true } diff --git a/crates/polars-time/src/chunkedarray/string/strptime.rs b/crates/polars-time/src/chunkedarray/string/strptime.rs index 04b6f4ebd315..229281a30dfa 100644 --- a/crates/polars-time/src/chunkedarray/string/strptime.rs +++ b/crates/polars-time/src/chunkedarray/string/strptime.rs @@ -1,6 +1,5 @@ //! Much more opinionated, but also much faster strptrime than the one given in Chrono. //! -use atoi::FromRadix10; use chrono::{NaiveDate, NaiveDateTime}; use once_cell::sync::Lazy; use regex::Regex; @@ -14,7 +13,7 @@ static TWELVE_HOUR_PATTERN: Lazy = Lazy::new(|| Regex::new(r"%[_-]?[Il]") static MERIDIEM_PATTERN: Lazy = Lazy::new(|| Regex::new(r"%[_-]?[pP]").unwrap()); #[inline] -fn update_and_parse( +fn update_and_parse( incr: usize, offset: usize, vals: &[u8], @@ -22,7 +21,7 @@ fn update_and_parse( // this maybe oob because we cannot entirely sure about fmt lengths let new_offset = offset + incr; let bytes = vals.get(offset..new_offset)?; - let (val, parsed) = T::from_radix_10(bytes); + let (val, parsed) = atoi_simd::parse_any(bytes).ok()?; if parsed == 0 { None } else { @@ -154,7 +153,7 @@ impl StrpTimeState { let new_offset = offset + 2; let bytes = val.get_unchecked(offset..new_offset); - let (decade, parsed) = i32::from_radix_10(bytes); + let (decade, parsed) = atoi_simd::parse_any::(bytes).ok()?; if parsed == 0 { return None; } From df79a1df85df4894a73267efe4950ec7df50b54e Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 24 Jan 2025 12:17:24 +0100 Subject: [PATCH 2/5] fix bug and simplify/robustify decimal parsing --- crates/polars-arrow/src/compute/decimal.rs | 144 +++++++++------------ 1 file changed, 59 insertions(+), 85 deletions(-) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index 85fcda7a1908..2c709c70437a 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -11,102 +11,74 @@ pub fn set_trim_decimal_zeros(trim: Option) { TRIM_DECIMAL_ZEROS.store(trim.unwrap_or(false), Ordering::Relaxed) } -/// Count the number of b'0's at the beginning of a slice. -fn leading_zeros(bytes: &[u8]) -> u8 { - bytes.iter().take_while(|byte| **byte == b'0').count() as u8 -} - -fn split_decimal_bytes(bytes: &[u8]) -> (Option<&[u8]>, Option<&[u8]>) { - let mut a = bytes.splitn(2, |x| *x == b'.'); - let lhs = a.next(); - let rhs = a.next(); - (lhs, rhs) -} - -/// Parse a single i128 from bytes, ensuring the entire slice is read. -fn parse_integer_checked(bytes: &[u8]) -> Option { - atoi_simd::parse_skipped(bytes).ok() -} - -/// Assuming bytes are a well-formed decimal number (with or without a separator), -/// infer the scale of the number. If no separator is present, the scale is 0. -pub fn infer_scale(bytes: &[u8]) -> u8 { - let (_lhs, rhs) = split_decimal_bytes(bytes); - rhs.map_or(0, |x| x.len() as u8) -} - -/// Deserialize bytes to a single i128 representing a decimal, at a specified precision -/// (optional) and scale (required). If precision is not specified, it is assumed to be -/// 38 (the max precision allowed by the i128 representation). The number is checked to -/// ensure it fits within the specified precision and scale. Consistent with float parsing, -/// no decimal separator is required (eg "500", "500.", and "500.0" are all accepted); this allows -/// mixed integer/decimal sequences to be parsed as decimals. All trailing zeros are assumed to -/// be significant, whether or not a separator is present: 1200 requires precision >= 4, while 1200.200 -/// requires precision >= 7 and scale >= 3. Returns None if the number is not well-formed, or does not -/// fit. Only b'.' is allowed as a decimal separator (issue #6698). +/// Deserialize bytes to a single i128 representing a decimal, at a specified +/// precision (optional) and scale (required). The number is checked to ensure +/// it fits within the specified precision and scale. Consistent with float +/// parsing, no decimal separator is required (eg "500", "500.", and "500.0" are +/// all accepted); this allows mixed integer/decimal sequences to be parsed as +/// decimals. All trailing zeros are assumed to be significant, whether or not +/// a separator is present: 1200 requires precision >= 4, while 1200.200 +/// requires precision >= 7 and scale >= 3. Returns None if the number is not +/// well-formed, or does not fit. Only b'.' is allowed as a decimal separator +/// (issue #6698). #[inline] pub fn deserialize_decimal(mut bytes: &[u8], precision: Option, scale: u8) -> Option { - // While parse_integer_checked will parse positive/negative numbers, we want to - // handle the sign ourselves, and so check for it initially, then handle it - // at the end. + let precision_digits = precision.unwrap_or(38).min(38) as usize; + if scale as usize > precision_digits { + return None; + } + + let separator = bytes.iter().position(|b| *b == b'.').unwrap_or(bytes.len()); + let (mut int, mut frac) = bytes.split_at(separator); + if frac.len() <= 1 || scale == 0 { + // Only integer fast path. + let n: i128 = atoi_simd::parse(int).ok()?; + let ret = n.checked_mul(POW10[scale as usize] as i128)?; + if precision.is_some() && ret >= POW10[precision_digits] as i128 { + return None; + } + return Some(ret); + } + + // Skip period. + frac = &frac[1..]; + + // Skip sign. let negative = match bytes.first() { Some(s @ (b'+' | b'-')) => { - bytes = &bytes[1..]; + int = &int[1..]; *s == b'-' }, _ => false, }; - let (lhs, rhs) = split_decimal_bytes(bytes); - let precision = precision.unwrap_or(38); - - let lhs_b = lhs?; - - // For the purposes of decimal parsing, we assume that all digits other than leading zeros - // are significant, eg, 001200 has 4 significant digits, not 2. The Decimal type does - // not allow negative scales, so all trailing zeros on the LHS of any decimal separator - // will still take up space in the representation (eg, 1200 requires, at minimum, precision 4 - // at scale 0; there is no scale -2 where it would only need precision 2). - let lhs_s = lhs_b.len() as u8 - leading_zeros(lhs_b); - - if lhs_s + scale > precision { - // the integer already exceeds the precision + + // Truncate trailing digits that extend beyond the scale. + let frac_scale = if scale as usize <= frac.len() { + frac = &frac[..scale as usize]; + 0 + } else { + scale as usize - frac.len() + }; + + // Parse and combine parts. + let pint: u128 = if int.is_empty() { 0 } else { atoi_simd::parse_pos(int).ok()? }; + let pfrac: u128 = atoi_simd::parse_pos(frac).ok()?; + + let ret = pint.checked_mul(POW10[scale as usize])?.checked_add( + pfrac.checked_mul(POW10[frac_scale])?)?; + if precision.is_some() && ret >= POW10[precision_digits] { return None; } - - let abs = parse_integer_checked(lhs_b).and_then(|x| match rhs { - // A decimal separator was found, so LHS and RHS need to be combined. - Some(mut rhs) => { - if matches!(rhs.first(), Some(b'+' | b'-')) { - // RHS starts with a '+'/'-' sign and the number is not well-formed. - return None; - } - let scale_adjust = if (scale as usize) <= rhs.len() { - // Truncate trailing digits that extend beyond the scale - rhs = &rhs[..scale as usize]; - None - } else { - Some(scale as u32 - rhs.len() as u32) - }; - - parse_integer_checked(rhs).map(|y| { - let lhs = x * 10i128.pow(scale as u32); - let rhs = scale_adjust.map_or(y, |s| y * 10i128.pow(s)); - lhs + rhs - }) - }, - // No decimal separator was found; we have an integer / LHS only. - None => { - if lhs_b.is_empty() { - // we simply have no number at all / an empty string. - return None; - } - Some(x * 10i128.pow(scale as u32)) - }, - }); if negative { - Some(-abs?) + if ret == (1 << 127) { + // Has to be handled separately, 2^127 does not fit in i128, but + // does negated. + Some(i128::MIN) + } else { + ret.try_into().ok().map(|n: i128| -n) + } } else { - abs + ret.try_into().ok() } } @@ -320,6 +292,8 @@ mod test { assert_eq!(deserialize_decimal(val, Some(5), 6), None); // insufficient precision, excess scale assert_eq!(deserialize_decimal(val, Some(5), 3), None); // insufficient precision, exact scale assert_eq!(deserialize_decimal(val, Some(12), 5), Some(120001000)); // excess precision, excess scale - assert_eq!(deserialize_decimal(val, None, 35), None); // scale causes insufficient precision + assert_eq!(deserialize_decimal(val, None, 35), Some(120001000000000000000000000000000000000)); + assert_eq!(deserialize_decimal(val, None, 36), None); + assert_eq!(deserialize_decimal(val, Some(38), 35), None); // scale causes insufficient precision } } From 5d8d926b6a5d6365554cf91620ed4e58f23ca00b Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 24 Jan 2025 12:20:55 +0100 Subject: [PATCH 3/5] simplify negation --- crates/polars-arrow/src/compute/decimal.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index 2c709c70437a..9122fe10bedc 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -70,12 +70,10 @@ pub fn deserialize_decimal(mut bytes: &[u8], precision: Option, scale: u8) - return None; } if negative { - if ret == (1 << 127) { - // Has to be handled separately, 2^127 does not fit in i128, but - // does negated. - Some(i128::MIN) + if ret > (1 << 127) { + None } else { - ret.try_into().ok().map(|n: i128| -n) + Some(ret.wrapping_neg() as i128) } } else { ret.try_into().ok() From fa0c54a2109633a764575a624909459eabcb841b Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 24 Jan 2025 12:35:15 +0100 Subject: [PATCH 4/5] re-introduce infer_scale (was used elsewhere) --- crates/polars-arrow/src/compute/decimal.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index 9122fe10bedc..601a7a055cc4 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -11,6 +11,13 @@ pub fn set_trim_decimal_zeros(trim: Option) { TRIM_DECIMAL_ZEROS.store(trim.unwrap_or(false), Ordering::Relaxed) } +/// Assuming bytes are a well-formed decimal number (with or without a separator), +/// infer the scale of the number. If no separator is present, the scale is 0. +pub fn infer_scale(bytes: &[u8]) -> u8 { + let Some(separator) = bytes.iter().position(|b| *b == b'.') else { return 0; }; + (bytes.len() - (1 + separator)) as u8 +} + /// Deserialize bytes to a single i128 representing a decimal, at a specified /// precision (optional) and scale (required). The number is checked to ensure /// it fits within the specified precision and scale. Consistent with float @@ -22,7 +29,7 @@ pub fn set_trim_decimal_zeros(trim: Option) { /// well-formed, or does not fit. Only b'.' is allowed as a decimal separator /// (issue #6698). #[inline] -pub fn deserialize_decimal(mut bytes: &[u8], precision: Option, scale: u8) -> Option { +pub fn deserialize_decimal(bytes: &[u8], precision: Option, scale: u8) -> Option { let precision_digits = precision.unwrap_or(38).min(38) as usize; if scale as usize > precision_digits { return None; From 58a2fd5ee606261e08cf01950375047198bcc9fd Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 24 Jan 2025 13:05:51 +0100 Subject: [PATCH 5/5] fmt --- crates/polars-arrow/src/compute/decimal.rs | 30 ++++++++++++++-------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/crates/polars-arrow/src/compute/decimal.rs b/crates/polars-arrow/src/compute/decimal.rs index 601a7a055cc4..6145d0985f56 100644 --- a/crates/polars-arrow/src/compute/decimal.rs +++ b/crates/polars-arrow/src/compute/decimal.rs @@ -14,7 +14,9 @@ pub fn set_trim_decimal_zeros(trim: Option) { /// Assuming bytes are a well-formed decimal number (with or without a separator), /// infer the scale of the number. If no separator is present, the scale is 0. pub fn infer_scale(bytes: &[u8]) -> u8 { - let Some(separator) = bytes.iter().position(|b| *b == b'.') else { return 0; }; + let Some(separator) = bytes.iter().position(|b| *b == b'.') else { + return 0; + }; (bytes.len() - (1 + separator)) as u8 } @@ -34,7 +36,7 @@ pub fn deserialize_decimal(bytes: &[u8], precision: Option, scale: u8) -> Op if scale as usize > precision_digits { return None; } - + let separator = bytes.iter().position(|b| *b == b'.').unwrap_or(bytes.len()); let (mut int, mut frac) = bytes.split_at(separator); if frac.len() <= 1 || scale == 0 { @@ -48,7 +50,7 @@ pub fn deserialize_decimal(bytes: &[u8], precision: Option, scale: u8) -> Op } // Skip period. - frac = &frac[1..]; + frac = &frac[1..]; // Skip sign. let negative = match bytes.first() { @@ -58,7 +60,7 @@ pub fn deserialize_decimal(bytes: &[u8], precision: Option, scale: u8) -> Op }, _ => false, }; - + // Truncate trailing digits that extend beyond the scale. let frac_scale = if scale as usize <= frac.len() { frac = &frac[..scale as usize]; @@ -66,13 +68,18 @@ pub fn deserialize_decimal(bytes: &[u8], precision: Option, scale: u8) -> Op } else { scale as usize - frac.len() }; - + // Parse and combine parts. - let pint: u128 = if int.is_empty() { 0 } else { atoi_simd::parse_pos(int).ok()? }; + let pint: u128 = if int.is_empty() { + 0 + } else { + atoi_simd::parse_pos(int).ok()? + }; let pfrac: u128 = atoi_simd::parse_pos(frac).ok()?; - - let ret = pint.checked_mul(POW10[scale as usize])?.checked_add( - pfrac.checked_mul(POW10[frac_scale])?)?; + + let ret = pint + .checked_mul(POW10[scale as usize])? + .checked_add(pfrac.checked_mul(POW10[frac_scale])?)?; if precision.is_some() && ret >= POW10[precision_digits] { return None; } @@ -297,7 +304,10 @@ mod test { assert_eq!(deserialize_decimal(val, Some(5), 6), None); // insufficient precision, excess scale assert_eq!(deserialize_decimal(val, Some(5), 3), None); // insufficient precision, exact scale assert_eq!(deserialize_decimal(val, Some(12), 5), Some(120001000)); // excess precision, excess scale - assert_eq!(deserialize_decimal(val, None, 35), Some(120001000000000000000000000000000000000)); + assert_eq!( + deserialize_decimal(val, None, 35), + Some(120001000000000000000000000000000000000) + ); assert_eq!(deserialize_decimal(val, None, 36), None); assert_eq!(deserialize_decimal(val, Some(38), 35), None); // scale causes insufficient precision }