From d5a026455eebba68432339e6dcb8c4b8acecf4a9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 26 Nov 2024 19:06:25 -0500 Subject: [PATCH 1/2] fix: regression in `hist` panicking --- crates/polars-ops/src/chunked_array/hist.rs | 14 ++++---------- py-polars/tests/unit/operations/test_hist.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/hist.rs b/crates/polars-ops/src/chunked_array/hist.rs index 16ae776cae4a..87dc77429386 100644 --- a/crates/polars-ops/src/chunked_array/hist.rs +++ b/crates/polars-ops/src/chunked_array/hist.rs @@ -93,7 +93,6 @@ where let min_break: f64 = breaks[0]; let max_break: f64 = breaks[num_bins]; let width = breaks[1] - min_break; // guaranteed at least one bin - let is_integer = !T::get_dtype().is_float(); for chunk in ca.downcast_iter() { for item in chunk.non_null_values_iter() { @@ -101,15 +100,10 @@ where if include_lower && item == min_break { count[0] += 1; } else if item > min_break && item <= max_break { - let idx = (item - min_break) / width; - // This is needed for numeric stability for integers. - // We can fall directly on a boundary with an integer. - let idx = if is_integer && (idx.round() - idx).abs() < 0.0000001 { - idx.round() - 1.0 - } else { - idx.ceil() - 1.0 - }; - count[idx as usize] += 1; + let idx = ((item - min_break) / width).floor() as usize; + // handle the case where item lands on the max_break boundary + let idx = if idx == num_bins { idx - 1 } else { idx }; + count[idx] += 1; } } } diff --git a/py-polars/tests/unit/operations/test_hist.py b/py-polars/tests/unit/operations/test_hist.py index f3f52e2cfa92..eec0dd263b7f 100644 --- a/py-polars/tests/unit/operations/test_hist.py +++ b/py-polars/tests/unit/operations/test_hist.py @@ -410,3 +410,16 @@ def test_hist_floating_point() -> None: upper = bp[i] assert ((s <= upper) & (s > lower)).sum() == count[i] + + +def test_hist_max_boundary_19998() -> None: + s = pl.Series( + [ + 9514.988509739183, + 30738.098872148617, + 41400.15705103004, + 49093.06982022727, + ] + ) + result = s.hist(bin_count=50) + assert result["count"].sum() == 4 From 5d9595d2197f12aba9658a9e1a3600c392a7a8f4 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 26 Nov 2024 20:29:25 -0500 Subject: [PATCH 2/2] fix --- crates/polars-ops/src/chunked_array/hist.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/hist.rs b/crates/polars-ops/src/chunked_array/hist.rs index 87dc77429386..27ab0df27798 100644 --- a/crates/polars-ops/src/chunked_array/hist.rs +++ b/crates/polars-ops/src/chunked_array/hist.rs @@ -99,11 +99,18 @@ where let item = item.to_f64().unwrap(); if include_lower && item == min_break { count[0] += 1; - } else if item > min_break && item <= max_break { - let idx = ((item - min_break) / width).floor() as usize; - // handle the case where item lands on the max_break boundary - let idx = if idx == num_bins { idx - 1 } else { idx }; - count[idx] += 1; + } else if item == max_break { + count[num_bins - 1] += 1; + } else if item > min_break && item < max_break { + let width_multiple = (item - min_break) / width; + let idx = width_multiple.floor(); + // handle the case where item lands on the boundary + let idx = if idx == width_multiple { + idx - 1.0 + } else { + idx + }; + count[idx as usize] += 1; } } }