From 5c0a33a0efca0f353be0e3001eef7dbc36a03622 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 7 Jan 2025 10:19:44 +0100 Subject: [PATCH] feat: add `Series|Expr.rank` (#1342) * pandas int workaround * old pyarrow * fail pandas_pyarrow for pandas < (2,1) * xfail int only * fix options in over * merge main and better return docstring * float(nan) -> None * test eager only for rank --- docs/api-reference/expr.md | 1 + docs/api-reference/series.md | 1 + narwhals/_arrow/expr.py | 10 +++ narwhals/_arrow/series.py | 30 +++++++ narwhals/_pandas_like/expr.py | 24 +++++- narwhals/_pandas_like/series.py | 50 +++++++++++ narwhals/expr.py | 97 +++++++++++++++++++++ narwhals/series.py | 95 ++++++++++++++++++++ tests/expr_and_series/rank_test.py | 134 +++++++++++++++++++++++++++++ 9 files changed, 439 insertions(+), 3 deletions(-) create mode 100644 tests/expr_and_series/rank_test.py diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index 299ab2d4a..e0f7b6578 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -47,6 +47,7 @@ - over - pipe - quantile + - rank - replace_strict - rolling_mean - rolling_std diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index c2e35a3c5..0aea494f7 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -54,6 +54,7 @@ - null_count - pipe - quantile + - rank - rename - replace_strict - rolling_mean diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index e511f405d..5ae6ce6b0 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -527,6 +527,16 @@ def rolling_std( ddof=ddof, ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + return reuse_series_implementation( + self, "rank", method=method, descending=descending + ) + @property def dt(self: Self) -> ArrowExprDateTimeNamespace: return ArrowExprDateTimeNamespace(self) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 046e26e05..1e8d09827 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -1097,6 +1097,36 @@ def rolling_std( ** 0.5 ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + if method == "average": + msg = ( + "`rank` with `method='average' is not supported for pyarrow backend. " + "The available methods are {'min', 'max', 'dense', 'ordinal'}." + ) + raise ValueError(msg) + + import pyarrow as pa # ignore-banned-import + import pyarrow.compute as pc # ignore-banned-import + + sort_keys = "descending" if descending else "ascending" + tiebreaker = "first" if method == "ordinal" else method + + native_series = self._native_series + if self._backend_version < (14, 0, 0): # pragma: no cover + native_series = native_series.combine_chunks() + + null_mask = pc.is_null(native_series) + + rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) + + result = pc.if_else(null_mask, pa.scalar(None), rank) + return self._from_native_series(result) + def __iter__(self: Self) -> Iterator[Any]: yield from ( maybe_extract_py_scalar(x, return_py_scalar=True) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index c681fc487..fac9a2ed6 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -34,6 +34,7 @@ # So, instead of using "cumcount" we use "cumsum" on notna() to get the same result "col->cum_count": "cumsum", "col->shift": "shift", + "col->rank": "rank", } @@ -383,7 +384,7 @@ def alias(self, name: str) -> Self: kwargs={**self._kwargs, "name": name}, ) - def over(self, keys: list[str]) -> Self: + def over(self: Self, keys: list[str]) -> Self: if self._function_name in MANY_TO_MANY_AGG_FUNCTIONS_TO_PANDAS_EQUIVALENT: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: @@ -412,8 +413,15 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: if self._function_name == "col->shift": kwargs = {"periods": self._kwargs.get("n", 1)} - else: - # Cumulative operation + elif self._function_name == "col->rank": + _method = self._kwargs.get("method", "average") + kwargs = { + "method": "first" if _method == "ordinal" else _method, + "ascending": not self._kwargs.get("descending", False), + "na_option": "keep", + "pct": False, + } + else: # Cumulative operation kwargs = {"skipna": True} res_native = getattr( @@ -617,6 +625,16 @@ def rolling_std( ddof=ddof, ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + return reuse_series_implementation( + self, "rank", method=method, descending=descending + ) + @property def str(self: Self) -> PandasLikeExprStringNamespace: return PandasLikeExprStringNamespace(self) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 60918fd2c..e5c5e771e 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -1119,6 +1119,56 @@ def is_finite(self: Self) -> Self: s = self._native_series return self._from_native_series((s > float("-inf")) & (s < float("inf"))) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + pd_method = "first" if method == "ordinal" else method + native_series = self._native_series + dtypes = import_dtypes_module(self._version) + if ( + self._implementation is Implementation.PANDAS + and self._backend_version < (3,) + and self.dtype + in { + dtypes.Int64, + dtypes.Int32, + dtypes.Int16, + dtypes.Int8, + dtypes.UInt64, + dtypes.UInt32, + dtypes.UInt16, + dtypes.UInt8, + } + and (null_mask := native_series.isna()).any() + ): + # crazy workaround for the case of `na_option="keep"` and nullable + # integer dtypes. This should be supported in pandas > 3.0 + # https://github.com/pandas-dev/pandas/issues/56976 + ranked_series = ( + native_series.to_frame() + .assign(**{f"{native_series.name}_is_null": null_mask}) + .groupby(f"{native_series.name}_is_null") + .rank( + method=pd_method, + na_option="keep", + ascending=not descending, + pct=False, + )[native_series.name] + ) + + else: + ranked_series = native_series.rank( + method=pd_method, + na_option="keep", + ascending=not descending, + pct=False, + ) + + return self._from_native_series(ranked_series) + @property def str(self) -> PandasLikeSeriesStringNamespace: return PandasLikeSeriesStringNamespace(self) diff --git a/narwhals/expr.py b/narwhals/expr.py index 3e457989a..809f76e77 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -4114,6 +4114,103 @@ def rolling_std( ) ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"] = "average", + *, + descending: bool = False, + ) -> Self: + """Assign ranks to data, dealing with ties appropriately. + + Notes: + The resulting dtype may differ between backends. + + Arguments: + method: The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + + - 'average' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - 'min' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - 'max' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - 'dense' : Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - 'ordinal' : All values are given a distinct rank, corresponding to the + order that the values occur in the Series. + + descending: Rank in descending order. + + Returns: + A new expression with rank data. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [3, 6, 1, 1, 6]} + + We define a dataframe-agnostic function that computes the dense rank for + the data: + + >>> def agnostic_dense_rank(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... result = df.with_columns(rnk=nw.col("a").rank(method="dense")) + ... return result.to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dense_rank`: + + >>> agnostic_dense_rank(pd.DataFrame(data)) + a rnk + 0 3 2.0 + 1 6 3.0 + 2 1 1.0 + 3 1 1.0 + 4 6 3.0 + + >>> agnostic_dense_rank(pl.DataFrame(data)) + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ rnk │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + │ 6 ┆ 3 │ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 6 ┆ 3 │ + └─────┴─────┘ + + >>> agnostic_dense_rank(pa.table(data)) + pyarrow.Table + a: int64 + rnk: uint64 + ---- + a: [[3,6,1,1,6]] + rnk: [[2,3,1,1,3]] + """ + supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} + if method not in supported_rank_methods: + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. " + f"Found '{method}'" + ) + raise ValueError(msg) + + return self.__class__( + lambda plx: self._to_compliant_expr(plx).rank( + method=method, descending=descending + ) + ) + @property def str(self: Self) -> ExprStringNamespace[Self]: return ExprStringNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index c3e6f181b..7b4cfbf6e 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -4738,6 +4738,101 @@ def __iter__(self: Self) -> Iterator[Any]: def __contains__(self: Self, other: Any) -> bool: return self._compliant_series.__contains__(other) # type: ignore[no-any-return] + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"] = "average", + *, + descending: bool = False, + ) -> Self: + """Assign ranks to data, dealing with ties appropriately. + + Notes: + The resulting dtype may differ between backends. + + Arguments: + method: The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + + - 'average' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - 'min' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - 'max' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - 'dense' : Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - 'ordinal' : All values are given a distinct rank, corresponding to the + order that the values occur in the Series. + + descending: Rank in descending order. + + Returns: + A new series with rank data as values. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> + >>> data = [3, 6, 1, 1, 6] + + We define a dataframe-agnostic function that computes the dense rank for + the data: + + >>> def agnostic_dense_rank(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.rank(method="dense").to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dense_rank`: + + >>> agnostic_dense_rank(pd.Series(data)) + 0 2.0 + 1 3.0 + 2 1.0 + 3 1.0 + 4 3.0 + dtype: float64 + + >>> agnostic_dense_rank(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [u32] + [ + 2 + 3 + 1 + 1 + 3 + ] + + >>> agnostic_dense_rank(pa.chunked_array([data])) # doctest:+ELLIPSIS + + [ + [ + 2, + 3, + 1, + 1, + 3 + ] + ] + """ + supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} + if method not in supported_rank_methods: + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. " + f"Found '{method}'" + ) + raise ValueError(msg) + + return self._from_compliant_series( + self._compliant_series.rank(method=method, descending=descending) + ) + @property def str(self: Self) -> SeriesStringNamespace[Self]: return SeriesStringNamespace(self) diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py new file mode 100644 index 000000000..99a64371e --- /dev/null +++ b/tests/expr_and_series/rank_test.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +from contextlib import nullcontext as does_not_raise +from typing import Literal + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import PANDAS_VERSION +from tests.utils import ConstructorEager +from tests.utils import assert_equal_data + +rank_methods = ["average", "min", "max", "dense", "ordinal"] + +data_int = {"a": [3, 6, 1, 1, None, 6], "b": [1, 1, 2, 1, 2, 2]} +data_float = {"a": [3.1, 6.1, 1.5, 1.5, None, 6.1], "b": [1, 1, 2, 1, 2, 2]} + +expected = { + "average": [3.0, 4.5, 1.5, 1.5, None, 4.5], + "min": [3, 4, 1, 1, None, 4], + "max": [3, 5, 2, 2, None, 5], + "dense": [2, 3, 1, 1, None, 3], + "ordinal": [3, 4, 1, 2, None, 5], +} + +expected_over = { + "average": [2.0, 3.0, 1.0, 1.0, None, 2.0], + "min": [2, 3, 1, 1, None, 2], + "max": [2, 3, 1, 1, None, 2], + "dense": [2, 3, 1, 1, None, 2], + "ordinal": [2, 3, 1, 1, None, 2], +} + + +@pytest.mark.parametrize("method", rank_methods) +@pytest.mark.parametrize("data", [data_int, data_float]) +def test_rank_expr( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], + data: dict[str, list[float]], +) -> None: + if ( + "pandas_pyarrow" in str(constructor_eager) + and PANDAS_VERSION < (2, 1) + and isinstance(data["a"][0], int) + ): + request.applymarker(pytest.mark.xfail) + + context = ( + pytest.raises( + ValueError, + match=r"`rank` with `method='average' is not supported for pyarrow backend.", + ) + if "pyarrow_table" in str(constructor_eager) and method == "average" + else does_not_raise() + ) + + with context: + df = nw.from_native(constructor_eager(data)) + + result = df.select(nw.col("a").rank(method=method)) + expected_data = {"a": expected[method]} + assert_equal_data(result, expected_data) + + +@pytest.mark.parametrize("method", rank_methods) +@pytest.mark.parametrize("data", [data_int, data_float]) +def test_rank_series( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], + data: dict[str, list[float]], +) -> None: + if ( + "pandas_pyarrow" in str(constructor_eager) + and PANDAS_VERSION < (2, 1) + and isinstance(data["a"][0], int) + ): + request.applymarker(pytest.mark.xfail) + + context = ( + pytest.raises( + ValueError, + match=r"`rank` with `method='average' is not supported for pyarrow backend.", + ) + if "pyarrow_table" in str(constructor_eager) and method == "average" + else does_not_raise() + ) + + with context: + df = nw.from_native(constructor_eager(data), eager_only=True) + + result = {"a": df["a"].rank(method=method)} + expected_data = {"a": expected[method]} + assert_equal_data(result, expected_data) + + +@pytest.mark.parametrize("method", rank_methods) +def test_rank_expr_in_over_context( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], +) -> None: + if any(x in str(constructor_eager) for x in ("pyarrow_table", "dask")): + # Pyarrow raises: + # > pyarrow.lib.ArrowKeyError: No function registered with name: hash_rank + # We can handle that to provide a better error message. + request.applymarker(pytest.mark.xfail) + + if "pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 1): + request.applymarker(pytest.mark.xfail) + + df = nw.from_native(constructor_eager(data_float)) + + result = df.select(nw.col("a").rank(method=method).over("b")) + expected_data = {"a": expected_over[method]} + assert_equal_data(result, expected_data) + + +def test_invalid_method_raise(constructor_eager: ConstructorEager) -> None: + method = "invalid_method_name" + df = nw.from_native(constructor_eager(data_float)) + + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. " + f"Found '{method}'" + ) + + with pytest.raises(ValueError, match=msg): + df.select(nw.col("a").rank(method=method)) # type: ignore[arg-type] + + with pytest.raises(ValueError, match=msg): + df.lazy().collect()["a"].rank(method=method) # type: ignore[arg-type]