Skip to content

Commit

Permalink
feat: add Series|Expr.rank (#1342)
Browse files Browse the repository at this point in the history
* pandas int workaround

* old pyarrow

* fail pandas_pyarrow for pandas < (2,1)

* xfail int only

* fix options in over

* merge main and better return docstring

* float(nan) -> None

* test eager only for rank
  • Loading branch information
FBruzzesi authored Jan 7, 2025
1 parent 9a62d90 commit 5c0a33a
Show file tree
Hide file tree
Showing 9 changed files with 439 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
- over
- pipe
- quantile
- rank
- replace_strict
- rolling_mean
- rolling_std
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
- null_count
- pipe
- quantile
- rank
- rename
- replace_strict
- rolling_mean
Expand Down
10 changes: 10 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,16 @@ def rolling_std(
ddof=ddof,
)

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"],
    *,
    descending: bool,
) -> Self:
    """Rank the expression's values by delegating to the series-level `rank`.

    Arguments:
        method: Tie-handling strategy; forwarded unchanged.
        descending: Whether to rank in descending order; forwarded unchanged.

    Returns:
        Whatever `reuse_series_implementation` produces for the `"rank"`
        series method called with these options.
    """
    rank_options = {"method": method, "descending": descending}
    return reuse_series_implementation(self, "rank", **rank_options)

@property
def dt(self: Self) -> ArrowExprDateTimeNamespace:
    """Return an `ArrowExprDateTimeNamespace` wrapping this expression."""
    namespace = ArrowExprDateTimeNamespace(self)
    return namespace
Expand Down
30 changes: 30 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,36 @@ def rolling_std(
** 0.5
)

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"],
    *,
    descending: bool,
) -> Self:
    """Rank the values of the series using `pyarrow.compute.rank`.

    Positions that are null in the input are null in the output.

    Arguments:
        method: Tie-handling strategy. `'ordinal'` is passed to pyarrow as the
            `'first'` tiebreaker; the other accepted values are passed through
            unchanged. `'average'` is rejected.
        descending: Rank in descending instead of ascending order.

    Returns:
        A new series built from the ranked native data.

    Raises:
        ValueError: If `method` is `'average'`.
    """
    if method == "average":
        msg = (
            "`rank` with `method='average' is not supported for pyarrow backend. "
            "The available methods are {'min', 'max', 'dense', 'ordinal'}."
        )
        raise ValueError(msg)

    import pyarrow as pa  # ignore-banned-import
    import pyarrow.compute as pc  # ignore-banned-import

    values = self._native_series
    if self._backend_version < (14, 0, 0):  # pragma: no cover
        # For older pyarrow the chunks are merged before ranking.
        values = values.combine_chunks()

    if descending:
        order = "descending"
    else:
        order = "ascending"

    if method == "ordinal":
        tie_rule = "first"
    else:
        tie_rule = method

    ranks = pc.rank(values, sort_keys=order, tiebreaker=tie_rule)

    # Replace the rank of every null input with null.
    masked_ranks = pc.if_else(pc.is_null(values), pa.scalar(None), ranks)
    return self._from_native_series(masked_ranks)

def __iter__(self: Self) -> Iterator[Any]:
yield from (
maybe_extract_py_scalar(x, return_py_scalar=True)
Expand Down
24 changes: 21 additions & 3 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
# So, instead of using "cumcount" we use "cumsum" on notna() to get the same result
"col->cum_count": "cumsum",
"col->shift": "shift",
"col->rank": "rank",
}


Expand Down Expand Up @@ -383,7 +384,7 @@ def alias(self, name: str) -> Self:
kwargs={**self._kwargs, "name": name},
)

def over(self, keys: list[str]) -> Self:
def over(self: Self, keys: list[str]) -> Self:
if self._function_name in MANY_TO_MANY_AGG_FUNCTIONS_TO_PANDAS_EQUIVALENT:

def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
Expand Down Expand Up @@ -412,8 +413,15 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:

if self._function_name == "col->shift":
kwargs = {"periods": self._kwargs.get("n", 1)}
else:
# Cumulative operation
elif self._function_name == "col->rank":
_method = self._kwargs.get("method", "average")
kwargs = {
"method": "first" if _method == "ordinal" else _method,
"ascending": not self._kwargs.get("descending", False),
"na_option": "keep",
"pct": False,
}
else: # Cumulative operation
kwargs = {"skipna": True}

res_native = getattr(
Expand Down Expand Up @@ -617,6 +625,16 @@ def rolling_std(
ddof=ddof,
)

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"],
    *,
    descending: bool,
) -> Self:
    """Build the rank expression by reusing the series-level `rank` method.

    Arguments:
        method: Tie-handling strategy; passed through as-is.
        descending: Whether to rank in descending order; passed through as-is.

    Returns:
        The value returned by `reuse_series_implementation` for `"rank"`.
    """
    forwarded = {"method": method, "descending": descending}
    return reuse_series_implementation(self, "rank", **forwarded)

@property
def str(self: Self) -> PandasLikeExprStringNamespace:
    """Return a `PandasLikeExprStringNamespace` wrapping this expression."""
    namespace = PandasLikeExprStringNamespace(self)
    return namespace
Expand Down
50 changes: 50 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1119,6 +1119,56 @@ def is_finite(self: Self) -> Self:
s = self._native_series
return self._from_native_series((s > float("-inf")) & (s < float("inf")))

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"],
    *,
    descending: bool,
) -> Self:
    """Rank the values of the series with the native pandas-like `rank`.

    Ranking always uses `na_option="keep"` and `pct=False`.

    Arguments:
        method: Tie-handling strategy. `'ordinal'` is translated to the native
            `'first'` method; the other values are passed through unchanged.
        descending: Rank in descending instead of ascending order.

    Returns:
        A new series wrapping the ranked native series, carrying the same
        name as the input.
    """
    pd_method = "first" if method == "ordinal" else method
    native_series = self._native_series
    dtypes = import_dtypes_module(self._version)
    rank_kwargs = {
        "method": pd_method,
        "na_option": "keep",
        "ascending": not descending,
        "pct": False,
    }
    if (
        self._implementation is Implementation.PANDAS
        and self._backend_version < (3,)
        and self.dtype
        in {
            dtypes.Int64,
            dtypes.Int32,
            dtypes.Int16,
            dtypes.Int8,
            dtypes.UInt64,
            dtypes.UInt32,
            dtypes.UInt16,
            dtypes.UInt8,
        }
        and (null_mask := native_series.isna()).any()
    ):
        # Workaround for `na_option="keep"` with nullable integer dtypes:
        # rank null and non-null rows as separate groups. Only applied for
        # pandas < 3.
        # https://github.com/pandas-dev/pandas/issues/56976
        #
        # Fixed temporary column names are used instead of names derived from
        # `native_series.name`: an unnamed Series becomes column `0` in
        # `to_frame()`, so selecting the result with `[None]` would raise
        # `KeyError`.
        value_col = "__nw_rank_values__"
        mask_col = "__nw_rank_is_null__"
        ranked_series = (
            native_series.to_frame(name=value_col)
            .assign(**{mask_col: null_mask})
            .groupby(mask_col)
            .rank(**rank_kwargs)[value_col]
        )
        # Restore the caller-visible name (which may be `None`).
        ranked_series.name = native_series.name

    else:
        ranked_series = native_series.rank(**rank_kwargs)

    return self._from_native_series(ranked_series)

@property
def str(self) -> PandasLikeSeriesStringNamespace:
    """Return a `PandasLikeSeriesStringNamespace` wrapping this series."""
    namespace = PandasLikeSeriesStringNamespace(self)
    return namespace
Expand Down
97 changes: 97 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4114,6 +4114,103 @@ def rolling_std(
)
)

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"] = "average",
    *,
    descending: bool = False,
) -> Self:
    """Assign ranks to data, dealing with ties appropriately.

    Notes:
        The resulting dtype may differ between backends.

    Arguments:
        method: The method used to assign ranks to tied elements.
            The following methods are available (default is 'average'):

            - 'average' : The average of the ranks that would have been assigned to
              all the tied values is assigned to each value.
            - 'min' : The minimum of the ranks that would have been assigned to all
              the tied values is assigned to each value. (This is also referred to
              as "competition" ranking.)
            - 'max' : The maximum of the ranks that would have been assigned to all
              the tied values is assigned to each value.
            - 'dense' : Like 'min', but the rank of the next highest element is
              assigned the rank immediately after those assigned to the tied
              elements.
            - 'ordinal' : All values are given a distinct rank, corresponding to the
              order that the values occur in the Series.

        descending: Rank in descending order.

    Returns:
        A new expression with rank data.

    Raises:
        ValueError: If `method` is not one of the supported ranking methods.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoFrameT
        >>>
        >>> data = {"a": [3, 6, 1, 1, 6]}

        We define a dataframe-agnostic function that computes the dense rank for
        the data:

        >>> def agnostic_dense_rank(df_native: IntoFrameT) -> IntoFrameT:
        ...     df = nw.from_native(df_native)
        ...     result = df.with_columns(rnk=nw.col("a").rank(method="dense"))
        ...     return result.to_native()

        We can then pass any supported library such as pandas, Polars, or
        PyArrow to `agnostic_dense_rank`:

        >>> agnostic_dense_rank(pd.DataFrame(data))
           a  rnk
        0  3  2.0
        1  6  3.0
        2  1  1.0
        3  1  1.0
        4  6  3.0

        >>> agnostic_dense_rank(pl.DataFrame(data))
        shape: (5, 2)
        ┌─────┬─────┐
        │ a   ┆ rnk │
        │ --- ┆ --- │
        │ i64 ┆ u32 │
        ╞═════╪═════╡
        │ 3   ┆ 2   │
        │ 6   ┆ 3   │
        │ 1   ┆ 1   │
        │ 1   ┆ 1   │
        │ 6   ┆ 3   │
        └─────┴─────┘

        >>> agnostic_dense_rank(pa.table(data))
        pyarrow.Table
        a: int64
        rnk: uint64
        ----
        a: [[3,6,1,1,6]]
        rnk: [[2,3,1,1,3]]
    """
    known_methods = frozenset({"average", "min", "max", "dense", "ordinal"})
    if method in known_methods:
        # Defer the backend call until a namespace (`plx`) is supplied.
        return self.__class__(
            lambda plx: self._to_compliant_expr(plx).rank(
                method=method, descending=descending
            )
        )

    msg = (
        "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. "
        f"Found '{method}'"
    )
    raise ValueError(msg)

@property
def str(self: Self) -> ExprStringNamespace[Self]:
    """Return an `ExprStringNamespace` wrapping this expression."""
    namespace = ExprStringNamespace(self)
    return namespace
Expand Down
95 changes: 95 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4738,6 +4738,101 @@ def __iter__(self: Self) -> Iterator[Any]:
def __contains__(self: Self, other: Any) -> bool:
return self._compliant_series.__contains__(other) # type: ignore[no-any-return]

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"] = "average",
    *,
    descending: bool = False,
) -> Self:
    """Assign ranks to data, dealing with ties appropriately.

    Notes:
        The resulting dtype may differ between backends.

    Arguments:
        method: The method used to assign ranks to tied elements.
            The following methods are available (default is 'average'):

            - 'average' : The average of the ranks that would have been assigned to
              all the tied values is assigned to each value.
            - 'min' : The minimum of the ranks that would have been assigned to all
              the tied values is assigned to each value. (This is also referred to
              as "competition" ranking.)
            - 'max' : The maximum of the ranks that would have been assigned to all
              the tied values is assigned to each value.
            - 'dense' : Like 'min', but the rank of the next highest element is
              assigned the rank immediately after those assigned to the tied
              elements.
            - 'ordinal' : All values are given a distinct rank, corresponding to the
              order that the values occur in the Series.

        descending: Rank in descending order.

    Returns:
        A new series with rank data as values.

    Raises:
        ValueError: If `method` is not one of the supported ranking methods.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoSeriesT
        >>>
        >>> data = [3, 6, 1, 1, 6]

        We define a dataframe-agnostic function that computes the dense rank for
        the data:

        >>> def agnostic_dense_rank(s_native: IntoSeriesT) -> IntoSeriesT:
        ...     s = nw.from_native(s_native, series_only=True)
        ...     return s.rank(method="dense").to_native()

        We can then pass any supported library such as pandas, Polars, or
        PyArrow to `agnostic_dense_rank`:

        >>> agnostic_dense_rank(pd.Series(data))
        0    2.0
        1    3.0
        2    1.0
        3    1.0
        4    3.0
        dtype: float64

        >>> agnostic_dense_rank(pl.Series(data))  # doctest:+NORMALIZE_WHITESPACE
        shape: (5,)
        Series: '' [u32]
        [
           2
           3
           1
           1
           3
        ]

        >>> agnostic_dense_rank(pa.chunked_array([data]))  # doctest:+ELLIPSIS
        <pyarrow.lib.ChunkedArray object at ...>
        [
          [
            2,
            3,
            1,
            1,
            3
          ]
        ]
    """
    known_methods = frozenset({"average", "min", "max", "dense", "ordinal"})
    if method in known_methods:
        ranked = self._compliant_series.rank(method=method, descending=descending)
        return self._from_compliant_series(ranked)

    msg = (
        "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. "
        f"Found '{method}'"
    )
    raise ValueError(msg)

@property
def str(self: Self) -> SeriesStringNamespace[Self]:
    """Return a `SeriesStringNamespace` wrapping this series."""
    namespace = SeriesStringNamespace(self)
    return namespace
Expand Down
Loading

0 comments on commit 5c0a33a

Please sign in to comment.