Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add more parameters to the fill_null method #1149

Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,15 @@ def sample(
seed=seed,
)

def fill_null(self: Self, value: Any) -> Self:
return reuse_series_implementation(self, "fill_null", value=value)
def fill_null(
    self: Self,
    value: Any | None = None,
    strategy: Literal["forward", "backward"] | None = None,
    limit: int | None = None,
) -> Self:
    """Forward a null-filling request to `reuse_series_implementation`.

    All three arguments are passed through unchanged, as keyword arguments,
    under the method name "fill_null".

    Arguments:
        value: Value used to fill null values.
        strategy: Fill direction, "forward" or "backward".
        limit: Passed through together with `strategy`.
    """
    fill_options = {"value": value, "strategy": strategy, "limit": limit}
    return reuse_series_implementation(self, "fill_null", **fill_options)

def is_duplicated(self: Self) -> Self:
    """Delegate to `reuse_series_implementation` under the name "is_duplicated"."""
    method_name = "is_duplicated"
    return reuse_series_implementation(self, method_name)
Expand Down
47 changes: 45 additions & 2 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,14 +568,57 @@ def sample(

return self._from_native_series(pc.take(ser, mask))

def fill_null(self: Self, value: Any) -> Self:
def fill_null(
self: Self,
value: Any | None = None,
strategy: Literal["forward", "backward"] | None = None,
limit: int | None = None,
) -> Self:
import numpy as np # ignore-banned-import
import pyarrow as pa # ignore-banned-import()
import pyarrow.compute as pc # ignore-banned-import()

def fill_aux(
IsaiasGutierrezCruz marked this conversation as resolved.
Show resolved Hide resolved
arr: pa.Array,
limit: int,
direction: Literal["forward", "backward"] | None = None,
) -> pa.Array:
# this algorithm first finds the indices of the valid values to fill all the null value positions
# then it calculates the distance of each new index and the original index
# if the distance is equal to or less than the limit and the original value is null, it is replaced
valid_mask = pc.is_valid(arr)
indices = pa.array(np.arange(len(arr)), type=pa.int64())
if direction == "forward":
valid_index = np.maximum.accumulate(np.where(valid_mask, indices, -1))
distance = indices - valid_index
else:
valid_index = np.minimum.accumulate(
np.where(valid_mask[::-1], indices[::-1], len(arr))
)[::-1]
distance = valid_index - indices
return pc.if_else(
pc.and_(
pc.is_null(arr),
pc.less_equal(distance, pa.scalar(limit)),
),
arr.take(valid_index),
arr,
)

ser = self._native_series
dtype = ser.type

return self._from_native_series(pc.fill_null(ser, pa.scalar(value, dtype)))
if value is not None:
res_ser = self._from_native_series(pc.fill_null(ser, pa.scalar(value, dtype)))
elif limit is None:
fill_func = (
pc.fill_null_forward if strategy == "forward" else pc.fill_null_backward
)
res_ser = self._from_native_series(fill_func(ser))
else:
res_ser = self._from_native_series(fill_aux(ser, limit, strategy))

return res_ser

def to_frame(self: Self) -> ArrowDataFrame:
import pyarrow as pa # ignore-banned-import()
Expand Down
27 changes: 25 additions & 2 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,11 +505,34 @@ def any(self) -> Self:
returns_scalar=True,
)

def fill_null(self, value: Any) -> DaskExpr:
def fill_null(
self: Self,
value: Any | None = None,
strategy: Literal["forward", "backward"] | None = None,
limit: int | None = None,
) -> DaskExpr:
def func(
    _input: dask_expr.Series,
    value: Any | None,
    strategy: str | None,
    limit: int | None,
) -> dask_expr.Series:
    """Fill missing entries of `_input` with a value or by propagation.

    Arguments:
        _input: Series whose missing entries are filled.
        value: Fill value; when it is not None it is used and the other
            two arguments are ignored.
        strategy: "forward" selects `ffill`; anything else selects `bfill`.
        limit: Forwarded as `limit=` to `ffill` / `bfill`.
    """
    # An explicit fill value takes precedence over any strategy.
    if value is not None:
        return _input.fillna(value)
    if strategy == "forward":
        return _input.ffill(limit=limit)
    # Every non-"forward" strategy (including None) ends up as a backward fill.
    return _input.bfill(limit=limit)

return self._from_call(
lambda _input, _val: _input.fillna(_val),
func,
"fillna",
value,
strategy,
limit,
returns_scalar=False,
)

Expand Down
11 changes: 9 additions & 2 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,15 @@ def is_between(
def is_null(self) -> Self:
    """Delegate to `reuse_series_implementation` under the name "is_null"."""
    method_name = "is_null"
    return reuse_series_implementation(self, method_name)

def fill_null(self, value: Any) -> Self:
return reuse_series_implementation(self, "fill_null", value=value)
def fill_null(
    self,
    value: Any | None = None,
    strategy: Literal["forward", "backward"] | None = None,
    limit: int | None = None,
) -> Self:
    """Hand the null-filling arguments to `reuse_series_implementation`.

    The call uses the method name "fill_null" and passes `value`,
    `strategy` and `limit` through unchanged as keyword arguments.

    Arguments:
        value: Value used to fill null values.
        strategy: Fill direction, "forward" or "backward".
        limit: Passed through together with `strategy`.
    """
    forwarded = dict(value=value, strategy=strategy, limit=limit)
    return reuse_series_implementation(self, "fill_null", **forwarded)

def is_in(self, other: Any) -> Self:
    """Delegate to `reuse_series_implementation` as "is_in", passing `other` by keyword."""
    membership_args = {"other": other}
    return reuse_series_implementation(self, "is_in", **membership_args)
Expand Down
18 changes: 16 additions & 2 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,9 +440,23 @@ def is_null(self) -> PandasLikeSeries:
ser = self._native_series
return self._from_native_series(ser.isna())

def fill_null(self, value: Any) -> PandasLikeSeries:
def fill_null(
self,
value: Any | None = None,
strategy: Literal["forward", "backward"] | None = None,
limit: int | None = None,
IsaiasGutierrezCruz marked this conversation as resolved.
Show resolved Hide resolved
) -> PandasLikeSeries:
ser = self._native_series
return self._from_native_series(ser.fillna(value))
if value is not None:
res_ser = self._from_native_series(ser.fillna(value=value))
else:
res_ser = self._from_native_series(
ser.ffill(limit=limit)
if strategy == "forward"
else ser.bfill(limit=limit)
)

return res_ser

def drop_nulls(self) -> PandasLikeSeries:
ser = self._native_series
Expand Down
102 changes: 92 additions & 10 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1337,10 +1337,22 @@ def arg_true(self) -> Self:
"""
return self.__class__(lambda plx: self._call(plx).arg_true())

def fill_null(self, value: Any) -> Self:
def fill_null(
self,
value: Any | None = None,
strategy: Literal["forward", "backward"] | None = None,
limit: int | None = None,
) -> Self:
IsaiasGutierrezCruz marked this conversation as resolved.
Show resolved Hide resolved
"""
Fill null values with given value.

Arguments:
value: Value used to fill null values.

strategy: Strategy used to fill null values.

limit: Number of consecutive null values to fill when using the 'forward' or 'backward' strategy.

Notes:
pandas and Polars handle null values differently. Polars distinguishes
between NaN and Null, whereas pandas doesn't.
Expand All @@ -1351,13 +1363,22 @@ def fill_null(self, value: Any) -> Self:
>>> import pyarrow as pa
>>> import narwhals as nw
>>> df_pd = pd.DataFrame(
... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
... {
... "a": [2, 4, None, None, 3, 5],
... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0],
... }
... )
>>> df_pl = pl.DataFrame(
... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
... {
... "a": [2, 4, None, None, 3, 5],
... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0],
... }
... )
>>> df_pa = pa.table(
... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
... {
... "a": [2, 4, None, None, 3, 5],
... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0],
... }
... )

Let's define a dataframe-agnostic function:
Expand All @@ -1373,11 +1394,12 @@ def fill_null(self, value: Any) -> Self:
0 2.0 2.0
1 4.0 4.0
2 0.0 0.0
3 3.0 3.0
4 5.0 5.0
3 0.0 0.0
4 3.0 3.0
5 5.0 5.0

>>> func(df_pl) # nan != null for polars
shape: (5, 2)
shape: (6, 2)
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
Expand All @@ -1386,6 +1408,7 @@ def fill_null(self, value: Any) -> Self:
│ 2   ┆ 2.0 │
│ 4   ┆ 4.0 │
│ 0   ┆ NaN │
│ 0   ┆ NaN │
│ 3   ┆ 3.0 │
│ 5   ┆ 5.0 │
└─────┴─────┘
Expand All @@ -1395,10 +1418,69 @@ def fill_null(self, value: Any) -> Self:
a: int64
b: double
----
a: [[2,4,0,3,5]]
b: [[2,4,nan,3,5]]
a: [[2,4,0,0,3,5]]
b: [[2,4,nan,nan,3,5]]

Using a strategy:

>>> @nw.narwhalify
... def func_strategies(df):
... return df.with_columns(
... nw.col("a", "b")
... .fill_null(strategy="forward", limit=1)
... .name.suffix("_filled")
... )

>>> func_strategies(df_pd)
a b a_filled b_filled
0 2.0 2.0 2.0 2.0
1 4.0 4.0 4.0 4.0
2 NaN NaN 4.0 4.0
3 NaN NaN NaN NaN
4 3.0 3.0 3.0 3.0
5 5.0 5.0 5.0 5.0

>>> func_strategies(df_pl) # nan != null for polars
shape: (6, 4)
┌──────┬─────┬──────────┬──────────┐
│ a    ┆ b   ┆ a_filled ┆ b_filled │
│ ---  ┆ --- ┆ ---      ┆ ---      │
│ i64  ┆ f64 ┆ i64      ┆ f64      │
╞══════╪═════╪══════════╪══════════╡
│ 2    ┆ 2.0 ┆ 2        ┆ 2.0      │
│ 4    ┆ 4.0 ┆ 4        ┆ 4.0      │
│ null ┆ NaN ┆ 4        ┆ NaN      │
│ null ┆ NaN ┆ null     ┆ NaN      │
│ 3    ┆ 3.0 ┆ 3        ┆ 3.0      │
│ 5    ┆ 5.0 ┆ 5        ┆ 5.0      │
└──────┴─────┴──────────┴──────────┘

>>> func_strategies(df_pa) # nan != null for pyarrow
pyarrow.Table
a: int64
b: double
a_filled: int64
b_filled: double
----
a: [[2,4,null,null,3,5]]
b: [[2,4,nan,nan,3,5]]
a_filled: [[2,4,4,null,3,5]]
b_filled: [[2,4,nan,nan,3,5]]
"""
return self.__class__(lambda plx: self._call(plx).fill_null(value))
if value is not None and strategy is not None:
msg = "cannot specify both `value` and `strategy`"
raise ValueError(msg)
if value is None and strategy is None:
msg = "must specify either a fill `value` or `strategy`"
raise ValueError(msg)
if strategy is not None and strategy not in {"forward", "backward"}:
msg = f"strategy not supported: {strategy}"
raise ValueError(msg)
return self.__class__(
lambda plx: self._call(plx).fill_null(
value=value, strategy=strategy, limit=limit
)
)

# --- partial reduction ---
def drop_nulls(self) -> Self:
Expand Down
45 changes: 43 additions & 2 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1442,13 +1442,22 @@ def is_null(self) -> Self:
"""
return self._from_compliant_series(self._compliant_series.is_null())

def fill_null(self, value: Any) -> Self:
def fill_null(
self,
value: Any | None = None,
strategy: Literal["forward", "backward"] | None = None,
limit: int | None = None,
) -> Self:
"""
Fill null values using the specified value.

Arguments:
value: Value used to fill null values.

strategy: Strategy used to fill null values.

limit: Number of consecutive null values to fill when using the 'forward' or 'backward' strategy.

Notes:
pandas and Polars handle null values differently. Polars distinguishes
between NaN and Null, whereas pandas doesn't.
Expand Down Expand Up @@ -1482,8 +1491,40 @@ def fill_null(self, value: Any) -> Self:
2
5
]

Using a strategy:

>>> @nw.narwhalify
... def func_strategies(s):
... return s.fill_null(strategy="forward", limit=1)

>>> func_strategies(s_pd)
0 1.0
1 2.0
2 2.0
dtype: float64

>>> func_strategies(s_pl) # doctest: +NORMALIZE_WHITESPACE
shape: (3,)
Series: '' [i64]
[
1
2
2
]
"""
return self._from_compliant_series(self._compliant_series.fill_null(value))
if value is not None and strategy is not None:
msg = "cannot specify both `value` and `strategy`"
raise ValueError(msg)
if value is None and strategy is None:
msg = "must specify either a fill `value` or `strategy`"
raise ValueError(msg)
if strategy is not None and strategy not in {"forward", "backward"}:
msg = f"strategy not supported: {strategy}"
raise ValueError(msg)
return self._from_compliant_series(
self._compliant_series.fill_null(value=value, strategy=strategy, limit=limit)
)

def is_between(
self, lower_bound: Any, upper_bound: Any, closed: str = "both"
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ filterwarnings = [
'ignore:.*You are using pyarrow version',
'ignore:.*but when imported by',
'ignore:Distributing .*This may take some time',
'ignore:.*The default coalesce behavior'
'ignore:.*The default coalesce behavior',
]
xfail_strict = true
markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]
Expand Down
Loading
Loading