Skip to content

Commit

Permalink
feat: add more parameters to the fill_null method (#1149)
Browse files Browse the repository at this point in the history
  • Loading branch information
IsaiasGutierrezCruz authored Nov 12, 2024
1 parent b9d5fe5 commit c694148
Show file tree
Hide file tree
Showing 9 changed files with 542 additions and 40 deletions.
11 changes: 9 additions & 2 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,8 +308,15 @@ def sample(
seed=seed,
)

def fill_null(
    self: Self,
    value: Any | None = None,
    strategy: Literal["forward", "backward"] | None = None,
    limit: int | None = None,
) -> Self:
    """Fill null values by delegating to the series-level `fill_null`.

    Arguments:
        value: Value used to fill null values.
        strategy: Strategy used to fill null values ("forward" or "backward").
        limit: Number of consecutive null values to fill when using the
            'forward' or 'backward' strategy.

    Returns:
        The result of `reuse_series_implementation` for the "fill_null" call.
    """
    # All three options are forwarded by keyword so the series-level method
    # receives them unchanged.
    return reuse_series_implementation(
        self, "fill_null", value=value, strategy=strategy, limit=limit
    )

def is_duplicated(self: Self) -> Self:
    """Delegate to the series-level `is_duplicated` implementation."""
    method_name = "is_duplicated"
    return reuse_series_implementation(self, method_name)
Expand Down
47 changes: 45 additions & 2 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,14 +580,57 @@ def sample(

return self._from_native_series(pc.take(ser, mask))

def fill_null(
    self: Self,
    value: Any | None = None,
    strategy: Literal["forward", "backward"] | None = None,
    limit: int | None = None,
) -> Self:
    """Fill null values with a constant or by propagating neighbouring values.

    Arguments:
        value: Value used to fill null values. When not None it takes
            precedence and `strategy`/`limit` are ignored.
        strategy: Strategy used to fill null values ("forward" or "backward").
            Any value other than "forward" is handled as a backward fill.
        limit: Number of consecutive null values to fill when using the
            'forward' or 'backward' strategy. None means no limit.

    Returns:
        A new series wrapping the filled native series.
    """
    import numpy as np  # ignore-banned-import
    import pyarrow as pa  # ignore-banned-import()
    import pyarrow.compute as pc  # ignore-banned-import()

    def fill_aux(
        arr: pa.Array,
        limit: int,
        direction: Literal["forward", "backward"] | None = None,
    ) -> pa.Array:
        # This algorithm first finds, for every position, the index of the
        # valid value that would fill it. It then computes the distance between
        # that index and the position itself. A null is replaced only when the
        # distance is less than or equal to the limit.
        valid_mask = pc.is_valid(arr)
        indices = pa.array(np.arange(len(arr)), type=pa.int64())
        if direction == "forward":
            # -1 marks positions that have no valid value before them.
            valid_index = np.maximum.accumulate(np.where(valid_mask, indices, -1))
            distance = indices - valid_index
        else:
            # len(arr) marks positions that have no valid value after them.
            valid_index = np.minimum.accumulate(
                np.where(valid_mask[::-1], indices[::-1], len(arr))
            )[::-1]
            distance = valid_index - indices
        # The sentinels (-1 / len(arr)) are out of bounds for `take`. Clamp them
        # into range: a sentinel only occurs in a leading (forward) or trailing
        # (backward) run of nulls, so the clamped index points at a null and the
        # position stays null, as intended.
        gather_index = np.clip(valid_index, 0, max(len(arr) - 1, 0))
        return pc.if_else(
            pc.and_(
                pc.is_null(arr),
                pc.less_equal(distance, pa.scalar(limit)),
            ),
            arr.take(gather_index),
            arr,
        )

    ser = self._native_series
    dtype = ser.type

    if value is not None:
        # Build the scalar with the series dtype so the fill keeps the type.
        return self._from_native_series(pc.fill_null(ser, pa.scalar(value, dtype)))
    if limit is None:
        # Unlimited propagation maps directly onto the pyarrow kernels.
        fill_func = (
            pc.fill_null_forward if strategy == "forward" else pc.fill_null_backward
        )
        return self._from_native_series(fill_func(ser))
    return self._from_native_series(fill_aux(ser, limit, strategy))

def to_frame(self: Self) -> ArrowDataFrame:
import pyarrow as pa # ignore-banned-import()
Expand Down
27 changes: 25 additions & 2 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,11 +533,34 @@ def any(self) -> Self:
returns_scalar=True,
)

def fill_null(
    self: Self,
    value: Any | None = None,
    strategy: Literal["forward", "backward"] | None = None,
    limit: int | None = None,
) -> DaskExpr:
    """Fill null values with a constant or with a forward/backward fill.

    Arguments:
        value: Value used to fill null values. When not None it takes
            precedence over `strategy`.
        strategy: Strategy used to fill null values ("forward" or "backward").
            Any value other than "forward" is handled as a backward fill.
        limit: Number of consecutive null values to fill when using the
            'forward' or 'backward' strategy.

    Returns:
        The expression produced by `self._from_call` for the fill operation.
    """

    def func(
        _input: dask_expr.Series,
        value: Any | None,
        strategy: str | None,
        limit: int | None,
    ) -> dask_expr.Series:
        # A constant fill wins; otherwise choose the propagation direction.
        if value is not None:
            return _input.fillna(value)
        if strategy == "forward":
            return _input.ffill(limit=limit)
        return _input.bfill(limit=limit)

    return self._from_call(
        func,
        "fillna",
        value,
        strategy,
        limit,
        returns_scalar=False,
    )

Expand Down
11 changes: 9 additions & 2 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,15 @@ def is_between(
def is_null(self) -> Self:
    """Delegate to the series-level `is_null` implementation."""
    method_name = "is_null"
    return reuse_series_implementation(self, method_name)

def fill_null(
    self,
    value: Any | None = None,
    strategy: Literal["forward", "backward"] | None = None,
    limit: int | None = None,
) -> Self:
    """Fill null values by delegating to the series-level `fill_null`.

    Arguments:
        value: Value used to fill null values.
        strategy: Strategy used to fill null values ("forward" or "backward").
        limit: Number of consecutive null values to fill when using the
            'forward' or 'backward' strategy.

    Returns:
        The result of `reuse_series_implementation` for the "fill_null" call.
    """
    # All three options are forwarded by keyword so the series-level method
    # receives them unchanged.
    return reuse_series_implementation(
        self, "fill_null", value=value, strategy=strategy, limit=limit
    )

def is_in(self, other: Any) -> Self:
    """Delegate to the series-level `is_in` implementation, forwarding `other`."""
    method_name = "is_in"
    return reuse_series_implementation(self, method_name, other=other)
Expand Down
18 changes: 16 additions & 2 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,9 +451,23 @@ def is_null(self) -> PandasLikeSeries:
ser = self._native_series
return self._from_native_series(ser.isna())

def fill_null(
    self,
    value: Any | None = None,
    strategy: Literal["forward", "backward"] | None = None,
    limit: int | None = None,
) -> PandasLikeSeries:
    """Fill null values with a constant or with a forward/backward fill.

    Arguments:
        value: Value used to fill null values. When not None it takes
            precedence over `strategy`.
        strategy: Strategy used to fill null values ("forward" or "backward").
            Any value other than "forward" is handled as a backward fill.
        limit: Number of consecutive null values to fill when using the
            'forward' or 'backward' strategy.

    Returns:
        A new series wrapping the filled native series.
    """
    ser = self._native_series
    if value is not None:
        filled = ser.fillna(value=value)
    elif strategy == "forward":
        filled = ser.ffill(limit=limit)
    else:
        filled = ser.bfill(limit=limit)
    return self._from_native_series(filled)

def drop_nulls(self) -> PandasLikeSeries:
ser = self._native_series
Expand Down
102 changes: 92 additions & 10 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1541,10 +1541,22 @@ def arg_true(self) -> Self:
"""
return self.__class__(lambda plx: self._call(plx).arg_true())

def fill_null(self, value: Any) -> Self:
def fill_null(
self,
value: Any | None = None,
strategy: Literal["forward", "backward"] | None = None,
limit: int | None = None,
) -> Self:
"""
Fill null values with given value.
Arguments:
value: Value used to fill null values.
strategy: Strategy used to fill null values.
limit: Number of consecutive null values to fill when using the 'forward' or 'backward' strategy.
Notes:
pandas and Polars handle null values differently. Polars distinguishes
between NaN and Null, whereas pandas doesn't.
Expand All @@ -1555,13 +1567,22 @@ def fill_null(self, value: Any) -> Self:
>>> import pyarrow as pa
>>> import narwhals as nw
>>> df_pd = pd.DataFrame(
... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
... {
... "a": [2, 4, None, None, 3, 5],
... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0],
... }
... )
>>> df_pl = pl.DataFrame(
... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
... {
... "a": [2, 4, None, None, 3, 5],
... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0],
... }
... )
>>> df_pa = pa.table(
... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
... {
... "a": [2, 4, None, None, 3, 5],
... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0],
... }
... )
Let's define a dataframe-agnostic function:
Expand All @@ -1577,11 +1598,12 @@ def fill_null(self, value: Any) -> Self:
0 2.0 2.0
1 4.0 4.0
2 0.0 0.0
3 3.0 3.0
4 5.0 5.0
3 0.0 0.0
4 3.0 3.0
5 5.0 5.0
>>> func(df_pl) # nan != null for polars
shape: (5, 2)
shape: (6, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
Expand All @@ -1590,6 +1612,7 @@ def fill_null(self, value: Any) -> Self:
│ 2 ┆ 2.0 │
│ 4 ┆ 4.0 │
│ 0 ┆ NaN │
│ 0 ┆ NaN │
│ 3 ┆ 3.0 │
│ 5 ┆ 5.0 │
└─────┴─────┘
Expand All @@ -1599,10 +1622,69 @@ def fill_null(self, value: Any) -> Self:
a: int64
b: double
----
a: [[2,4,0,3,5]]
b: [[2,4,nan,3,5]]
a: [[2,4,0,0,3,5]]
b: [[2,4,nan,nan,3,5]]
Using a strategy:
>>> @nw.narwhalify
... def func_strategies(df):
... return df.with_columns(
... nw.col("a", "b")
... .fill_null(strategy="forward", limit=1)
... .name.suffix("_filled")
... )
>>> func_strategies(df_pd)
a b a_filled b_filled
0 2.0 2.0 2.0 2.0
1 4.0 4.0 4.0 4.0
2 NaN NaN 4.0 4.0
3 NaN NaN NaN NaN
4 3.0 3.0 3.0 3.0
5 5.0 5.0 5.0 5.0
>>> func_strategies(df_pl) # nan != null for polars
shape: (6, 4)
┌──────┬─────┬──────────┬──────────┐
│ a ┆ b ┆ a_filled ┆ b_filled │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ i64 ┆ f64 │
╞══════╪═════╪══════════╪══════════╡
│ 2 ┆ 2.0 ┆ 2 ┆ 2.0 │
│ 4 ┆ 4.0 ┆ 4 ┆ 4.0 │
│ null ┆ NaN ┆ 4 ┆ NaN │
│ null ┆ NaN ┆ null ┆ NaN │
│ 3 ┆ 3.0 ┆ 3 ┆ 3.0 │
│ 5 ┆ 5.0 ┆ 5 ┆ 5.0 │
└──────┴─────┴──────────┴──────────┘
>>> func_strategies(df_pa) # nan != null for pyarrow
pyarrow.Table
a: int64
b: double
a_filled: int64
b_filled: double
----
a: [[2,4,null,null,3,5]]
b: [[2,4,nan,nan,3,5]]
a_filled: [[2,4,4,null,3,5]]
b_filled: [[2,4,nan,nan,3,5]]
"""
return self.__class__(lambda plx: self._call(plx).fill_null(value))
if value is not None and strategy is not None:
msg = "cannot specify both `value` and `strategy`"
raise ValueError(msg)
if value is None and strategy is None:
msg = "must specify either a fill `value` or `strategy`"
raise ValueError(msg)
if strategy is not None and strategy not in {"forward", "backward"}:
msg = f"strategy not supported: {strategy}"
raise ValueError(msg)
return self.__class__(
lambda plx: self._call(plx).fill_null(
value=value, strategy=strategy, limit=limit
)
)

# --- partial reduction ---
def drop_nulls(self) -> Self:
Expand Down
45 changes: 43 additions & 2 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1565,13 +1565,22 @@ def is_null(self) -> Self:
"""
return self._from_compliant_series(self._compliant_series.is_null())

def fill_null(self, value: Any) -> Self:
def fill_null(
self,
value: Any | None = None,
strategy: Literal["forward", "backward"] | None = None,
limit: int | None = None,
) -> Self:
"""
Fill null values using the specified value.
Arguments:
value: Value used to fill null values.
strategy: Strategy used to fill null values.
limit: Number of consecutive null values to fill when using the 'forward' or 'backward' strategy.
Notes:
pandas and Polars handle null values differently. Polars distinguishes
between NaN and Null, whereas pandas doesn't.
Expand Down Expand Up @@ -1605,8 +1614,40 @@ def fill_null(self, value: Any) -> Self:
2
5
]
Using a strategy:
>>> @nw.narwhalify
... def func_strategies(s):
... return s.fill_null(strategy="forward", limit=1)
>>> func_strategies(s_pd)
0 1.0
1 2.0
2 2.0
dtype: float64
>>> func_strategies(s_pl) # doctest: +NORMALIZE_WHITESPACE
shape: (3,)
Series: '' [i64]
[
1
2
2
]
"""
return self._from_compliant_series(self._compliant_series.fill_null(value))
if value is not None and strategy is not None:
msg = "cannot specify both `value` and `strategy`"
raise ValueError(msg)
if value is None and strategy is None:
msg = "must specify either a fill `value` or `strategy`"
raise ValueError(msg)
if strategy is not None and strategy not in {"forward", "backward"}:
msg = f"strategy not supported: {strategy}"
raise ValueError(msg)
return self._from_compliant_series(
self._compliant_series.fill_null(value=value, strategy=strategy, limit=limit)
)

def is_between(
self, lower_bound: Any, upper_bound: Any, closed: str = "both"
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ filterwarnings = [
'ignore:.*You are using pyarrow version',
'ignore:.*but when imported by',
'ignore:Distributing .*This may take some time',
'ignore:.*The default coalesce behavior'
'ignore:.*The default coalesce behavior',
]
xfail_strict = true
markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]
Expand Down
Loading

0 comments on commit c694148

Please sign in to comment.