feat: add more parameters to the fill_null method (#1149)

narwhals-dev · Nov 12, 2024 · c694148 · c694148
1 parent b9d5fe5
commit c694148
Show file tree

Hide file tree

Showing 9 changed files with 542 additions and 40 deletions.
diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
@@ -308,8 +308,15 @@ def sample(
             seed=seed,
         )
 
-    def fill_null(self: Self, value: Any) -> Self:
-        return reuse_series_implementation(self, "fill_null", value=value)
+    def fill_null(
+        self: Self,
+        value: Any | None = None,
+        strategy: Literal["forward", "backward"] | None = None,
+        limit: int | None = None,
+    ) -> Self:
+        return reuse_series_implementation(
+            self, "fill_null", value=value, strategy=strategy, limit=limit
+        )
 
     def is_duplicated(self: Self) -> Self:
         return reuse_series_implementation(self, "is_duplicated")

diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
@@ -580,14 +580,57 @@ def sample(
 
         return self._from_native_series(pc.take(ser, mask))
 
-    def fill_null(self: Self, value: Any) -> Self:
+    def fill_null(
+        self: Self,
+        value: Any | None = None,
+        strategy: Literal["forward", "backward"] | None = None,
+        limit: int | None = None,
+    ) -> Self:
+        import numpy as np  # ignore-banned-import
         import pyarrow as pa  # ignore-banned-import()
         import pyarrow.compute as pc  # ignore-banned-import()
 
+        def fill_aux(
+            arr: pa.Array,
+            limit: int,
+            direction: Literal["forward", "backward"] | None = None,
+        ) -> pa.Array:
+            # for each position, find the index of the nearest valid value at or before it ("forward") or at or after it (otherwise)
+            # positions with no such value get a sentinel (-1 or len(arr)); it is clipped into range for `take`, where it lands on a null
+            # a null is replaced by the value at that index only when the two indices are at most `limit` apart
+            valid_mask = pc.is_valid(arr)
+            indices = pa.array(np.arange(len(arr)), type=pa.int64())
+            if direction == "forward":
+                valid_index = np.maximum.accumulate(np.where(valid_mask, indices, -1))
+                distance = indices - valid_index
+            else:
+                valid_index = np.minimum.accumulate(
+                    np.where(valid_mask[::-1], indices[::-1], len(arr))
+                )[::-1]
+                distance = valid_index - indices
+            return pc.if_else(
+                pc.and_(
+                    pc.is_null(arr),
+                    pc.less_equal(distance, pa.scalar(limit)),
+                ),
+                arr.take(np.clip(valid_index, 0, max(len(arr) - 1, 0))),
+                arr,
+            )
+
         ser = self._native_series
         dtype = ser.type
 
-        return self._from_native_series(pc.fill_null(ser, pa.scalar(value, dtype)))
+        if value is not None:
+            res_ser = self._from_native_series(pc.fill_null(ser, pa.scalar(value, dtype)))
+        elif limit is None:
+            fill_func = (
+                pc.fill_null_forward if strategy == "forward" else pc.fill_null_backward
+            )
+            res_ser = self._from_native_series(fill_func(ser))
+        else:
+            res_ser = self._from_native_series(fill_aux(ser, limit, strategy))
+
+        return res_ser
 
     def to_frame(self: Self) -> ArrowDataFrame:
         import pyarrow as pa  # ignore-banned-import()

diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py
@@ -533,11 +533,34 @@ def any(self) -> Self:
             returns_scalar=True,
         )
 
-    def fill_null(self, value: Any) -> DaskExpr:
+    def fill_null(
+        self: Self,
+        value: Any | None = None,
+        strategy: Literal["forward", "backward"] | None = None,
+        limit: int | None = None,
+    ) -> DaskExpr:
+        def func(
+            _input: dask_expr.Series,
+            value: Any | None,
+            strategy: str | None,
+            limit: int | None,
+        ) -> dask_expr.Series:
+            if value is not None:
+                res_ser = _input.fillna(value)
+            else:
+                res_ser = (
+                    _input.ffill(limit=limit)
+                    if strategy == "forward"
+                    else _input.bfill(limit=limit)
+                )
+            return res_ser
+
         return self._from_call(
-            lambda _input, _val: _input.fillna(_val),
+            func,
             "fillna",
             value,
+            strategy,
+            limit,
             returns_scalar=False,
         )
 

diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py
@@ -261,8 +261,15 @@ def is_between(
     def is_null(self) -> Self:
         return reuse_series_implementation(self, "is_null")
 
-    def fill_null(self, value: Any) -> Self:
-        return reuse_series_implementation(self, "fill_null", value=value)
+    def fill_null(
+        self,
+        value: Any | None = None,
+        strategy: Literal["forward", "backward"] | None = None,
+        limit: int | None = None,
+    ) -> Self:
+        return reuse_series_implementation(
+            self, "fill_null", value=value, strategy=strategy, limit=limit
+        )
 
     def is_in(self, other: Any) -> Self:
         return reuse_series_implementation(self, "is_in", other=other)

diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
@@ -451,9 +451,23 @@ def is_null(self) -> PandasLikeSeries:
         ser = self._native_series
         return self._from_native_series(ser.isna())
 
-    def fill_null(self, value: Any) -> PandasLikeSeries:
+    def fill_null(
+        self,
+        value: Any | None = None,
+        strategy: Literal["forward", "backward"] | None = None,
+        limit: int | None = None,
+    ) -> PandasLikeSeries:
         ser = self._native_series
-        return self._from_native_series(ser.fillna(value))
+        if value is not None:
+            res_ser = self._from_native_series(ser.fillna(value=value))
+        else:
+            res_ser = self._from_native_series(
+                ser.ffill(limit=limit)
+                if strategy == "forward"
+                else ser.bfill(limit=limit)
+            )
+
+        return res_ser
 
     def drop_nulls(self) -> PandasLikeSeries:
         ser = self._native_series

diff --git a/narwhals/expr.py b/narwhals/expr.py
@@ -1541,10 +1541,22 @@ def arg_true(self) -> Self:
         """
         return self.__class__(lambda plx: self._call(plx).arg_true())
 
-    def fill_null(self, value: Any) -> Self:
+    def fill_null(
+        self,
+        value: Any | None = None,
+        strategy: Literal["forward", "backward"] | None = None,
+        limit: int | None = None,
+    ) -> Self:
         """
         Fill null values with given value.
 
+        Arguments:
+            value: Value used to fill null values.
+
+            strategy: Strategy used to fill null values, either 'forward' or 'backward'. Cannot be combined with `value`.
+
+            limit: Maximum number of consecutive null values to fill when using the 'forward' or 'backward' strategy. `None` (default) fills all of them.
+
         Notes:
             pandas and Polars handle null values differently. Polars distinguishes
             between NaN and Null, whereas pandas doesn't.
@@ -1555,13 +1567,22 @@ def fill_null(self, value: Any) -> Self:
             >>> import pyarrow as pa
             >>> import narwhals as nw
             >>> df_pd = pd.DataFrame(
-            ...     {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
+            ...     {
+            ...         "a": [2, 4, None, None, 3, 5],
+            ...         "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0],
+            ...     }
             ... )
             >>> df_pl = pl.DataFrame(
-            ...     {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
+            ...     {
+            ...         "a": [2, 4, None, None, 3, 5],
+            ...         "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0],
+            ...     }
             ... )
             >>> df_pa = pa.table(
-            ...     {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
+            ...     {
+            ...         "a": [2, 4, None, None, 3, 5],
+            ...         "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0],
+            ...     }
             ... )
 
             Let's define a dataframe-agnostic function:
@@ -1577,11 +1598,12 @@ def fill_null(self, value: Any) -> Self:
             0  2.0  2.0
             1  4.0  4.0
             2  0.0  0.0
-            3  3.0  3.0
-            4  5.0  5.0
+            3  0.0  0.0
+            4  3.0  3.0
+            5  5.0  5.0
 
             >>> func(df_pl)  # nan != null for polars
-            shape: (5, 2)
+            shape: (6, 2)
             ┌─────┬─────┐
             │ a   ┆ b   │
             │ --- ┆ --- │
@@ -1590,6 +1612,7 @@ def fill_null(self, value: Any) -> Self:
             │ 2   ┆ 2.0 │
             │ 4   ┆ 4.0 │
             │ 0   ┆ NaN │
+            │ 0   ┆ NaN │
             │ 3   ┆ 3.0 │
             │ 5   ┆ 5.0 │
             └─────┴─────┘
@@ -1599,10 +1622,69 @@ def fill_null(self, value: Any) -> Self:
             a: int64
             b: double
             ----
-            a: [[2,4,0,3,5]]
-            b: [[2,4,nan,3,5]]
+            a: [[2,4,0,0,3,5]]
+            b: [[2,4,nan,nan,3,5]]
+
+            Using a strategy:
+
+            >>> @nw.narwhalify
+            ... def func_strategies(df):
+            ...     return df.with_columns(
+            ...         nw.col("a", "b")
+            ...         .fill_null(strategy="forward", limit=1)
+            ...         .name.suffix("_filled")
+            ...     )
+
+            >>> func_strategies(df_pd)
+                 a    b  a_filled  b_filled
+            0  2.0  2.0       2.0       2.0
+            1  4.0  4.0       4.0       4.0
+            2  NaN  NaN       4.0       4.0
+            3  NaN  NaN       NaN       NaN
+            4  3.0  3.0       3.0       3.0
+            5  5.0  5.0       5.0       5.0
+
+            >>> func_strategies(df_pl)  # nan != null for polars
+            shape: (6, 4)
+            ┌──────┬─────┬──────────┬──────────┐
+            │ a    ┆ b   ┆ a_filled ┆ b_filled │
+            │ ---  ┆ --- ┆ ---      ┆ ---      │
+            │ i64  ┆ f64 ┆ i64      ┆ f64      │
+            ╞══════╪═════╪══════════╪══════════╡
+            │ 2    ┆ 2.0 ┆ 2        ┆ 2.0      │
+            │ 4    ┆ 4.0 ┆ 4        ┆ 4.0      │
+            │ null ┆ NaN ┆ 4        ┆ NaN      │
+            │ null ┆ NaN ┆ null     ┆ NaN      │
+            │ 3    ┆ 3.0 ┆ 3        ┆ 3.0      │
+            │ 5    ┆ 5.0 ┆ 5        ┆ 5.0      │
+            └──────┴─────┴──────────┴──────────┘
+
+            >>> func_strategies(df_pa)  # nan != null for pyarrow
+            pyarrow.Table
+            a: int64
+            b: double
+            a_filled: int64
+            b_filled: double
+            ----
+            a: [[2,4,null,null,3,5]]
+            b: [[2,4,nan,nan,3,5]]
+            a_filled: [[2,4,4,null,3,5]]
+            b_filled: [[2,4,nan,nan,3,5]]
         """
-        return self.__class__(lambda plx: self._call(plx).fill_null(value))
+        if value is not None and strategy is not None:
+            msg = "cannot specify both `value` and `strategy`"
+            raise ValueError(msg)
+        if value is None and strategy is None:
+            msg = "must specify either a fill `value` or `strategy`"
+            raise ValueError(msg)
+        if strategy is not None and strategy not in {"forward", "backward"}:
+            msg = f"strategy not supported: {strategy}"
+            raise ValueError(msg)
+        return self.__class__(
+            lambda plx: self._call(plx).fill_null(
+                value=value, strategy=strategy, limit=limit
+            )
+        )
 
     # --- partial reduction ---
     def drop_nulls(self) -> Self:

diff --git a/narwhals/series.py b/narwhals/series.py
@@ -1565,13 +1565,22 @@ def is_null(self) -> Self:
         """
         return self._from_compliant_series(self._compliant_series.is_null())
 
-    def fill_null(self, value: Any) -> Self:
+    def fill_null(
+        self,
+        value: Any | None = None,
+        strategy: Literal["forward", "backward"] | None = None,
+        limit: int | None = None,
+    ) -> Self:
         """
         Fill null values using the specified value.
 
         Arguments:
             value: Value used to fill null values.
 
+            strategy: Strategy used to fill null values, either 'forward' or 'backward'. Cannot be combined with `value`.
+
+            limit: Maximum number of consecutive null values to fill when using the 'forward' or 'backward' strategy. `None` (default) fills all of them.
+
         Notes:
             pandas and Polars handle null values differently. Polars distinguishes
             between NaN and Null, whereas pandas doesn't.
@@ -1605,8 +1614,40 @@ def fill_null(self, value: Any) -> Self:
                2
                5
             ]
+
+            Using a strategy:
+
+            >>> @nw.narwhalify
+            ... def func_strategies(s):
+            ...     return s.fill_null(strategy="forward", limit=1)
+
+            >>> func_strategies(s_pd)
+            0    1.0
+            1    2.0
+            2    2.0
+            dtype: float64
+
+            >>> func_strategies(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (3,)
+            Series: '' [i64]
+            [
+               1
+               2
+               2
+            ]
         """
-        return self._from_compliant_series(self._compliant_series.fill_null(value))
+        if value is not None and strategy is not None:
+            msg = "cannot specify both `value` and `strategy`"
+            raise ValueError(msg)
+        if value is None and strategy is None:
+            msg = "must specify either a fill `value` or `strategy`"
+            raise ValueError(msg)
+        if strategy is not None and strategy not in {"forward", "backward"}:
+            msg = f"strategy not supported: {strategy}"
+            raise ValueError(msg)
+        return self._from_compliant_series(
+            self._compliant_series.fill_null(value=value, strategy=strategy, limit=limit)
+        )
 
     def is_between(
         self, lower_bound: Any, upper_bound: Any, closed: str = "both"

diff --git a/pyproject.toml b/pyproject.toml
@@ -116,7 +116,7 @@ filterwarnings = [
   'ignore:.*You are using pyarrow version',
   'ignore:.*but when imported by',
   'ignore:Distributing .*This may take some time',
-  'ignore:.*The default coalesce behavior'
+  'ignore:.*The default coalesce behavior',
 ]
 xfail_strict = true
 markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]