Skip to content

Commit

Permalink
feat: add is_nan expression & series method (#1625)
Browse files Browse the repository at this point in the history
---------

Co-authored-by: Marco Edward Gorelli <[email protected]>
  • Loading branch information
camriddell and MarcoGorelli authored Jan 3, 2025
1 parent 6b823f3 commit 1b3196b
Show file tree
Hide file tree
Showing 12 changed files with 294 additions and 2 deletions.
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
- is_first_distinct
- is_in
- is_last_distinct
- is_nan
- is_null
- is_unique
- len
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
- is_first_distinct
- is_in
- is_last_distinct
- is_nan
- is_null
- is_sorted
- is_unique
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,9 @@ def null_count(self: Self) -> Self:
def is_null(self: Self) -> Self:
    # Hand the work to `reuse_series_implementation`, naming the series
    # method that should be run for this expression.
    series_method = "is_null"
    return reuse_series_implementation(self, series_method)

def is_nan(self: Self) -> Self:
    # Hand the work to `reuse_series_implementation`, naming the series
    # method that should be run for this expression.
    series_method = "is_nan"
    return reuse_series_implementation(self, series_method)

def is_between(self: Self, lower_bound: Any, upper_bound: Any, closed: str) -> Self:
return reuse_series_implementation(
self,
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,11 @@ def is_null(self: Self) -> Self:
ser = self._native_series
return self._from_native_series(ser.is_null())

def is_nan(self: Self) -> Self:
    """Return a series wrapping `pyarrow.compute.is_nan` of the native data."""
    import pyarrow.compute as pc

    nan_mask = pc.is_nan(self._native_series)
    return self._from_native_series(nan_mask)

def cast(self: Self, dtype: DType) -> Self:
import pyarrow.compute as pc

Expand Down
15 changes: 15 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from narwhals._pandas_like.utils import calculate_timestamp_datetime
from narwhals._pandas_like.utils import native_to_narwhals_dtype
from narwhals.exceptions import ColumnNotFoundError
from narwhals.exceptions import InvalidOperationError
from narwhals.typing import CompliantExpr
from narwhals.utils import Implementation
from narwhals.utils import generate_temporary_column_name
Expand Down Expand Up @@ -706,6 +707,20 @@ def is_null(self: Self) -> Self:
returns_scalar=self._returns_scalar,
)

def is_nan(self: Self) -> Self:
    """Build an expression flagging which values are NaN.

    Returns:
        The result of `self._from_call` wrapping the NaN check, registered
        under the function name `"is_nan"`.

    Raises:
        InvalidOperationError: Raised by the wrapped function when it is
            evaluated on an input whose dtype is not numeric.
    """

    def func(_input: dask_expr.Series) -> dask_expr.Series:
        dtype = native_to_narwhals_dtype(_input, self._version, self._implementation)
        if dtype.is_numeric():
            # NaN compares unequal to itself, so self-inequality marks NaN entries.
            return _input != _input  # noqa: PLR0124
        msg = f"`.is_nan` only supported for numeric dtypes and not {dtype}, did you mean `.is_null`?"
        raise InvalidOperationError(msg)

    # The name must describe this operation; it was previously "is_null",
    # a copy-paste leftover from the neighbouring method.
    return self._from_call(
        func,
        "is_nan",
        returns_scalar=self._returns_scalar,
    )

def len(self: Self) -> Self:
return self._from_call(
lambda _input: _input.size,
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,9 @@ def is_between(
def is_null(self) -> Self:
    # Hand the work to `reuse_series_implementation`, naming the series
    # method that should be run for this expression.
    series_method = "is_null"
    return reuse_series_implementation(self, series_method)

def is_nan(self) -> Self:
    # Hand the work to `reuse_series_implementation`, naming the series
    # method that should be run for this expression.
    series_method = "is_nan"
    return reuse_series_implementation(self, series_method)

def fill_null(
self,
value: Any | None = None,
Expand Down
10 changes: 8 additions & 2 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from narwhals._pandas_like.utils import set_index
from narwhals._pandas_like.utils import to_datetime
from narwhals.dependencies import is_numpy_scalar
from narwhals.exceptions import InvalidOperationError
from narwhals.typing import CompliantSeries
from narwhals.utils import Implementation
from narwhals.utils import import_dtypes_module
Expand Down Expand Up @@ -623,8 +624,6 @@ def mean(self) -> Any:
return ser.mean()

def median(self) -> Any:
from narwhals.exceptions import InvalidOperationError

if not self.dtype.is_numeric():
msg = "`median` operation not supported for non-numeric input type."
raise InvalidOperationError(msg)
Expand Down Expand Up @@ -663,6 +662,13 @@ def is_null(self) -> PandasLikeSeries:
ser = self._native_series
return self._from_native_series(ser.isna())

def is_nan(self) -> PandasLikeSeries:
    """Return a boolean series flagging NaN values.

    Raises:
        InvalidOperationError: If this series' dtype is not numeric.
    """
    native = self._native_series
    if not self.dtype.is_numeric():
        msg = f"`.is_nan` only supported for numeric dtype and not {self.dtype}, did you mean `.is_null`?"
        raise InvalidOperationError(msg)
    # NaN compares unequal to itself, so self-inequality marks NaN entries.
    nan_mask = native != native  # noqa: PLR0124
    return self._from_native_series(nan_mask)

def fill_null(
self,
value: Any | None = None,
Expand Down
9 changes: 9 additions & 0 deletions narwhals/_polars/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,15 @@ def ewm_mean(
)
return self._from_native_expr(native_expr)

def is_nan(self: Self) -> Self:
    """Return an expression flagging NaN values.

    For backend versions below (1, 18) the check is wrapped in a
    `when(is_not_null).then(...)` guard.
    """
    native = self._native_expr
    needs_null_guard = self._backend_version < (1, 18)
    if needs_null_guard:  # pragma: no cover
        import polars as pl

        # NOTE(review): presumably older polars does not keep nulls as null
        # in `is_nan`; the guard only evaluates it on non-null rows — confirm.
        guarded = pl.when(native.is_not_null()).then(native.is_nan())
        return self._from_native_expr(guarded)
    return self._from_native_expr(native.is_nan())

def rolling_var(
self: Self,
window_size: int,
Expand Down
13 changes: 13 additions & 0 deletions narwhals/_polars/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,19 @@ def __rpow__(self: Self, other: PolarsSeries | Any) -> Self:
def __invert__(self: Self) -> Self:
    # Invert the native series, then re-wrap the result.
    inverted = self._native_series.__invert__()
    return self._from_native_series(inverted)

def is_nan(self: Self) -> Self:
    """Return a series flagging NaN values.

    For backend versions below (1, 18) the check is evaluated through a
    `when(is_not_null).then(...)` guard inside `pl.select`.
    """
    import polars as pl

    native = self._native_series
    needs_null_guard = self._backend_version < (1, 18)

    if needs_null_guard:  # pragma: no cover
        # NOTE(review): presumably older polars does not keep nulls as null
        # in `is_nan`; the guard only evaluates it on non-null rows — confirm.
        guarded = pl.when(native.is_not_null()).then(native.is_nan())
        # Evaluate the guarded expression and pull the column back out by name.
        evaluated = pl.select(guarded)[native.name]
        return self._from_native_series(evaluated)
    return self._from_native_series(native.is_nan())

def median(self: Self) -> Any:
from narwhals.exceptions import InvalidOperationError

Expand Down
64 changes: 64 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1925,6 +1925,70 @@ def is_null(self) -> Self:
"""
return self.__class__(lambda plx: self._to_compliant_expr(plx).is_null())

def is_nan(self) -> Self:
    """Indicate which values are NaN.

    Returns:
        A new expression.

    Notes:
        pandas handles null values differently from Polars and PyArrow.
        See [null_handling](../pandas_like_concepts/null_handling.md/)
        for reference.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoFrameT
        >>> data = {"orig": [0.0, None, 2.0]}
        >>> df_pd = pd.DataFrame(data).astype({"orig": "Float64"})
        >>> df_pl = pl.DataFrame(data)
        >>> df_pa = pa.table(data)

        Let's define a dataframe-agnostic function:

        >>> def agnostic_self_div_is_nan(df_native: IntoFrameT) -> IntoFrameT:
        ...     df = nw.from_native(df_native)
        ...     return df.with_columns(
        ...         divided=nw.col("orig") / nw.col("orig"),
        ...         divided_is_nan=(nw.col("orig") / nw.col("orig")).is_nan(),
        ...     ).to_native()

        We can then pass any supported library such as pandas, Polars, or
        PyArrow to `agnostic_self_div_is_nan`:

        >>> print(agnostic_self_div_is_nan(df_pd))
           orig  divided  divided_is_nan
        0   0.0      NaN            True
        1  <NA>     <NA>            <NA>
        2   2.0      1.0           False

        >>> print(agnostic_self_div_is_nan(df_pl))
        shape: (3, 3)
        ┌──────┬─────────┬────────────────┐
        │ orig ┆ divided ┆ divided_is_nan │
        │ ---  ┆ ---     ┆ ---            │
        │ f64  ┆ f64     ┆ bool           │
        ╞══════╪═════════╪════════════════╡
        │ 0.0  ┆ NaN     ┆ true           │
        │ null ┆ null    ┆ null           │
        │ 2.0  ┆ 1.0     ┆ false          │
        └──────┴─────────┴────────────────┘

        >>> print(agnostic_self_div_is_nan(df_pa))
        pyarrow.Table
        orig: double
        divided: double
        divided_is_nan: bool
        ----
        orig: [[0,null,2]]
        divided: [[nan,null,1]]
        divided_is_nan: [[true,null,false]]
    """

    def compliant_is_nan(plx):
        # Resolve this expression for the given namespace, then apply `is_nan`.
        return self._to_compliant_expr(plx).is_nan()

    return self.__class__(compliant_is_nan)

def arg_true(self) -> Self:
"""Find elements where boolean expression is True.
Expand Down
53 changes: 53 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2437,6 +2437,59 @@ def is_null(self) -> Self:
"""
return self._from_compliant_series(self._compliant_series.is_null())

def is_nan(self) -> Self:
    """Indicate which values are NaN.

    Returns:
        A boolean Series indicating which values are NaN.

    Notes:
        pandas handles null values differently from Polars and PyArrow.
        See [null_handling](../pandas_like_concepts/null_handling.md/)
        for reference.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoSeriesT
        >>> data = [0.0, None, 2.0]
        >>> s_pd = pd.Series(data, dtype="Float64")
        >>> s_pl = pl.Series(data)
        >>> s_pa = pa.chunked_array([data], type=pa.float64())

        Let's define a series-agnostic function:

        >>> def agnostic_is_nan(s_native: IntoSeriesT) -> IntoSeriesT:
        ...     s = nw.from_native(s_native, series_only=True)
        ...     return s.is_nan().to_native()

        We can then pass any supported library such as pandas, Polars, or
        PyArrow to `agnostic_is_nan`:

        >>> print(agnostic_is_nan(s_pd))
        0    False
        1     <NA>
        2    False
        dtype: boolean

        >>> print(agnostic_is_nan(s_pl))  # doctest: +NORMALIZE_WHITESPACE
        shape: (3,)
        Series: '' [bool]
        [
           false
           null
           false
        ]

        >>> print(agnostic_is_nan(s_pa))  # doctest: +NORMALIZE_WHITESPACE
        [
          [
            false,
            null,
            false
          ]
        ]
    """
    nan_mask = self._compliant_series.is_nan()
    return self._from_compliant_series(nan_mask)

def fill_null(
self,
value: Any | None = None,
Expand Down
119 changes: 119 additions & 0 deletions tests/expr_and_series/is_nan_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from __future__ import annotations

from typing import Any

import pytest

import narwhals.stable.v1 as nw
from tests.conftest import dask_lazy_p1_constructor
from tests.conftest import dask_lazy_p2_constructor
from tests.conftest import modin_constructor
from tests.conftest import pandas_constructor
from tests.utils import Constructor
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

# Constructors for which the tests below expect null inputs to show up as NaN
# (i.e. `is_nan` yields True rather than None for the missing value).
NON_NULLABLE_CONSTRUCTORS = [
    pandas_constructor,
    dask_lazy_p1_constructor,
    dask_lazy_p2_constructor,
    modin_constructor,
]


def test_nan(constructor: Constructor) -> None:
    """Check `Expr.is_nan` on an int column, its float cast and a 0/0 column."""
    data_na = {"int": [0, 1, None]}
    frame = nw.from_native(constructor(data_na))
    df = frame.with_columns(
        float=nw.col("int").cast(nw.Float64),
        float_na=nw.col("int") / nw.col("int"),
    )
    column_names = ("int", "float", "float_na")
    result = df.select(**{name: nw.col(name).is_nan() for name in column_names})

    # Non-nullable dtypes coerce the null to NaN (so it is flagged True);
    # nullable dtypes preserve the null, which must stay distinguishable.
    is_non_nullable = any(constructor is c for c in NON_NULLABLE_CONSTRUCTORS)
    missing_flag = True if is_non_nullable else None
    expected: dict[str, list[Any]] = {
        "int": [False, False, missing_flag],
        "float": [False, False, missing_flag],
        "float_na": [True, False, missing_flag],
    }

    assert_equal_data(result, expected)


def test_nan_series(constructor_eager: ConstructorEager) -> None:
    """Check `Series.is_nan` on an int column, its float cast and a 0/0 column."""
    data_na = {"int": [0, 1, None]}
    frame = nw.from_native(constructor_eager(data_na), eager_only=True)
    df = frame.with_columns(
        float=nw.col("int").cast(nw.Float64),
        float_na=nw.col("int") / nw.col("int"),
    )

    column_names = ("int", "float", "float_na")
    result = {name: df[name].is_nan() for name in column_names}

    # Non-nullable dtypes coerce the null to NaN (so it is flagged True);
    # nullable dtypes preserve the null, which must stay distinguishable.
    is_non_nullable = any(constructor_eager is c for c in NON_NULLABLE_CONSTRUCTORS)
    missing_flag = True if is_non_nullable else None
    expected: dict[str, list[Any]] = {
        "int": [False, False, missing_flag],
        "float": [False, False, missing_flag],
        "float_na": [True, False, missing_flag],
    }

    assert_equal_data(result, expected)


def test_nan_non_float(constructor: Constructor) -> None:
    """Check that `Expr.is_nan` on a string column raises the backend's error."""
    from polars.exceptions import InvalidOperationError as PlInvalidOperationError
    from pyarrow.lib import ArrowNotImplementedError

    from narwhals.exceptions import InvalidOperationError as NwInvalidOperationError

    df = nw.from_native(constructor({"a": ["x", "y"]}))

    # Polars and PyArrow raise their own exception types; every other
    # backend is expected to raise the narwhals error.
    constructor_repr = str(constructor)
    if "polars" in constructor_repr:
        exc = PlInvalidOperationError
    elif "pyarrow_table" in constructor_repr:
        exc = ArrowNotImplementedError
    else:
        exc = NwInvalidOperationError

    with pytest.raises(exc):
        df.select(nw.col("a").is_nan()).lazy().collect()


def test_nan_non_float_series(constructor_eager: ConstructorEager) -> None:
    """Check that `Series.is_nan` on a string column raises the backend's error."""
    from polars.exceptions import InvalidOperationError as PlInvalidOperationError
    from pyarrow.lib import ArrowNotImplementedError

    from narwhals.exceptions import InvalidOperationError as NwInvalidOperationError

    df = nw.from_native(constructor_eager({"a": ["x", "y"]}), eager_only=True)

    # Polars and PyArrow raise their own exception types; every other
    # backend is expected to raise the narwhals error.
    constructor_repr = str(constructor_eager)
    if "polars" in constructor_repr:
        exc = PlInvalidOperationError
    elif "pyarrow_table" in constructor_repr:
        exc = ArrowNotImplementedError
    else:
        exc = NwInvalidOperationError

    with pytest.raises(exc):
        df["a"].is_nan()

0 comments on commit 1b3196b

Please sign in to comment.