From 76449910fe815a6e167614da6efa7e193126981f Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 14 Jul 2024 18:52:43 +0100 Subject: [PATCH] feat: add "interchange"-level support for libraries which implement the interchange protocol (#517) * wip: support interchange protocol * raise on invalid attributes * raise on invalid attributes * typing * typing * typing * no default * fixup * fixup types * fixup types * fixup types * wip * fixup * coverage * coverage * coverage * coverage * match error * change signature * cov * cov * rename --- docs/api-reference/narwhals.md | 1 + narwhals/__init__.py | 2 + narwhals/_arrow/dataframe.py | 4 + narwhals/_interchange/__init__.py | 0 narwhals/_interchange/dataframe.py | 96 ++++++++++++++++++ narwhals/_interchange/series.py | 32 ++++++ narwhals/dataframe.py | 13 +++ narwhals/functions.py | 19 ++++ narwhals/series.py | 9 +- narwhals/stable/v1.py | 100 +++++++++++++++---- narwhals/translate.py | 104 ++++++++++++++++++-- narwhals/typing.py | 9 +- narwhals/utils.py | 9 ++ requirements-dev.txt | 1 + tests/frame/get_column_test.py | 4 +- tests/frame/interchange_schema_test.py | 100 +++++++++++++++++++ tests/frame/join_test.py | 2 +- tests/frame/test_invalid.py | 8 +- tests/series/is_ordered_categorical_test.py | 13 +++ tests/translate/from_native_test.py | 31 +++++- utils/check_api_reference.py | 16 ++- utils/check_backend_completeness.py | 27 +++-- utils/generate_random_versions.py | 2 +- 23 files changed, 545 insertions(+), 57 deletions(-) create mode 100644 narwhals/_interchange/__init__.py create mode 100644 narwhals/_interchange/dataframe.py create mode 100644 narwhals/_interchange/series.py create mode 100644 tests/frame/interchange_schema_test.py diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index 06af4ddfa5..f6976b4c69 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -11,6 +11,7 @@ Here are the top-level functions available in Narwhals. 
- col - concat - from_native + - get_level - get_native_namespace - is_ordered_categorical - len diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 1a9739e00f..9f11b63708 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -33,6 +33,7 @@ from narwhals.expression import sum from narwhals.expression import sum_horizontal from narwhals.functions import concat +from narwhals.functions import get_level from narwhals.functions import show_versions from narwhals.series import Series from narwhals.translate import from_native @@ -49,6 +50,7 @@ __all__ = [ "selectors", "concat", + "get_level", "to_native", "from_native", "is_ordered_categorical", diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index b3ef89fce6..dba3eff101 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -67,6 +67,10 @@ def rows( def get_column(self, name: str) -> ArrowSeries: from narwhals._arrow.series import ArrowSeries + if not isinstance(name, str): + msg = f"Expected str, got: {type(name)}" + raise TypeError(msg) + return ArrowSeries( self._native_dataframe[name], name=name, diff --git a/narwhals/_interchange/__init__.py b/narwhals/_interchange/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/narwhals/_interchange/dataframe.py b/narwhals/_interchange/dataframe.py new file mode 100644 index 0000000000..16c8c54dee --- /dev/null +++ b/narwhals/_interchange/dataframe.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import enum +from typing import TYPE_CHECKING +from typing import Any +from typing import NoReturn + +from narwhals import dtypes + +if TYPE_CHECKING: + from narwhals._interchange.series import InterchangeSeries + + +class DtypeKind(enum.IntEnum): + # https://data-apis.org/dataframe-protocol/latest/API.html + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +def map_interchange_dtype_to_narwhals_dtype( + interchange_dtype: 
tuple[DtypeKind, int, Any, Any], +) -> dtypes.DType: + if interchange_dtype[0] == DtypeKind.INT: + if interchange_dtype[1] == 64: + return dtypes.Int64() + if interchange_dtype[1] == 32: + return dtypes.Int32() + if interchange_dtype[1] == 16: + return dtypes.Int16() + if interchange_dtype[1] == 8: + return dtypes.Int8() + raise AssertionError("Invalid bit width for INT") + if interchange_dtype[0] == DtypeKind.UINT: + if interchange_dtype[1] == 64: + return dtypes.UInt64() + if interchange_dtype[1] == 32: + return dtypes.UInt32() + if interchange_dtype[1] == 16: + return dtypes.UInt16() + if interchange_dtype[1] == 8: + return dtypes.UInt8() + raise AssertionError("Invalid bit width for UINT") + if interchange_dtype[0] == DtypeKind.FLOAT: + if interchange_dtype[1] == 64: + return dtypes.Float64() + if interchange_dtype[1] == 32: + return dtypes.Float32() + raise AssertionError("Invalid bit width for FLOAT") + if interchange_dtype[0] == DtypeKind.BOOL: + return dtypes.Boolean() + if interchange_dtype[0] == DtypeKind.STRING: + return dtypes.String() + if interchange_dtype[0] == DtypeKind.DATETIME: + return dtypes.Datetime() + if interchange_dtype[0] == DtypeKind.CATEGORICAL: # pragma: no cover + # upstream issue: https://github.com/ibis-project/ibis/issues/9570 + return dtypes.Categorical() + msg = f"Invalid dtype, got: {interchange_dtype}" # pragma: no cover + raise AssertionError(msg) + + +class InterchangeFrame: + def __init__(self, df: Any) -> None: + self._native_dataframe = df + + def __narwhals_dataframe__(self) -> Any: + return self + + def __getitem__(self, item: str) -> InterchangeSeries: + from narwhals._interchange.series import InterchangeSeries + + return InterchangeSeries(self._native_dataframe.get_column_by_name(item)) + + @property + def schema(self) -> dict[str, dtypes.DType]: + return { + column_name: map_interchange_dtype_to_narwhals_dtype( + self._native_dataframe.get_column_by_name(column_name).dtype + ) + for column_name in 
self._native_dataframe.column_names() + } + + def __getattr__(self, attr: str) -> NoReturn: + msg = ( + f"Attribute {attr} is not supported for metadata-only dataframes.\n\n" + "Hint: you probably called `nw.from_native` on an object which isn't fully " + "supported by Narwhals, yet implements `__dataframe__`. If you would like to " + "see this kind of object supported in Narwhals, please open a feature request " + "at https://github.com/narwhals-dev/narwhals/issues." + ) + raise NotImplementedError(msg) diff --git a/narwhals/_interchange/series.py b/narwhals/_interchange/series.py new file mode 100644 index 0000000000..06a9169df9 --- /dev/null +++ b/narwhals/_interchange/series.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any +from typing import NoReturn + +from narwhals._interchange.dataframe import map_interchange_dtype_to_narwhals_dtype + +if TYPE_CHECKING: + from narwhals import dtypes + + +class InterchangeSeries: + def __init__(self, df: Any) -> None: + self._native_series = df + + def __narwhals_series__(self) -> Any: + return self + + @property + def dtype(self) -> dtypes.DType: + return map_interchange_dtype_to_narwhals_dtype(self._native_series.dtype) + + def __getattr__(self, attr: str) -> NoReturn: + msg = ( + f"Attribute {attr} is not supported for metadata-only dataframes.\n\n" + "Hint: you probably called `nw.from_native` on an object which isn't fully " + "supported by Narwhals, yet implements `__dataframe__`. If you would like to " + "see this kind of object supported in Narwhals, please open a feature request " + "at https://github.com/narwhals-dev/narwhals/issues." + ) + raise NotImplementedError(msg) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index bf1ab32fd9..c1c5e16574 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -38,6 +38,7 @@ class BaseFrame(Generic[FrameT]): _compliant_frame: Any _is_polars: bool _backend_version: tuple[int, ...] 
+ _level: Literal["full", "interchange"] def __len__(self) -> Any: return self._compliant_frame.__len__() @@ -58,6 +59,7 @@ def _from_compliant_dataframe(self, df: Any) -> Self: df, is_polars=self._is_polars, backend_version=self._backend_version, + level=self._level, ) def _flatten_and_extract(self, *args: Any, **kwargs: Any) -> Any: @@ -119,6 +121,7 @@ def lazy(self) -> LazyFrame[Any]: self._compliant_frame.lazy(), is_polars=self._is_polars, backend_version=self._backend_version, + level=self._level, ) def with_columns( @@ -218,9 +221,11 @@ def __init__( *, backend_version: tuple[int, ...], is_polars: bool, + level: Literal["full", "interchange"], ) -> None: self._is_polars = is_polars self._backend_version = backend_version + self._level: Literal["full", "interchange"] = level if hasattr(df, "__narwhals_dataframe__"): self._compliant_frame: Any = df.__narwhals_dataframe__() elif is_polars and isinstance(df, get_polars().DataFrame): @@ -453,6 +458,7 @@ def get_column(self, name: str) -> Series: self._compliant_frame.get_column(name), backend_version=self._backend_version, is_polars=self._is_polars, + level=self._level, ) @overload @@ -522,6 +528,7 @@ def __getitem__( self._compliant_frame[item], backend_version=self._backend_version, is_polars=self._is_polars, + level=self._level, ) elif isinstance(item, (Sequence, slice)) or ( @@ -587,6 +594,7 @@ def to_dict( value, backend_version=self._backend_version, is_polars=self._is_polars, + level=self._level, ) for key, value in self._compliant_frame.to_dict( as_series=as_series @@ -1700,6 +1708,7 @@ def is_duplicated(self: Self) -> Series: self._compliant_frame.is_duplicated(), backend_version=self._backend_version, is_polars=self._is_polars, + level=self._level, ) def is_empty(self: Self) -> bool: @@ -1786,6 +1795,7 @@ def is_unique(self: Self) -> Series: self._compliant_frame.is_unique(), backend_version=self._backend_version, is_polars=self._is_polars, + level=self._level, ) def null_count(self: Self) -> Self: @@ 
-1927,9 +1937,11 @@ def __init__( *, is_polars: bool, backend_version: tuple[int, ...], + level: Literal["full", "interchange"], ) -> None: self._is_polars = is_polars self._backend_version = backend_version + self._level = level if hasattr(df, "__narwhals_lazyframe__"): self._compliant_frame: Any = df.__narwhals_lazyframe__() elif is_polars and ( @@ -1998,6 +2010,7 @@ def collect(self) -> DataFrame[Any]: self._compliant_frame.collect(), is_polars=self._is_polars, backend_version=self._backend_version, + level=self._level, ) # inherited diff --git a/narwhals/functions.py b/narwhals/functions.py index c7e2b5344c..9184a8cf34 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -2,6 +2,8 @@ import platform import sys +from typing import TYPE_CHECKING +from typing import Any from typing import Iterable from typing import Literal from typing import TypeVar @@ -17,6 +19,9 @@ # The rest of the annotations seem to work fine with this anyway FrameT = TypeVar("FrameT", bound=Union[DataFrame, LazyFrame]) # type: ignore[type-arg] +if TYPE_CHECKING: + from narwhals.series import Series + def concat( items: Iterable[FrameT], @@ -116,3 +121,17 @@ def show_versions() -> None: print("\nPython dependencies:") # noqa: T201 for k, stat in deps_info.items(): print(f"{k:>13}: {stat}") # noqa: T201 + + +def get_level( + obj: DataFrame[Any] | LazyFrame[Any] | Series, +) -> Literal["full", "interchange"]: + """ + Level of support Narwhals has for current object. 
+ + This can be one of: + + - 'full': full Narwhals API support + - 'interchange': only metadata operations are supported (`df.schema`) + """ + return obj._level diff --git a/narwhals/series.py b/narwhals/series.py index b5880c7bfa..787275d346 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -32,9 +32,11 @@ def __init__( *, backend_version: tuple[int, ...], is_polars: bool, + level: Literal["full", "interchange"], ) -> None: self._is_polars = is_polars self._backend_version = backend_version + self._level = level if hasattr(series, "__narwhals_series__"): self._compliant_series = series.__narwhals_series__() elif is_polars and ( @@ -102,7 +104,10 @@ def _extract_native(self, arg: Any) -> Any: def _from_compliant_series(self, series: Any) -> Self: return self.__class__( - series, is_polars=self._is_polars, backend_version=self._backend_version + series, + is_polars=self._is_polars, + backend_version=self._backend_version, + level=self._level, ) def __repr__(self) -> str: # pragma: no cover @@ -296,6 +301,7 @@ def to_frame(self) -> DataFrame[Any]: self._compliant_series.to_frame(), is_polars=self._is_polars, backend_version=self._backend_version, + level=self._level, ) def to_list(self) -> list[Any]: @@ -1674,6 +1680,7 @@ def value_counts( self._compliant_series.value_counts(sort=sort, parallel=parallel), is_polars=self._is_polars, backend_version=self._backend_version, + level=self._level, ) def quantile( diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index dc0205193a..7704b3c7a6 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -50,7 +50,6 @@ from typing_extensions import Self from narwhals.dtypes import DType - from narwhals.typing import IntoDataFrame from narwhals.typing import IntoExpr T = TypeVar("T") @@ -435,30 +434,46 @@ def _stableify( obj._compliant_frame, is_polars=obj._is_polars, backend_version=obj._backend_version, + level=obj._level, ) if isinstance(obj, NwLazyFrame): return LazyFrame( obj._compliant_frame, 
is_polars=obj._is_polars, backend_version=obj._backend_version, + level=obj._level, ) if isinstance(obj, NwSeries): return Series( obj._compliant_series, is_polars=obj._is_polars, backend_version=obj._backend_version, + level=obj._level, ) if isinstance(obj, NwExpr): return Expr(obj._call) return obj +@overload +def from_native( + native_dataframe: Any, + *, + strict: Literal[False], + eager_only: None = ..., + eager_or_interchange_only: Literal[True], + series_only: None = ..., + allow_series: Literal[True], +) -> Any: ... + + @overload def from_native( native_dataframe: Any, *, strict: Literal[False], eager_only: Literal[True], + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], ) -> Any: ... @@ -466,13 +481,26 @@ def from_native( @overload def from_native( - native_dataframe: IntoDataFrame | T, + native_dataframe: IntoDataFrameT | T, + *, + strict: Literal[False], + eager_only: None = ..., + eager_or_interchange_only: Literal[True], + series_only: None = ..., + allow_series: None = ..., +) -> DataFrame[IntoDataFrameT] | T: ... + + +@overload +def from_native( + native_dataframe: IntoDataFrameT | T, *, strict: Literal[False], eager_only: Literal[True], + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: None = ..., -) -> DataFrame[IntoDataFrame] | T: ... +) -> DataFrame[IntoDataFrameT] | T: ... @overload @@ -481,6 +509,7 @@ def from_native( *, strict: Literal[False], eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], ) -> Any: ... @@ -492,6 +521,7 @@ def from_native( *, strict: Literal[False], eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: Literal[True], allow_series: None = ..., ) -> Any: ... 
@@ -502,27 +532,26 @@ def from_native( native_dataframe: IntoFrameT | T, *, strict: Literal[False], - eager_only: bool | None = ..., - series_only: bool | None = ..., - allow_series: bool | None = ..., -) -> DataFrame[IntoFrameT] | LazyFrame[IntoFrameT] | T: - """ - from_native(df, strict=False) - """ + eager_only: None = ..., + eager_or_interchange_only: None = ..., + series_only: None = ..., + allow_series: None = ..., +) -> DataFrame[IntoFrameT] | LazyFrame[IntoFrameT] | T: ... @overload def from_native( - native_dataframe: Any, + native_dataframe: IntoDataFrameT, *, strict: Literal[True] = ..., - eager_only: Literal[True], + eager_only: None = ..., + eager_or_interchange_only: Literal[True], series_only: None = ..., - allow_series: Literal[True], -) -> DataFrame[Any] | Series: + allow_series: None = ..., +) -> DataFrame[IntoDataFrameT]: """ - from_native(df, strict=True, eager_only=True, allow_series=True) - from_native(df, eager_only=True, allow_series=True) + from_native(df, strict=True, eager_or_interchange_only=True) + from_native(df, eager_or_interchange_only=True) """ @@ -532,12 +561,13 @@ def from_native( *, strict: Literal[True] = ..., eager_only: Literal[True], + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: None = ..., ) -> DataFrame[IntoDataFrameT]: """ - from_native(df, strict=True, eager_only=True) - from_native(df, eager_only=True) + from_native(df, strict=True, eager_only=True) + from_native(df, eager_only=True) """ @@ -547,12 +577,13 @@ def from_native( *, strict: Literal[True] = ..., eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], ) -> DataFrame[Any] | LazyFrame[Any] | Series: """ - from_native(df, strict=True, allow_series=True) - from_native(df, allow_series=True) + from_native(df, strict=True, allow_series=True) + from_native(df, allow_series=True) """ @@ -562,6 +593,7 
@@ def from_native( *, strict: Literal[True] = ..., eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: Literal[True], allow_series: None = ..., ) -> Series: @@ -577,6 +609,7 @@ def from_native( *, strict: Literal[True] = ..., eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: None = ..., ) -> DataFrame[IntoFrameT] | LazyFrame[IntoFrameT]: @@ -593,6 +626,7 @@ def from_native( *, strict: bool, eager_only: bool | None, + eager_or_interchange_only: bool | None = None, series_only: bool | None, allow_series: bool | None, ) -> Any: ... @@ -603,6 +637,7 @@ def from_native( *, strict: bool = True, eager_only: bool | None = None, + eager_or_interchange_only: bool | None = None, series_only: bool | None = None, allow_series: bool | None = None, ) -> Any: @@ -623,16 +658,24 @@ def from_native( strict: Whether to raise if object can't be converted (default) or to just leave it as-is. eager_only: Whether to only allow eager objects. + eager_or_interchange_only: Whether to only allow eager objects or objects which + implement the Dataframe Interchange Protocol. series_only: Whether to only allow series. allow_series: Whether to allow series (default is only dataframe / lazyframe). Returns: narwhals.DataFrame or narwhals.LazyFrame or narwhals.Series """ + # Early returns + if isinstance(native_dataframe, (DataFrame, LazyFrame)) and not series_only: + return native_dataframe + if isinstance(native_dataframe, Series) and (series_only or allow_series): + return native_dataframe result = nw.from_native( native_dataframe, strict=strict, eager_only=eager_only, + eager_or_interchange_only=eager_or_interchange_only, series_only=series_only, allow_series=allow_series, ) @@ -696,6 +739,8 @@ def func(df): strict: Whether to raise if object can't be converted or to just leave it as-is (default). eager_only: Whether to only allow eager objects. 
+ eager_or_interchange_only: Whether to only allow eager objects or objects which + implement the Dataframe Interchange Protocol. series_only: Whether to only allow series. allow_series: Whether to allow series (default is only dataframe / lazyframe). """ @@ -1308,6 +1353,20 @@ def get_native_namespace(obj: Any) -> Any: return nw_get_native_namespace(obj) +def get_level( + obj: DataFrame[Any] | LazyFrame[Any] | Series, +) -> Literal["full", "interchange"]: + """ + Level of support Narwhals has for current object. + + This can be one of: + + - 'full': full Narwhals API support + - 'interchange': only metadata operations are supported (`df.schema`) + """ + return nw.get_level(obj) + + __all__ = [ "selectors", "concat", @@ -1318,6 +1377,7 @@ def get_native_namespace(obj: Any) -> Any: "maybe_convert_dtypes", "maybe_set_index", "get_native_namespace", + "get_level", "all", "all_horizontal", "col", diff --git a/narwhals/translate.py b/narwhals/translate.py index 6165309510..e13fb9b3c2 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -75,23 +75,49 @@ def to_native( return narwhals_object +@overload +def from_native( + native_object: Any, + *, + strict: Literal[False], + eager_only: None = ..., + eager_or_interchange_only: Literal[True], + series_only: None = ..., + allow_series: Literal[True], +) -> Any: ... + + @overload def from_native( native_object: Any, *, strict: Literal[False], eager_only: Literal[True], + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], ) -> Any: ... +@overload +def from_native( + native_object: IntoDataFrameT | T, + *, + strict: Literal[False], + eager_only: None = ..., + eager_or_interchange_only: Literal[True], + series_only: None = ..., + allow_series: None = ..., +) -> DataFrame[IntoDataFrameT] | T: ... 
+ + @overload def from_native( native_object: IntoDataFrameT | T, *, strict: Literal[False], eager_only: Literal[True], + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: None = ..., ) -> DataFrame[IntoDataFrameT] | T: ... @@ -103,6 +129,7 @@ def from_native( *, strict: Literal[False], eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], ) -> Any: ... @@ -114,6 +141,7 @@ def from_native( *, strict: Literal[False], eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: Literal[True], allow_series: None = ..., ) -> Any: ... @@ -125,6 +153,7 @@ def from_native( *, strict: Literal[False], eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: None = ..., ) -> DataFrame[IntoFrameT] | LazyFrame[IntoFrameT] | T: ... @@ -132,15 +161,17 @@ def from_native( @overload def from_native( - native_object: Any, + native_object: IntoDataFrameT, *, strict: Literal[True] = ..., - eager_only: Literal[True], + eager_only: None = ..., + eager_or_interchange_only: Literal[True], series_only: None = ..., - allow_series: Literal[True], -) -> DataFrame[Any] | Series: + allow_series: None = ..., +) -> DataFrame[IntoDataFrameT]: """ - from_native(df, strict=False) + from_native(df, strict=True, eager_or_interchange_only=True) + from_native(df, eager_or_interchange_only=True) """ @@ -150,6 +181,7 @@ def from_native( *, strict: Literal[True] = ..., eager_only: Literal[True], + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: None = ..., ) -> DataFrame[IntoDataFrameT]: @@ -165,6 +197,7 @@ def from_native( *, strict: Literal[True] = ..., eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: Literal[True], ) -> DataFrame[Any] | LazyFrame[Any] | Series: @@ -180,6 +213,7 @@ def from_native( *, strict: Literal[True] = 
..., eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: Literal[True], allow_series: None = ..., ) -> Series: @@ -195,6 +229,7 @@ def from_native( *, strict: Literal[True] = ..., eager_only: None = ..., + eager_or_interchange_only: None = ..., series_only: None = ..., allow_series: None = ..., ) -> DataFrame[IntoFrameT] | LazyFrame[IntoFrameT]: @@ -211,6 +246,7 @@ def from_native( *, strict: bool, eager_only: bool | None, + eager_or_interchange_only: bool | None = None, series_only: bool | None, allow_series: bool | None, ) -> Any: ... @@ -221,6 +257,7 @@ def from_native( # noqa: PLR0915 *, strict: bool = True, eager_only: bool | None = None, + eager_or_interchange_only: bool | None = None, series_only: bool | None = None, allow_series: bool | None = None, ) -> Any: @@ -241,6 +278,8 @@ def from_native( # noqa: PLR0915 strict: Whether to raise if object can't be converted (default) or to just leave it as-is. eager_only: Whether to only allow eager objects. + eager_or_interchange_only: Whether to only allow eager objects or objects which + implement the Dataframe Interchange Protocol. series_only: Whether to only allow series. allow_series: Whether to allow series (default is only dataframe / lazyframe). 
@@ -249,6 +288,7 @@ def from_native( # noqa: PLR0915 """ from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.series import ArrowSeries + from narwhals._interchange.dataframe import InterchangeFrame from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.series import PandasLikeSeries from narwhals._pandas_like.utils import Implementation @@ -257,11 +297,20 @@ def from_native( # noqa: PLR0915 from narwhals.series import Series from narwhals.utils import parse_version + # Early returns + if isinstance(native_object, (DataFrame, LazyFrame)) and not series_only: + return native_object + if isinstance(native_object, Series) and (series_only or allow_series): + return native_object + if series_only: if allow_series is False: msg = "Invalid parameter combination: `series_only=True` and `allow_series=False`" raise ValueError(msg) allow_series = True + if eager_only and eager_or_interchange_only: + msg = "Invalid parameter combination: `eager_only=True` and `eager_or_interchange_only=True`" + raise ValueError(msg) if (pl := get_polars()) is not None and isinstance(native_object, pl.DataFrame): if series_only: @@ -271,18 +320,20 @@ def from_native( # noqa: PLR0915 native_object, is_polars=True, backend_version=parse_version(pl.__version__), + level="full", ) elif (pl := get_polars()) is not None and isinstance(native_object, pl.LazyFrame): if series_only: msg = "Cannot only use `series_only` with polars.LazyFrame" raise TypeError(msg) - if eager_only: - msg = "Cannot only use `eager_only` with polars.LazyFrame" + if eager_only or eager_or_interchange_only: + msg = "Cannot only use `eager_only` or `eager_or_interchange_only` with polars.LazyFrame" raise TypeError(msg) return LazyFrame( native_object, is_polars=True, backend_version=parse_version(pl.__version__), + level="full", ) elif (pd := get_pandas()) is not None and isinstance(native_object, pd.DataFrame): if series_only: @@ -296,6 +347,7 @@ def from_native( # 
noqa: PLR0915 ), is_polars=False, backend_version=parse_version(pd.__version__), + level="full", ) elif (mpd := get_modin()) is not None and isinstance(native_object, mpd.DataFrame): if series_only: @@ -309,6 +361,7 @@ def from_native( # noqa: PLR0915 ), is_polars=False, backend_version=parse_version(mpd.__version__), + level="full", ) elif (cudf := get_cudf()) is not None and isinstance( # pragma: no cover native_object, cudf.DataFrame @@ -324,6 +377,7 @@ def from_native( # noqa: PLR0915 ), is_polars=False, backend_version=parse_version(cudf.__version__), + level="full", ) elif (pa := get_pyarrow()) is not None and isinstance(native_object, pa.Table): if series_only: @@ -333,6 +387,21 @@ def from_native( # noqa: PLR0915 ArrowDataFrame(native_object, backend_version=parse_version(pa.__version__)), is_polars=False, backend_version=parse_version(pa.__version__), + level="full", + ) + elif hasattr(native_object, "__dataframe__"): + if eager_only or series_only: + msg = ( + "Cannot only use `series_only=True` or `eager_only=False` " + "with object which only implements __dataframe__" + ) + raise TypeError(msg) + # placeholder (0,) version here, as we wouldn't use it in this case anyway. 
+ return DataFrame( + InterchangeFrame(native_object.__dataframe__()), + is_polars=False, + backend_version=(0,), + level="interchange", ) elif hasattr(native_object, "__narwhals_dataframe__"): if series_only: @@ -343,19 +412,21 @@ def from_native( # noqa: PLR0915 native_object.__narwhals_dataframe__(), is_polars=False, backend_version=(0,), + level="full", ) elif hasattr(native_object, "__narwhals_lazyframe__"): if series_only: msg = "Cannot only use `series_only` with lazyframe" raise TypeError(msg) - if eager_only: - msg = "Cannot only use `eager_only` with lazyframe" + if eager_only or eager_or_interchange_only: + msg = "Cannot only use `eager_only` or `eager_or_interchange_only` with lazyframe" raise TypeError(msg) # placeholder (0,) version here, as we wouldn't use it in this case anyway. return LazyFrame( native_object.__narwhals_lazyframe__(), is_polars=False, backend_version=(0,), + level="full", ) elif (pl := get_polars()) is not None and isinstance(native_object, pl.Series): if not allow_series: @@ -365,6 +436,7 @@ def from_native( # noqa: PLR0915 native_object, is_polars=True, backend_version=parse_version(pl.__version__), + level="full", ) elif (pd := get_pandas()) is not None and isinstance(native_object, pd.Series): if not allow_series: @@ -378,6 +450,7 @@ def from_native( # noqa: PLR0915 ), is_polars=False, backend_version=parse_version(pd.__version__), + level="full", ) elif (mpd := get_modin()) is not None and isinstance(native_object, mpd.Series): if not allow_series: @@ -391,6 +464,7 @@ def from_native( # noqa: PLR0915 ), is_polars=False, backend_version=parse_version(mpd.__version__), + level="full", ) elif (cudf := get_cudf()) is not None and isinstance( native_object, cudf.Series @@ -406,6 +480,7 @@ def from_native( # noqa: PLR0915 ), is_polars=False, backend_version=parse_version(cudf.__version__), + level="full", ) elif (pa := get_pyarrow()) is not None and isinstance(native_object, pa.ChunkedArray): if not allow_series: @@ -417,6 +492,7 @@ 
def from_native( # noqa: PLR0915 ), is_polars=False, backend_version=parse_version(pa.__version__), + level="full", ) elif hasattr(native_object, "__narwhals_series__"): if not allow_series: @@ -424,7 +500,10 @@ def from_native( # noqa: PLR0915 raise TypeError(msg) # placeholder (0,) version here, as we wouldn't use it in this case anyway. return Series( - native_object.__narwhals_series__(), backend_version=(0,), is_polars=False + native_object.__narwhals_series__(), + backend_version=(0,), + is_polars=False, + level="full", ) elif strict: msg = f"Expected pandas-like dataframe, Polars dataframe, or Polars lazyframe, got: {type(native_object)}" @@ -455,6 +534,7 @@ def narwhalify( *, strict: bool = False, eager_only: bool | None = False, + eager_or_interchange_only: bool | None = False, series_only: bool | None = False, allow_series: bool | None = True, ) -> Callable[..., Any]: @@ -507,6 +587,8 @@ def func(df): strict: Whether to raise if object can't be converted or to just leave it as-is (default). eager_only: Whether to only allow eager objects. + eager_or_interchange_only: Whether to only allow eager objects or objects which + implement the Dataframe Interchange Protocol. series_only: Whether to only allow series. allow_series: Whether to allow series (default is only dataframe / lazyframe). """ @@ -519,6 +601,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: arg, strict=strict, eager_only=eager_only, + eager_or_interchange_only=eager_or_interchange_only, series_only=series_only, allow_series=allow_series, ) @@ -530,6 +613,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: value, strict=strict, eager_only=eager_only, + eager_or_interchange_only=eager_or_interchange_only, series_only=series_only, allow_series=allow_series, ) diff --git a/narwhals/typing.py b/narwhals/typing.py index 52a30edcbf..0d63cd0649 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -28,14 +28,19 @@ def columns(self) -> Any: ... 
def join(self, *args: Any, **kwargs: Any) -> Any: ... + class DataFrameLike(Protocol): + def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ... + IntoExpr: TypeAlias = Union["Expr", str, "Series"] """Anything which can be converted to an expression.""" -IntoDataFrame: TypeAlias = Union["NativeFrame", "DataFrame[Any]"] +IntoDataFrame: TypeAlias = Union["NativeFrame", "DataFrame[Any]", "DataFrameLike"] """Anything which can be converted to a Narwhals DataFrame.""" -IntoFrame: TypeAlias = Union["NativeFrame", "DataFrame[Any]", "LazyFrame[Any]"] +IntoFrame: TypeAlias = Union[ + "NativeFrame", "DataFrame[Any]", "LazyFrame[Any]", "DataFrameLike" +] """Anything which can be converted to a Narwhals DataFrame or LazyFrame.""" Frame: TypeAlias = Union["DataFrame[Any]", "LazyFrame[Any]"] diff --git a/narwhals/utils.py b/narwhals/utils.py index 7f65d8bc03..ee19d4b6a4 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -306,6 +306,15 @@ def is_ordered_categorical(series: Series) -> bool: >>> func(s_pl) True """ + from narwhals._interchange.series import InterchangeSeries + + if ( + isinstance(series._compliant_series, InterchangeSeries) + and series.dtype == dtypes.Categorical + ): + return series._compliant_series._native_series.describe_categorical[ # type: ignore[no-any-return] + "is_ordered" + ] if series.dtype == dtypes.Enum: return True if series.dtype != dtypes.Categorical: diff --git a/requirements-dev.txt b/requirements-dev.txt index 0586d00c64..a9d6f04d89 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,5 @@ covdefaults +ibis-framework pandas polars[timezones] pre-commit diff --git a/tests/frame/get_column_test.py b/tests/frame/get_column_test.py index bbfab018ba..eaf6d02c50 100644 --- a/tests/frame/get_column_test.py +++ b/tests/frame/get_column_test.py @@ -13,7 +13,9 @@ def test_get_column(constructor_with_pyarrow: Any) -> None: result = df.get_column("a") assert result.to_list() == [1, 2] assert result.name == "a" - with 
pytest.raises(TypeError): + with pytest.raises( + (KeyError, TypeError), match="Expected str|'int' object cannot be converted|0" + ): # Check that trying to get a column by position is disallowed here. nw.from_native(df, eager_only=True).get_column(0) # type: ignore[arg-type] diff --git a/tests/frame/interchange_schema_test.py b/tests/frame/interchange_schema_test.py new file mode 100644 index 0000000000..72d4158f9d --- /dev/null +++ b/tests/frame/interchange_schema_test.py @@ -0,0 +1,100 @@ +from datetime import date + +import ibis +import polars as pl +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version + + +@pytest.mark.skipif( + parse_version(ibis.__version__) < (6, 0), + reason="too old, requires interchange protocol", +) +def test_interchange_schema() -> None: + df_pl = pl.DataFrame( + { + "a": [1, 1, 2], + "b": [4, 5, 6], + "c": [4, 5, 6], + "d": [4, 5, 6], + "e": [4, 5, 6], + "f": [4, 5, 6], + "g": [4, 5, 6], + "h": [4, 5, 6], + "i": [4, 5, 6], + "j": [4, 5, 6], + "k": ["fdafsd", "fdas", "ad"], + "l": ["fdafsd", "fdas", "ad"], + "m": [date(2021, 1, 1), date(2021, 1, 1), date(2021, 1, 1)], + "n": [True, True, False], + }, + schema={ + "a": pl.Int64, + "b": pl.Int32, + "c": pl.Int16, + "d": pl.Int8, + "e": pl.UInt64, + "f": pl.UInt32, + "g": pl.UInt16, + "h": pl.UInt8, + "i": pl.Float64, + "j": pl.Float32, + "k": pl.String, + "l": pl.Categorical, + "m": pl.Datetime, + "n": pl.Boolean, + }, + ) + tbl = ibis.memtable(df_pl) + df = nw.from_native(tbl, eager_or_interchange_only=True) + result = df.schema + expected = { + "a": nw.Int64, + "b": nw.Int32, + "c": nw.Int16, + "d": nw.Int8, + "e": nw.UInt64, + "f": nw.UInt32, + "g": nw.UInt16, + "h": nw.UInt8, + "i": nw.Float64, + "j": nw.Float32, + "k": nw.String, + "l": nw.String, # https://github.com/ibis-project/ibis/issues/9570 + "m": nw.Datetime, + "n": nw.Boolean, + } + assert result == expected + assert df["a"].dtype == nw.Int64 + + +@pytest.mark.skipif( + 
parse_version(ibis.__version__) < (6, 0), + reason="too old, requires interchange protocol", +) +def test_invalid() -> None: + df = pl.DataFrame({"a": [1, 2, 3]}) + tbl = ibis.memtable(df) + with pytest.raises( + NotImplementedError, match="is not supported for metadata-only dataframes" + ): + nw.from_native(tbl, eager_or_interchange_only=True).select("a") + with pytest.raises(TypeError, match="Cannot only use `series_only=True`"): + nw.from_native(tbl, eager_only=True) + with pytest.raises(ValueError, match="Invalid parameter combination"): + nw.from_native(tbl, eager_only=True, eager_or_interchange_only=True) # type: ignore[call-overload] + + +@pytest.mark.skipif( + parse_version(ibis.__version__) < (6, 0), + reason="too old, requires interchange protocol", +) +def test_get_level() -> None: + df = pl.DataFrame({"a": [1, 2, 3]}) + tbl = ibis.memtable(df) + assert ( + nw.get_level(nw.from_native(tbl, eager_or_interchange_only=True)) == "interchange" + ) + assert nw.get_level(nw.from_native(df)) == "full" diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index ebe7baccad..b9f23ad192 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -126,7 +126,7 @@ def test_left_join(constructor: Any) -> None: compare_dicts(result, expected) -@pytest.mark.filterwarnings("ignore: the defaultcoalesce behavior") +@pytest.mark.filterwarnings("ignore: the default coalesce behavior") def test_left_join_multiple_column(constructor: Any) -> None: data_left = {"a": [1, 2, 3], "b": [4, 5, 6]} data_right = {"a": [1, 2, 3], "c": [4, 5, 6]} diff --git a/tests/frame/test_invalid.py b/tests/frame/test_invalid.py index 965a32f618..6983ff8ebc 100644 --- a/tests/frame/test_invalid.py +++ b/tests/frame/test_invalid.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import polars as pl import pytest @@ -27,14 +28,11 @@ def test_validate_laziness() -> None: @pytest.mark.skipif( - parse_version(pd.__version__) < parse_version("2.0.0"), reason="too old" + 
parse_version(np.__version__) < parse_version("1.26.4"), reason="too old" ) def test_memmap() -> None: # the headache this caused me... - try: - from sklearn.utils import check_X_y - except ImportError: # pragma: no cover - return + from sklearn.utils import check_X_y from sklearn.utils._testing import create_memmap_backed_data x_any = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) diff --git a/tests/series/is_ordered_categorical_test.py b/tests/series/is_ordered_categorical_test.py index 925989be7c..3da7e93e24 100644 --- a/tests/series/is_ordered_categorical_test.py +++ b/tests/series/is_ordered_categorical_test.py @@ -6,6 +6,7 @@ import pytest import narwhals.stable.v1 as nw +from narwhals.utils import parse_version def test_is_ordered_categorical() -> None: @@ -25,6 +26,18 @@ def test_is_ordered_categorical() -> None: assert not nw.is_ordered_categorical(nw.from_native(s, series_only=True)) +@pytest.mark.skipif( + parse_version(pd.__version__) < (2, 0), reason="requires interchange protocol" +) +def test_is_ordered_categorical_interchange_protocol() -> None: + df = pd.DataFrame( + {"a": ["a", "b"]}, dtype=pd.CategoricalDtype(ordered=True) + ).__dataframe__() + assert nw.is_ordered_categorical( + nw.from_native(df, eager_or_interchange_only=True)["a"] + ) + + def test_is_definitely_not_ordered_categorical( constructor_series_with_pyarrow: Any, ) -> None: diff --git a/tests/translate/from_native_test.py b/tests/translate/from_native_test.py index 5825f5927c..7e5b67b28e 100644 --- a/tests/translate/from_native_test.py +++ b/tests/translate/from_native_test.py @@ -7,6 +7,7 @@ import pyarrow as pa import pytest +import narwhals as unstable_nw import narwhals.stable.v1 as nw from tests.utils import maybe_get_modin_df @@ -178,7 +179,9 @@ def test_pandas_like_validate() -> None: ) def test_init_series(series: Any, is_polars: Any, context: Any) -> None: with context: - result = nw.Series(series, is_polars=is_polars, backend_version=(1, 2, 3)) + result = nw.Series( + 
series, is_polars=is_polars, backend_version=(1, 2, 3), level="full" + ) assert isinstance(result, nw.Series) @@ -216,7 +219,9 @@ def test_init_series(series: Any, is_polars: Any, context: Any) -> None: ) def test_init_eager(dframe: Any, is_polars: Any, context: Any) -> None: with context: - result = nw.DataFrame(dframe, is_polars=is_polars, backend_version=(1, 2, 3)) # type: ignore[var-annotated] + result = nw.DataFrame( + dframe, is_polars=is_polars, backend_version=(1, 2, 3), level="full" + ) # type: ignore[var-annotated] assert isinstance(result, nw.DataFrame) @@ -254,5 +259,25 @@ def test_init_eager(dframe: Any, is_polars: Any, context: Any) -> None: ) def test_init_lazy(dframe: Any, is_polars: Any, context: Any) -> None: with context: - result = nw.LazyFrame(dframe, is_polars=is_polars, backend_version=(1, 2, 3)) # type: ignore[var-annotated] + result = nw.LazyFrame( + dframe, is_polars=is_polars, backend_version=(1, 2, 3), level="full" + ) # type: ignore[var-annotated] assert isinstance(result, nw.LazyFrame) + + +def test_init_already_narwhals() -> None: + df = nw.from_native(pl.DataFrame({"a": [1, 2, 3]})) + result = nw.from_native(df) + assert result is df # type: ignore[comparison-overlap] + s = df["a"] + result_s = nw.from_native(s, allow_series=True) + assert result_s is s + + +def test_init_already_narwhals_unstable() -> None: + df = unstable_nw.from_native(pl.DataFrame({"a": [1, 2, 3]})) + result = unstable_nw.from_native(df) + assert result is df # type: ignore[comparison-overlap] + s = df["a"] + result_s = unstable_nw.from_native(s, allow_series=True) + assert result_s is s diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index b6bede4b43..2d1c36b8bd 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -37,7 +37,9 @@ top_level_functions = [ i - for i in nw.DataFrame(pl.DataFrame(), is_polars=True, backend_version=(0,)).__dir__() + for i in nw.DataFrame( + pl.DataFrame(), is_polars=True, 
backend_version=(0,), level="full" + ).__dir__() if not i[0].isupper() and i[0] != "_" ] with open("docs/api-reference/dataframe.md") as fd: @@ -58,7 +60,9 @@ top_level_functions = [ i - for i in nw.LazyFrame(pl.LazyFrame(), is_polars=True, backend_version=(0,)).__dir__() + for i in nw.LazyFrame( + pl.LazyFrame(), is_polars=True, backend_version=(0,), level="full" + ).__dir__() if not i[0].isupper() and i[0] != "_" ] with open("docs/api-reference/lazyframe.md") as fd: @@ -79,7 +83,9 @@ top_level_functions = [ i - for i in nw.Series(pl.Series(), backend_version=(1,), is_polars=True).__dir__() + for i in nw.Series( + pl.Series(), backend_version=(1,), is_polars=True, level="full" + ).__dir__() if not i[0].isupper() and i[0] != "_" ] with open("docs/api-reference/series.md") as fd: @@ -135,7 +141,9 @@ expr = [i for i in nw.Expr(lambda: 0).__dir__() if not i[0].isupper() and i[0] != "_"] series = [ i - for i in nw.Series(pl.Series(), backend_version=(1,), is_polars=True).__dir__() + for i in nw.Series( + pl.Series(), backend_version=(1,), is_polars=True, level="full" + ).__dir__() if not i[0].isupper() and i[0] != "_" ] diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py index 0cd6335b51..47ac709b2a 100644 --- a/utils/check_backend_completeness.py +++ b/utils/check_backend_completeness.py @@ -89,20 +89,29 @@ def name(self): no_longer_missing = [] df_pa = ArrowDataFrame(MockDataFrame({"a": [1, 2, 3]}), backend_version=(13, 0)) - df_pd = nw.DataFrame( - MockDataFrame({"a": [1, 2, 3]}), is_polars=True, backend_version=(1,) + df_nw = nw.DataFrame( + MockDataFrame({"a": [1, 2, 3]}), + is_polars=True, + backend_version=(1,), + level="full", ) pa_methods = [f"DataFrame.{x}" for x in df_pa.__dir__() if not x.startswith("_")] - pd_methods = [f"DataFrame.{x}" for x in df_pd.__dir__() if not x.startswith("_")] - missing.extend([x for x in pd_methods if x not in pa_methods and x not in MISSING]) - no_longer_missing.extend([x for x in MISSING if x 
in pa_methods and x in pd_methods]) + nw_methods = [f"DataFrame.{x}" for x in df_nw.__dir__() if not x.startswith("_")] + missing.extend( + [ + x + for x in nw_methods + if x not in pa_methods and x not in MISSING and x not in {"level"} + ] + ) + no_longer_missing.extend([x for x in MISSING if x in pa_methods and x in nw_methods]) ser_pa = df_pa["a"] - ser_pd = df_pd["a"] + ser_pd = df_nw["a"] pa_methods = [f"Series.{x}" for x in ser_pa.__dir__() if not x.startswith("_")] - pd_methods = [f"Series.{x}" for x in ser_pd.__dir__() if not x.startswith("_")] - missing.extend([x for x in pd_methods if x not in pa_methods and x not in MISSING]) - no_longer_missing.extend([x for x in MISSING if x in pa_methods and x in pd_methods]) + nw_methods = [f"Series.{x}" for x in ser_pd.__dir__() if not x.startswith("_")] + missing.extend([x for x in nw_methods if x not in pa_methods and x not in MISSING]) + no_longer_missing.extend([x for x in MISSING if x in pa_methods and x in nw_methods]) if missing: print( diff --git a/utils/generate_random_versions.py b/utils/generate_random_versions.py index ecd483db39..ecb709c1a0 100644 --- a/utils/generate_random_versions.py +++ b/utils/generate_random_versions.py @@ -1,7 +1,7 @@ import random PANDAS_AND_NUMPY_VERSION = [ - ("1.0.5", "1.18.5"), + # ("1.0.5", "1.18.5"), # fails to build in CI # noqa: ERA001 ("1.1.5", "1.19.5"), ("1.2.5", "1.20.3"), ("1.3.5", "1.21.6"),