Skip to content

Commit

Permalink
perf: use fastpath in DataFrame.to_numpy for pandas, improve performance for `DataFrame.schema` for pandas, use fewer values to sniff dtype for pandas objects (#1929)
Browse files (browse the repository at this point in the history)
  • Loading branch information
MarcoGorelli authored Feb 5, 2025
1 parent 71a5bc5 commit a6479aa
Show file tree
Hide file tree
Showing 11 changed files with 159 additions and 106 deletions.
81 changes: 40 additions & 41 deletions .github/workflows/downstream_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,48 +151,47 @@ jobs:
cd scikit-lego
pytest -n auto --disable-warnings --cov=sklego -m "not cvxpy and not formulaic and not umap"
# temporarily un-enabled as it's been failing for some time due to unrelated reasons
# shiny:
# strategy:
# matrix:
# python-version: ["3.12"]
# os: [ubuntu-latest]
shiny:
strategy:
matrix:
python-version: ["3.12"]
os: [ubuntu-latest]

# runs-on: ${{ matrix.os }}
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: ${{ matrix.python-version }}
# - name: Install uv
# uses: astral-sh/setup-uv@v5
# with:
# enable-cache: "true"
# cache-suffix: ${{ matrix.python-version }}
# cache-dependency-glob: "pyproject.toml"
# - name: clone-shiny
# run: |
# git clone https://github.com/posit-dev/py-shiny.git
# cd py-shiny
# git log
# - name: install-basics
# run: uv pip install --upgrade tox virtualenv setuptools --system
# - name: install-shiny-dev
# env:
# UV_SYSTEM_PYTHON: 1
# run: |
# cd py-shiny
# make narwhals-install-shiny
# - name: install-narwhals-dev
# run: |
# uv pip uninstall narwhals --system
# uv pip install -e . --system
# - name: show-deps
# run: uv pip freeze
# - name: Run `make narwhals-test-integration`
# run: |
# cd py-shiny
# make narwhals-test-integration
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
enable-cache: "true"
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: clone-shiny
run: |
git clone https://github.com/posit-dev/py-shiny.git
cd py-shiny
git log
- name: install-basics
run: uv pip install --upgrade tox virtualenv setuptools --system
- name: install-shiny-dev
env:
UV_SYSTEM_PYTHON: 1
run: |
cd py-shiny
make narwhals-install-shiny
- name: install-narwhals-dev
run: |
uv pip uninstall narwhals --system
uv pip install -e . --system
- name: show-deps
run: uv pip freeze
- name: Run `make narwhals-test-integration`
run: |
cd py-shiny
make narwhals-test-integration
tea-tasting:
strategy:
Expand Down
3 changes: 3 additions & 0 deletions docs/backcompat.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ before making any change.

The following are differences between the main Narwhals namespace and `narwhals.stable.v1`:

- Since Narwhals 1.24.1, an empty or all-null object-dtype pandas Series is inferred to
be of dtype `String`. Previously, it would have been inferred as `Object`.

- Since Narwhals 1.23:

- Passing an `ibis.Table` to `from_native` returns a `LazyFrame`. In
Expand Down
3 changes: 2 additions & 1 deletion narwhals/_dask/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,10 @@ def drop_nulls(self: Self, subset: list[str] | None) -> Self:

@property
def schema(self: Self) -> dict[str, DType]:
native_dtypes = self._native_frame.dtypes
return {
col: native_to_narwhals_dtype(
self._native_frame[col], self._version, self._implementation
native_dtypes[col], self._version, self._implementation
)
for col in self._native_frame.columns
}
Expand Down
6 changes: 4 additions & 2 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def median(self: Self) -> Self:
from narwhals.exceptions import InvalidOperationError

def func(s: dx.Series) -> dx.Series:
dtype = native_to_narwhals_dtype(s, self._version, Implementation.DASK)
dtype = native_to_narwhals_dtype(s.dtype, self._version, Implementation.DASK)
if not dtype.is_numeric():
msg = "`median` operation not supported for non-numeric input type."
raise InvalidOperationError(msg)
Expand Down Expand Up @@ -553,7 +553,9 @@ def is_null(self: Self) -> Self:

def is_nan(self: Self) -> Self:
def func(_input: dx.Series) -> dx.Series:
dtype = native_to_narwhals_dtype(_input, self._version, self._implementation)
dtype = native_to_narwhals_dtype(
_input.dtype, self._version, self._implementation
)
if dtype.is_numeric():
return _input != _input # noqa: PLR0124
msg = f"`.is_nan` only supported for numeric dtypes and not {dtype}, did you mean `.is_null`?"
Expand Down
4 changes: 2 additions & 2 deletions narwhals/_dask/expr_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def replace_time_zone(self: Self, time_zone: str | None) -> DaskExpr:
def convert_time_zone(self: Self, time_zone: str) -> DaskExpr:
def func(s: dx.Series, time_zone: str) -> dx.Series:
dtype = native_to_narwhals_dtype(
s, self._compliant_expr._version, Implementation.DASK
s.dtype, self._compliant_expr._version, Implementation.DASK
)
if dtype.time_zone is None: # type: ignore[attr-defined]
return s.dt.tz_localize("UTC").dt.tz_convert(time_zone)
Expand All @@ -148,7 +148,7 @@ def func(s: dx.Series, time_zone: str) -> dx.Series:
def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> DaskExpr:
def func(s: dx.Series, time_unit: Literal["ns", "us", "ms"]) -> dx.Series:
dtype = native_to_narwhals_dtype(
s, self._compliant_expr._version, Implementation.DASK
s.dtype, self._compliant_expr._version, Implementation.DASK
)
is_pyarrow_dtype = "pyarrow" in str(dtype)
mask_na = s.isna()
Expand Down
58 changes: 42 additions & 16 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,19 @@
from typing import Sequence
from typing import overload

import numpy as np

from narwhals._expression_parsing import evaluate_into_exprs
from narwhals._pandas_like.series import PANDAS_TO_NUMPY_DTYPE_MISSING
from narwhals._pandas_like.series import PandasLikeSeries
from narwhals._pandas_like.utils import broadcast_and_extract_dataframe_comparand
from narwhals._pandas_like.utils import broadcast_series
from narwhals._pandas_like.utils import check_column_names_are_unique
from narwhals._pandas_like.utils import convert_str_slice_to_int_slice
from narwhals._pandas_like.utils import create_compliant_series
from narwhals._pandas_like.utils import horizontal_concat
from narwhals._pandas_like.utils import native_to_narwhals_dtype
from narwhals._pandas_like.utils import object_native_to_narwhals_dtype
from narwhals._pandas_like.utils import pivot_table
from narwhals._pandas_like.utils import rename
from narwhals._pandas_like.utils import select_columns_by_name
Expand All @@ -36,14 +41,12 @@
from pathlib import Path
from types import ModuleType

import numpy as np
import pandas as pd
import polars as pl
from typing_extensions import Self

from narwhals._pandas_like.group_by import PandasLikeGroupBy
from narwhals._pandas_like.namespace import PandasLikeNamespace
from narwhals._pandas_like.series import PandasLikeSeries
from narwhals._pandas_like.typing import IntoPandasLikeExpr
from narwhals.dtypes import DType
from narwhals.typing import SizeUnit
Expand All @@ -52,6 +55,31 @@
from narwhals.typing import CompliantDataFrame
from narwhals.typing import CompliantLazyFrame

CLASSICAL_NUMPY_DTYPES = frozenset(
[
np.dtype("float64"),
np.dtype("float32"),
np.dtype("int64"),
np.dtype("int32"),
np.dtype("int16"),
np.dtype("int8"),
np.dtype("uint64"),
np.dtype("uint32"),
np.dtype("uint16"),
np.dtype("uint8"),
np.dtype("bool"),
np.dtype("datetime64[s]"),
np.dtype("datetime64[ms]"),
np.dtype("datetime64[us]"),
np.dtype("datetime64[ns]"),
np.dtype("timedelta64[s]"),
np.dtype("timedelta64[ms]"),
np.dtype("timedelta64[us]"),
np.dtype("timedelta64[ns]"),
np.dtype("object"),
]
)


class PandasLikeDataFrame(CompliantDataFrame, CompliantLazyFrame):
# --- not in the spec ---
Expand Down Expand Up @@ -120,8 +148,6 @@ def _from_native_frame(
)

def get_column(self: Self, name: str) -> PandasLikeSeries:
from narwhals._pandas_like.series import PandasLikeSeries

return PandasLikeSeries(
self._native_frame[name],
implementation=self._implementation,
Expand Down Expand Up @@ -179,8 +205,6 @@ def __getitem__(
item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) # type: ignore[assignment]

if isinstance(item, str):
from narwhals._pandas_like.series import PandasLikeSeries

return PandasLikeSeries(
self._native_frame[item],
implementation=self._implementation,
Expand Down Expand Up @@ -238,8 +262,6 @@ def __getitem__(
raise TypeError(msg) # pragma: no cover

elif isinstance(item, tuple) and len(item) == 2:
from narwhals._pandas_like.series import PandasLikeSeries

if isinstance(item[1], str):
item = (item[0], self._native_frame.columns.get_loc(item[1])) # type: ignore[assignment]
native_series = self._native_frame.iloc[item]
Expand Down Expand Up @@ -344,8 +366,13 @@ def iter_rows(

@property
def schema(self: Self) -> dict[str, DType]:
native_dtypes = self._native_frame.dtypes
return {
col: native_to_narwhals_dtype(
native_dtypes[col], self._version, self._implementation
)
if native_dtypes[col] != "object"
else object_native_to_narwhals_dtype(
self._native_frame[col], self._version, self._implementation
)
for col in self._native_frame.columns
Expand Down Expand Up @@ -820,8 +847,6 @@ def shape(self: Self) -> tuple[int, int]:
return self._native_frame.shape # type: ignore[no-any-return]

def to_dict(self: Self, *, as_series: bool) -> dict[str, Any]:
from narwhals._pandas_like.series import PandasLikeSeries

if as_series:
return {
col: PandasLikeSeries(
Expand All @@ -835,7 +860,12 @@ def to_dict(self: Self, *, as_series: bool) -> dict[str, Any]:
return self._native_frame.to_dict(orient="list") # type: ignore[no-any-return]

def to_numpy(self: Self, dtype: Any = None, copy: bool | None = None) -> np.ndarray:
from narwhals._pandas_like.series import PANDAS_TO_NUMPY_DTYPE_MISSING
native_dtypes = self._native_frame.dtypes
if native_dtypes.isin(CLASSICAL_NUMPY_DTYPES).all():
# Fast path, no conversions necessary.
if dtype is not None:
return self._native_frame.to_numpy(dtype=dtype, copy=copy)
return self._native_frame.to_numpy(copy=copy)

if copy is None:
# pandas default differs from Polars, but cuDF default is True
Expand Down Expand Up @@ -865,7 +895,7 @@ def to_numpy(self: Self, dtype: Any = None, copy: bool | None = None) -> np.ndar
# so we cast each Series to numpy and let numpy find a common dtype.
# If there aren't any dtypes where `to_numpy()` is "broken" (i.e. it
# returns Object) then we just call `to_numpy()` on the DataFrame.
for col_dtype in df.dtypes:
for col_dtype in native_dtypes:
if str(col_dtype) in PANDAS_TO_NUMPY_DTYPE_MISSING:
import numpy as np

Expand Down Expand Up @@ -913,8 +943,6 @@ def write_csv(self: Self, file: str | Path | BytesIO | None) -> str | None:

# --- descriptive ---
def is_duplicated(self: Self) -> PandasLikeSeries:
from narwhals._pandas_like.series import PandasLikeSeries

return PandasLikeSeries(
self._native_frame.duplicated(keep=False),
implementation=self._implementation,
Expand All @@ -926,8 +954,6 @@ def is_empty(self: Self) -> bool:
return self._native_frame.empty # type: ignore[no-any-return]

def is_unique(self: Self) -> PandasLikeSeries:
from narwhals._pandas_like.series import PandasLikeSeries

return PandasLikeSeries(
~self._native_frame.duplicated(keep=False),
implementation=self._implementation,
Expand Down
10 changes: 8 additions & 2 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from narwhals._pandas_like.utils import narwhals_to_native_dtype
from narwhals._pandas_like.utils import native_series_from_iterable
from narwhals._pandas_like.utils import native_to_narwhals_dtype
from narwhals._pandas_like.utils import object_native_to_narwhals_dtype
from narwhals._pandas_like.utils import rename
from narwhals._pandas_like.utils import select_columns_by_name
from narwhals._pandas_like.utils import set_index
Expand Down Expand Up @@ -179,8 +180,13 @@ def shape(self: Self) -> tuple[int]:

@property
def dtype(self: Self) -> DType:
return native_to_narwhals_dtype(
self._native_series, self._version, self._implementation
native_dtype = self._native_series.dtype
return (
native_to_narwhals_dtype(native_dtype, self._version, self._implementation)
if native_dtype != "object"
else object_native_to_narwhals_dtype(
self._native_series, self._version, self._implementation
)
)

def ewm_mean(
Expand Down
Loading

0 comments on commit a6479aa

Please sign in to comment.