Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: make pandas and NumPy optional dependencies, don't require PyArrow for plotting with Polars/Modin/cuDF #3452

Merged
merged 24 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions altair/_magics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import IPython
from IPython.core import magic_arguments
import pandas as pd
from narwhals.dependencies import is_pandas_dataframe as _is_pandas_dataframe

from altair.vegalite import v5 as vegalite_v5

Expand Down Expand Up @@ -39,7 +39,7 @@ def _prepare_data(data, data_transformers):
"""Convert input data to data for use within schema"""
if data is None or isinstance(data, dict):
return data
elif isinstance(data, pd.DataFrame):
elif _is_pandas_dataframe(data):
if func := data_transformers.get():
data = func(data)
return data
Expand Down
12 changes: 6 additions & 6 deletions altair/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from .core import (
infer_vegalite_type,
infer_vegalite_type_for_pandas,
infer_encoding_types,
sanitize_dataframe,
sanitize_arrow_table,
sanitize_pandas_dataframe,
sanitize_narwhals_dataframe,
parse_shorthand,
use_signature,
update_nested,
Expand All @@ -23,10 +23,10 @@
"Undefined",
"display_traceback",
"infer_encoding_types",
"infer_vegalite_type",
"infer_vegalite_type_for_pandas",
"parse_shorthand",
"sanitize_arrow_table",
"sanitize_dataframe",
"sanitize_narwhals_dataframe",
"sanitize_pandas_dataframe",
"spec_to_html",
"update_nested",
"use_signature",
Expand Down
14 changes: 8 additions & 6 deletions altair/utils/_vegafusion_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
Callable,
)

import narwhals.stable.v1 as nw

from altair.utils._importers import import_vegafusion
from altair.utils.core import DataFrameLike
from altair.utils.data import (
DataType,
ToValuesReturnType,
Expand All @@ -22,9 +23,9 @@
)
from altair.vegalite.data import default_data_transformer


if TYPE_CHECKING:
import pandas as pd
from narwhals.typing import IntoDataFrame
from altair.utils.core import DataFrameLike
from vegafusion.runtime import ChartState # type: ignore

# Temporary storage for dataframes that have been extracted
Expand Down Expand Up @@ -60,7 +61,7 @@ def vegafusion_data_transformer(

@overload
def vegafusion_data_transformer(
data: dict | pd.DataFrame | SupportsGeoInterface, max_rows: int = ...
data: dict | IntoDataFrame | SupportsGeoInterface, max_rows: int = ...
) -> _VegaFusionReturnType: ...


Expand All @@ -70,9 +71,10 @@ def vegafusion_data_transformer(
"""VegaFusion Data Transformer"""
if data is None:
return vegafusion_data_transformer
elif isinstance(data, DataFrameLike) and not isinstance(data, SupportsGeoInterface):
elif isinstance(data, nw.DataFrame) and not isinstance(data, SupportsGeoInterface):
table_name = f"table_{uuid.uuid4()}".replace("-", "_")
extracted_inline_tables[table_name] = data
# vegafusion doesn't support Narwhals, so we extract the native object.
extracted_inline_tables[table_name] = nw.to_native(data)
return {"url": VEGAFUSION_PREFIX + table_name}
else:
# Use default transformer for geo interface objects
Expand Down
180 changes: 90 additions & 90 deletions altair/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,10 @@
from operator import itemgetter

import jsonschema
import pandas as pd
import numpy as np
from pandas.api.types import infer_dtype
import narwhals.stable.v1 as nw
from narwhals.dependencies import is_pandas_dataframe as _is_pandas_dataframe

from altair.utils.schemapi import SchemaBase, Undefined
from altair.utils._dfi_types import Column, DtypeKind, DataFrame as DfiDataFrame

if sys.version_info >= (3, 10):
from typing import ParamSpec
Expand All @@ -43,8 +41,11 @@
if TYPE_CHECKING:
from types import ModuleType
import typing as t
from pandas.core.interchange.dataframe_protocol import Column as PandasColumn
import pyarrow as pa
from altair.vegalite.v5.schema._typing import StandardType_T as InferredVegaLiteType
from altair.utils._dfi_types import DataFrame as DfiDataFrame
from altair.utils.data import DataType
from narwhals.typing import IntoExpr, IntoDataFrameT
import pandas as pd

V = TypeVar("V")
P = ParamSpec("P")
Expand Down Expand Up @@ -198,10 +199,7 @@ def __dataframe__(
]


InferredVegaLiteType = Literal["ordinal", "nominal", "quantitative", "temporal"]


def infer_vegalite_type(
def infer_vegalite_type_for_pandas(
data: object,
) -> InferredVegaLiteType | tuple[InferredVegaLiteType, list[Any]]:
"""
Expand All @@ -212,6 +210,9 @@ def infer_vegalite_type(
----------
data: object
"""
# This is safe to import here, as this function is only called on pandas input.
from pandas.api.types import infer_dtype

typ = infer_dtype(data, skipna=False)

if typ in {
Expand Down Expand Up @@ -297,13 +298,16 @@ def sanitize_geo_interface(geo: t.MutableMapping[Any, Any]) -> dict[str, Any]:


def numpy_is_subtype(dtype: Any, subtype: Any) -> bool:
    """Return whether ``dtype`` is a NumPy sub-dtype of ``subtype``.

    Unlike ``np.issubdtype`` itself, this never raises: inputs that NumPy
    cannot interpret as dtypes simply yield ``False``.
    """
    # Deferred import keeps NumPy optional: this helper is only reached
    # when the caller is already handling NumPy objects.
    import numpy as np

    try:
        result = bool(np.issubdtype(dtype, subtype))
    except (NotImplementedError, TypeError):
        result = False
    return result


def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
def sanitize_pandas_dataframe(df: pd.DataFrame) -> pd.DataFrame:
"""Sanitize a DataFrame to prepare it for serialization.

* Make a copy
Expand All @@ -320,6 +324,11 @@ def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
* convert dedicated string column to objects and replace NaN with None
* Raise a ValueError for TimeDelta dtypes
"""
# This is safe to import here, as this function is only called on pandas input.
# NumPy is a required dependency of pandas so is also safe to import.
import pandas as pd
import numpy as np

df = df.copy()

if isinstance(df.columns, pd.RangeIndex):
Expand Down Expand Up @@ -429,30 +438,57 @@ def to_list_if_array(val):
return df


def sanitize_arrow_table(pa_table: pa.Table) -> pa.Table:
"""Sanitize arrow table for JSON serialization"""
import pyarrow as pa
import pyarrow.compute as pc

arrays = []
schema = pa_table.schema
for name in schema.names:
array = pa_table[name]
dtype_name = str(schema.field(name).type)
if dtype_name.startswith(("timestamp", "date")):
arrays.append(pc.strftime(array))
elif dtype_name.startswith("duration"):
def sanitize_narwhals_dataframe(
    data: nw.DataFrame[IntoDataFrameT],
) -> nw.DataFrame[IntoDataFrameT]:
    """Sanitize a narwhals.DataFrame for JSON serialization.

    Temporal columns are rendered to ISO-8601 strings; all other columns
    pass through unchanged.

    Parameters
    ----------
    data
        The narwhals-wrapped eager DataFrame to sanitize.

    Raises
    ------
    ValueError
        If the frame contains a ``Duration`` column, which Vega-Lite
        cannot represent.
    """
    schema = data.schema
    columns: list[IntoExpr] = []
    # See https://github.com/vega/altair/issues/1027 for why this is necessary.
    local_iso_fmt_string = "%Y-%m-%dT%H:%M:%S"
    for name, dtype in schema.items():
        if dtype == nw.Date:
            # Polars doesn't allow formatting `Date` with time directives.
            # The date -> datetime cast is extremely fast compared with `to_string`.
            columns.append(
                nw.col(name).cast(nw.Datetime).dt.to_string(local_iso_fmt_string)
            )
        elif dtype == nw.Datetime:
            # `%.f` appends the fractional-seconds component to the ISO format.
            columns.append(nw.col(name).dt.to_string(f"{local_iso_fmt_string}%.f"))
        elif dtype == nw.Duration:
            msg = (
                f'Field "{name}" has type "{dtype}" which is '
                "not supported by Altair. Please convert to "
                "either a timestamp or a numerical value."
            )
            raise ValueError(msg)
        else:
            # Non-temporal columns are selected as-is.
            columns.append(name)
    return data.select(columns)

return pa.Table.from_arrays(arrays, names=schema.names)

def narwhalify(data: DataType) -> nw.DataFrame[Any]:
    """Wrap `data` in `narwhals.DataFrame`.

    If `data` is not supported by Narwhals, but it is convertible
    to a PyArrow table via the DataFrame Interchange Protocol, then
    first convert to a PyArrow Table, and then wrap in
    `narwhals.DataFrame`.

    Raises
    ------
    TypeError
        If `data` is neither Narwhals-compatible nor a DataFrame
        Interchange Protocol object.
    """
    if isinstance(data, nw.DataFrame):
        # Early return if already a Narwhals DataFrame.
        return data
    # `strict=False` returns `data` as-is when the object cannot be
    # converted, so the interchange-protocol fallback below still runs.
    data = nw.from_native(data, eager_only=True, strict=False)
    if isinstance(data, nw.DataFrame):
        return data
    if isinstance(data, DataFrameLike):
        # Local import to avoid a circular import with altair.utils.data.
        from altair.utils.data import arrow_table_from_dfi_dataframe

        # NOTE(review): this materializes every column into memory as a
        # PyArrow table; interchange objects with lazy columns lose that
        # laziness here — confirm acceptable for vegafusion callers.
        pa_table = arrow_table_from_dfi_dataframe(data)
        return nw.from_native(pa_table, eager_only=True)
    msg = f"Unsupported data type: {type(data)}"
    raise TypeError(msg)


def parse_shorthand(
Expand Down Expand Up @@ -498,6 +534,7 @@ def parse_shorthand(

Examples
--------
>>> import pandas as pd
>>> data = pd.DataFrame({'foo': ['A', 'B', 'A', 'B'],
... 'bar': [1, 2, 3, 4]})

Expand Down Expand Up @@ -537,7 +574,7 @@ def parse_shorthand(
>>> parse_shorthand('count()', data) == {'aggregate': 'count', 'type': 'quantitative'}
True
"""
from altair.utils._importers import pyarrow_available
from altair.utils.data import is_data_type

if not shorthand:
return {}
Expand Down Expand Up @@ -597,39 +634,20 @@ def parse_shorthand(
attrs["type"] = "temporal"

# if data is specified and type is not, infer type from data
if "type" not in attrs:
if pyarrow_available() and data is not None and isinstance(data, DataFrameLike):
dfi = data.__dataframe__()
if "field" in attrs:
unescaped_field = attrs["field"].replace("\\", "")
if unescaped_field in dfi.column_names():
column = dfi.get_column_by_name(unescaped_field)
try:
attrs["type"] = infer_vegalite_type_for_dfi_column(column)
except (NotImplementedError, AttributeError, ValueError):
# Fall back to pandas-based inference.
# Note: The AttributeError catch is a workaround for
# https://github.com/pandas-dev/pandas/issues/55332
if isinstance(data, pd.DataFrame):
attrs["type"] = infer_vegalite_type(data[unescaped_field])
else:
raise

if isinstance(attrs["type"], tuple):
attrs["sort"] = attrs["type"][1]
attrs["type"] = attrs["type"][0]
elif isinstance(data, pd.DataFrame):
# Fallback if pyarrow is not installed or if pandas is older than 1.5
#
# Remove escape sequences so that types can be inferred for columns with special characters
if "field" in attrs and attrs["field"].replace("\\", "") in data.columns:
attrs["type"] = infer_vegalite_type(
data[attrs["field"].replace("\\", "")]
)
# ordered categorical dataframe columns return the type and sort order as a tuple
if isinstance(attrs["type"], tuple):
attrs["sort"] = attrs["type"][1]
attrs["type"] = attrs["type"][0]
if "type" not in attrs and is_data_type(data):
unescaped_field = attrs["field"].replace("\\", "")
data_nw = narwhalify(data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something important that I think we're losing here is that previously we were checking the types of columns in DataFrame Interchange Protocol DataFrames without loading them all into memory as pyarrow (which is what narwhalify does for types that narwhals doesn't support). Ibis has an optimization where calling dfi.get_column_by_name is a metadata-only operation that doesn't trigger loading all columns into memory.

When used with plain Altair, the full dataset is loaded into memory anyway during the to_values() calculation. But when the "vegafusion" data transformer is enabled, VegaFusion is able to detect which columns are needed and it will request only the required columns when loading into pyarrow (See here).

So unless/until narwhals can support wrapping DataFrame Interchange Protocol DataFrames directly, I think we need to avoid the arrow conversion here and fall back to the legacy infer_vegalite_type_for_dfi_column behavior for this case.

if unescaped_field in data_nw.columns:
column = data_nw[unescaped_field]
if column.dtype in {nw.Object, nw.Unknown} and _is_pandas_dataframe(
nw.to_native(data_nw)
):
attrs["type"] = infer_vegalite_type_for_pandas(nw.to_native(column))
else:
attrs["type"] = infer_vegalite_type_for_narwhals(column)
if isinstance(attrs["type"], tuple):
attrs["sort"] = attrs["type"][1]
attrs["type"] = attrs["type"][0]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Really nice cleanup!


# If an unescaped colon is still present, it's often due to an incorrect data type specification
# but could also be due to using a column name with ":" in it.
Expand All @@ -650,41 +668,23 @@ def parse_shorthand(
return attrs


def infer_vegalite_type_for_dfi_column(
column: Column | PandasColumn,
) -> InferredVegaLiteType | tuple[InferredVegaLiteType, list[Any]]:
from pyarrow.interchange.from_dataframe import column_to_array

try:
kind = column.dtype[0]
except NotImplementedError as e:
# Edge case hack:
# dtype access fails for pandas column with datetime64[ns, UTC] type,
# but all we need to know is that its temporal, so check the
# error message for the presence of datetime64.
#
# See https://github.com/pandas-dev/pandas/issues/54239
if "datetime64" in e.args[0] or "timestamp" in e.args[0]:
return "temporal"
raise e

def infer_vegalite_type_for_narwhals(
    column: nw.Series,
) -> InferredVegaLiteType | tuple[InferredVegaLiteType, list]:
    """Infer the Vega-Lite encoding type for a narwhals Series.

    Parameters
    ----------
    column
        The narwhals Series whose dtype (and, for ordered categoricals,
        category order) determines the inferred type.

    Returns
    -------
    One of "ordinal", "nominal", "quantitative" or "temporal"; for
    ordered categoricals, a ``(type, sort_order)`` tuple so callers can
    preserve the category order.

    Raises
    ------
    ValueError
        If the dtype cannot be mapped to a Vega-Lite type.
    """
    dtype = column.dtype
    if (
        nw.is_ordered_categorical(column)
        and not (categories := column.cat.get_categories()).is_empty()
    ):
        # Ordered categoricals become "ordinal", carrying the category
        # order along so charts can sort accordingly.
        return "ordinal", categories.to_list()
    if dtype in {nw.String, nw.Categorical, nw.Boolean}:
        return "nominal"
    elif dtype.is_numeric():
        return "quantitative"
    elif dtype in {nw.Datetime, nw.Date}:
        return "temporal"
    else:
        # Message previously said "DtypeKind" — a leftover from the old
        # interchange-protocol enum; this is a narwhals dtype.
        msg = f"Unexpected dtype: {dtype}"
        raise ValueError(msg)


Expand Down
Loading