From 836b8cfeacb0e280d5373efeb77b039e93de41d6 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 24 Jul 2023 16:30:17 -0400 Subject: [PATCH] Support for type inference for DataFrames using the DataFrame Interchange Protocol (#3114) * Add DataFrame Interchange Protocol types from spec for type checking * Tests shouldn't fail when pyarrow and vegafusion aren't installed, skip instead * Compute parse_shorthand based on __dataframe__ when possible If the pyarrow data interchange module is available and the dataset has a __dataframe__ method, infer column types based on the DataFrame Interchange Protocol. Fall back to pandas implementation for pandas DataFrames if pyarrow is not available or pandas is older that 1.5. * Add comment explaining why we uninstall optional dependencies * Add pandas GitHub issue reference * Add changelog entry --- .github/workflows/build.yml | 8 + altair/utils/_dfi_types.py | 508 ++++++++++++++++++++++ altair/utils/core.py | 76 +++- altair/utils/data.py | 8 + doc/releases/changes.rst | 1 + tests/test_transformed_data.py | 8 + tests/utils/test_dataframe_interchange.py | 8 +- tests/utils/test_mimebundle.py | 7 + 8 files changed, 613 insertions(+), 11 deletions(-) create mode 100644 altair/utils/_dfi_types.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bc4774cb3..75aa922c1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -28,6 +28,14 @@ jobs: pip install .[dev] # pip install "selenium<4.3.0" # pip install altair_saver + - name: Maybe uninstall optional dependencies + # We uninstall pyarrow and vegafusion for one job to test that we have not + # accidentally introduced a hard dependency on these libraries. + # Uninstalling for Python 3.9 is an arbitrary choice. + # Also see https://github.com/altair-viz/altair/pull/3114 + if: ${{ matrix.python-version }}=="3.9" + run: | + pip uninstall -y pyarrow vegafusion vegafusion-python-embed - name: Test that schema generation has no effect run: | python tools/generate_schema_wrapper.py diff --git a/altair/utils/_dfi_types.py b/altair/utils/_dfi_types.py new file mode 100644 index 000000000..16b83fb4d --- /dev/null +++ b/altair/utils/_dfi_types.py @@ -0,0 +1,508 @@ +# DataFrame Interchange Protocol Types +# Copied from https://data-apis.org/dataframe-protocol/latest/API.html +# +# These classes are only for use in type signatures +from abc import ( + ABC, + abstractmethod, +) +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) + + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. 
+ + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple["Buffer", Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple["Buffer", Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple["Buffer", Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + categories: "Optional[Column]" + + +class Buffer(ABC): + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + + @property + @abstractmethod + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + pass + + @property + @abstractmethod + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + pass + + @abstractmethod + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + raise NotImplementedError("__dlpack__") + + @abstractmethod + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ + pass + + +class Column(ABC): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). 
+ + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + @abstractmethod + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + pass + + @property + @abstractmethod + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + pass + + @property + @abstractmethod + def dtype(self) -> Dtype: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. 
+ - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + pass + + @property + @abstractmethod + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. + + TBD: are there any other in-memory representations that are needed? + """ + pass + + @property + @abstractmethod + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. + """ + pass + + @property + @abstractmethod + def null_count(self) -> Optional[int]: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + pass + + @abstractmethod + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + pass + + +# def get_children(self) -> Iterable[Column]: +# """ +# Children columns underneath the column, each object in this iterator +# must adhere to the column specification. +# """ +# pass + + +class DataFrame(ABC): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. 
+ + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + version = 0 # version of the protocol + + @abstractmethod + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> "DataFrame": + """ + Construct a new exchange object, potentially changing the parameters. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + """ + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + pass + + @abstractmethod + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + pass + + @abstractmethod + def num_rows(self) -> Optional[int]: + # TODO: not happy with Optional, but need to flag it may be expensive + # why include it if it may be None - what do we expect consumers + # to do here? + """ + Return the number of rows in the DataFrame, if available. + """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + pass + + @abstractmethod + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + pass + + @abstractmethod + def get_column(self, i: int) -> Column: + """ + Return the column at the indicated position. + """ + pass + + @abstractmethod + def get_column_by_name(self, name: str) -> Column: + """ + Return the column whose name is the indicated name. + """ + pass + + @abstractmethod + def get_columns(self) -> Iterable[Column]: + """ + Return an iterator yielding the columns. + """ + pass + + @abstractmethod + def select_columns(self, indices: Sequence[int]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + pass + + @abstractmethod + def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + + Note that the producer must ensure that all columns are chunked the + same way. 
+ """ + pass diff --git a/altair/utils/core.py b/altair/utils/core.py index 4a375d254..1d4d6f17b 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -15,8 +15,10 @@ import jsonschema import pandas as pd import numpy as np +from pandas.core.interchange.dataframe_protocol import Column as PandasColumn from altair.utils.schemapi import SchemaBase +from altair.utils._dfi_types import Column, DtypeKind, DataFrame as DfiDataFrame if sys.version_info >= (3, 10): from typing import ParamSpec @@ -36,7 +38,7 @@ class _DataFrameLike(Protocol): - def __dataframe__(self, *args, **kwargs): + def __dataframe__(self, *args, **kwargs) -> DfiDataFrame: ... @@ -436,7 +438,7 @@ def sanitize_arrow_table(pa_table): def parse_shorthand( shorthand: Union[Dict[str, Any], str], - data: Optional[pd.DataFrame] = None, + data: Optional[Union[pd.DataFrame, _DataFrameLike]] = None, parse_aggregates: bool = True, parse_window_ops: bool = False, parse_timeunits: bool = True, @@ -516,6 +518,8 @@ def parse_shorthand( >>> parse_shorthand('count()', data) == {'aggregate': 'count', 'type': 'quantitative'} True """ + from altair.utils.data import pyarrow_available + if not shorthand: return {} @@ -574,14 +578,29 @@ def parse_shorthand( attrs["type"] = "temporal" # if data is specified and type is not, infer type from data - if isinstance(data, pd.DataFrame) and "type" not in attrs: - # Remove escape sequences so that types can be inferred for columns with special characters - if "field" in attrs and attrs["field"].replace("\\", "") in data.columns: - attrs["type"] = infer_vegalite_type(data[attrs["field"].replace("\\", "")]) - # ordered categorical dataframe columns return the type and sort order as a tuple - if isinstance(attrs["type"], tuple): - attrs["sort"] = attrs["type"][1] - attrs["type"] = attrs["type"][0] + if "type" not in attrs: + if pyarrow_available() and data is not None and hasattr(data, "__dataframe__"): + dfi = data.__dataframe__() + if "field" in attrs: + unescaped_field = attrs["field"].replace("\\", "") + if unescaped_field in dfi.column_names(): + column = dfi.get_column_by_name(unescaped_field) + attrs["type"] = infer_vegalite_type_for_dfi_column(column) + if isinstance(attrs["type"], tuple): + attrs["sort"] = attrs["type"][1] + attrs["type"] = attrs["type"][0] + elif isinstance(data, pd.DataFrame): + # Fallback if pyarrow is not installed or if pandas is older than 1.5 + # + # Remove escape sequences so that types can be inferred for columns with special characters + if "field" in attrs and attrs["field"].replace("\\", "") in data.columns: + attrs["type"] = infer_vegalite_type( + data[attrs["field"].replace("\\", "")] + ) + # ordered categorical dataframe columns return the type and sort order as a tuple + if isinstance(attrs["type"], tuple): + attrs["sort"] = attrs["type"][1] + attrs["type"] = attrs["type"][0] # If an unescaped colon is still present, it's often due to an incorrect data type specification # but could also be due to using a column name with ":" in it. 
@@ -602,6 +621,43 @@ def parse_shorthand( return attrs +def infer_vegalite_type_for_dfi_column( + column: Union[Column, PandasColumn], +) -> Union[_InferredVegaLiteType, Tuple[_InferredVegaLiteType, list]]: + from pyarrow.interchange.from_dataframe import column_to_array + + try: + kind = column.dtype[0] + except NotImplementedError as e: + # Edge case hack: + # dtype access fails for pandas column with datetime64[ns, UTC] type, + # but all we need to know is that its temporal, so check the + # error message for the presence of datetime64. + # + # See https://github.com/pandas-dev/pandas/issues/54239 + if "datetime64" in e.args[0]: + return "temporal" + raise e + + if ( + kind == DtypeKind.CATEGORICAL + and column.describe_categorical["is_ordered"] + and column.describe_categorical["categories"] is not None + ): + # Treat ordered categorical column as Vega-Lite ordinal + categories_column = column.describe_categorical["categories"] + categories_array = column_to_array(categories_column) + return "ordinal", categories_array.to_pylist() + if kind in (DtypeKind.STRING, DtypeKind.CATEGORICAL, DtypeKind.BOOL): + return "nominal" + elif kind in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT): + return "quantitative" + elif kind == DtypeKind.DATETIME: + return "temporal" + else: + raise ValueError(f"Unexpected DtypeKind: {kind}") + + def use_signature(Obj: Callable[_P, Any]): """Apply call signature and documentation of Obj to the decorated method""" diff --git a/altair/utils/data.py b/altair/utils/data.py index 6bce09e13..64bf782f4 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -368,3 +368,11 @@ def import_pyarrow_interchange() -> ModuleType: "The installed version of 'pyarrow' does not meet the minimum requirement of version 11.0.0. " "Please update 'pyarrow' to use the DataFrame Interchange Protocol." 
) from err + + +def pyarrow_available() -> bool: + try: + import_pyarrow_interchange() + return True + except ImportError: + return False diff --git a/doc/releases/changes.rst b/doc/releases/changes.rst index b98d01a11..cf0ec6f5b 100644 --- a/doc/releases/changes.rst +++ b/doc/releases/changes.rst @@ -8,6 +8,7 @@ Version 5.1.0 (unreleased month date, year) Enhancements ~~~~~~~~~~~~ + - Support field encoding inference for objects that support the DataFrame Interchange Protocol (#3114) Bug Fixes ~~~~~~~~~ diff --git a/tests/test_transformed_data.py b/tests/test_transformed_data.py index 709961a34..64675486e 100644 --- a/tests/test_transformed_data.py +++ b/tests/test_transformed_data.py @@ -5,7 +5,13 @@ import pkgutil import pytest +try: + import vegafusion as vf # type: ignore +except ImportError: + vf = None + +@pytest.mark.skipif(vf is None, reason="vegafusion not installed") # fmt: off @pytest.mark.parametrize("filename,rows,cols", [ ("annual_weather_heatmap.py", 366, ["monthdate_date_end", "max_temp_max"]), @@ -72,6 +78,7 @@ def test_primitive_chart_examples(filename, rows, cols, to_reconstruct): assert set(cols).issubset(set(df.columns)) +@pytest.mark.skipif(vf is None, reason="vegafusion not installed") # fmt: off @pytest.mark.parametrize("filename,all_rows,all_cols", [ ("errorbars_with_std.py", [10, 10], [["upper_yield"], ["extent_yield"]]), @@ -124,6 +131,7 @@ def test_compound_chart_examples(filename, all_rows, all_cols, to_reconstruct): assert set(cols).issubset(set(df.columns)) +@pytest.mark.skipif(vf is None, reason="vegafusion not installed") @pytest.mark.parametrize("to_reconstruct", [True, False]) def test_transformed_data_exclude(to_reconstruct): source = data.wheat() diff --git a/tests/utils/test_dataframe_interchange.py b/tests/utils/test_dataframe_interchange.py index 6fc3a393f..9803a0833 100644 --- a/tests/utils/test_dataframe_interchange.py +++ b/tests/utils/test_dataframe_interchange.py @@ -1,10 +1,14 @@ from datetime import datetime -import pyarrow as pa import pandas as pd import pytest import sys import os +try: + import pyarrow as pa +except ImportError: + pa = None + from altair.utils.data import to_values @@ -25,6 +29,7 @@ def windows_has_tzdata(): sys.platform == "win32" and not windows_has_tzdata(), reason="Timezone database is not installed on Windows", ) +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") def test_arrow_timestamp_conversion(): """Test that arrow timestamp values are converted to ISO-8601 strings""" data = { @@ -44,6 +49,7 @@ def test_arrow_timestamp_conversion(): assert values == expected_values +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") def test_duration_raises(): td = pd.timedelta_range(0, periods=3, freq="h") df = pd.DataFrame(td).reset_index() diff --git a/tests/utils/test_mimebundle.py b/tests/utils/test_mimebundle.py index d26889cb0..541ac483f 100644 --- a/tests/utils/test_mimebundle.py +++ b/tests/utils/test_mimebundle.py @@ -14,6 +14,11 @@ except ImportError: vlc = None +try: + import vegafusion as vf # type: ignore +except ImportError: + vf = None + @pytest.fixture def vegalite_spec(): @@ -242,6 +247,7 @@ def check_pre_transformed_vega_spec(vega_spec): assert len(data_0.get("transform", [])) == 0 +@pytest.mark.skipif(vf is None, reason="vegafusion is not installed") def test_vegafusion_spec_to_vega_mime_bundle(vegalite_spec): with alt.data_transformers.enable("vegafusion"): bundle = spec_to_mimebundle( @@ -254,6 +260,7 @@ def test_vegafusion_spec_to_vega_mime_bundle(vegalite_spec): 
check_pre_transformed_vega_spec(vega_spec) +@pytest.mark.skipif(vf is None, reason="vegafusion is not installed") def test_vegafusion_chart_to_vega_mime_bundle(vegalite_spec): chart = alt.Chart.from_dict(vegalite_spec) with alt.data_transformers.enable("vegafusion"), alt.renderers.enable("json"):