From 836b8cfeacb0e280d5373efeb77b039e93de41d6 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 24 Jul 2023 16:30:17 -0400 Subject: [PATCH] Support for type inference for DataFrames using the DataFrame Interchange Protocol (#3114) * Add DataFrame Interchange Protocol types from spec for type checking * Tests shouldn't fail when pyarrow and vegafusion aren't installed, skip instead * Compute parse_shorthand based on __dataframe__ when possible If the pyarrow data interchange module is available and the dataset has a __dataframe__ method, infer column types based on the DataFrame Interchange Protocol. Fall back to pandas implementation for pandas DataFrames if pyarrow is not available or pandas is older that 1.5. * Add comment explaining why we uninstall optional dependencies * Add pandas GitHub issue reference * Add changelog entry --- .github/workflows/build.yml | 8 + altair/utils/_dfi_types.py | 508 ++++++++++++++++++++++ altair/utils/core.py | 76 +++- altair/utils/data.py | 8 + doc/releases/changes.rst | 1 + tests/test_transformed_data.py | 8 + tests/utils/test_dataframe_interchange.py | 8 +- tests/utils/test_mimebundle.py | 7 + 8 files changed, 613 insertions(+), 11 deletions(-) create mode 100644 altair/utils/_dfi_types.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bc4774cb3..75aa922c1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -28,6 +28,14 @@ jobs: pip install .[dev] # pip install "selenium<4.3.0" # pip install altair_saver + - name: Maybe uninstall optional dependencies + # We uninstall pyarrow and vegafusion for one job to test that we have not + # accidentally introduced a hard dependency on these libraries. + # Uninstalling for Python 3.9 is an arbitrary choice. + # Also see https://github.com/altair-viz/altair/pull/3114 + if: ${{ matrix.python-version }}=="3.9" + run: | + pip uninstall -y pyarrow vegafusion vegafusion-python-embed - name: Test that schema generation has no effect run: | python tools/generate_schema_wrapper.py diff --git a/altair/utils/_dfi_types.py b/altair/utils/_dfi_types.py new file mode 100644 index 000000000..16b83fb4d --- /dev/null +++ b/altair/utils/_dfi_types.py @@ -0,0 +1,508 @@ +# DataFrame Interchange Protocol Types +# Copied from https://data-apis.org/dataframe-protocol/latest/API.html +# +# These classes are only for use in type signatures +from abc import ( + ABC, + abstractmethod, +) +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) + + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. 
+ + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple["Buffer", Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple["Buffer", Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple["Buffer", Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + categories: "Optional[Column]" + + +class Buffer(ABC): + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + + @property + @abstractmethod + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + pass + + @property + @abstractmethod + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + pass + + @abstractmethod + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + raise NotImplementedError("__dlpack__") + + @abstractmethod + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ + pass + + +class Column(ABC): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). 
+ + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + @abstractmethod + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + pass + + @property + @abstractmethod + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + pass + + @property + @abstractmethod + def dtype(self) -> Dtype: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. 
+ - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + pass + + @property + @abstractmethod + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. + + TBD: are there any other in-memory representations that are needed? + """ + pass + + @property + @abstractmethod + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. + """ + pass + + @property + @abstractmethod + def null_count(self) -> Optional[int]: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + pass + + @abstractmethod + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + pass + + +# def get_children(self) -> Iterable[Column]: +# """ +# Children columns underneath the column, each object in this iterator +# must adhere to the column specification. +# """ +# pass + + +class DataFrame(ABC): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. 
+ + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + version = 0 # version of the protocol + + @abstractmethod + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> "DataFrame": + """ + Construct a new exchange object, potentially changing the parameters. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + """ + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + pass + + @abstractmethod + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + pass + + @abstractmethod + def num_rows(self) -> Optional[int]: + # TODO: not happy with Optional, but need to flag it may be expensive + # why include it if it may be None - what do we expect consumers + # to do here? + """ + Return the number of rows in the DataFrame, if available. + """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + pass + + @abstractmethod + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + pass + + @abstractmethod + def get_column(self, i: int) -> Column: + """ + Return the column at the indicated position. + """ + pass + + @abstractmethod + def get_column_by_name(self, name: str) -> Column: + """ + Return the column whose name is the indicated name. + """ + pass + + @abstractmethod + def get_columns(self) -> Iterable[Column]: + """ + Return an iterator yielding the columns. + """ + pass + + @abstractmethod + def select_columns(self, indices: Sequence[int]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + pass + + @abstractmethod + def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + + Note that the producer must ensure that all columns are chunked the + same way. 
+ """ + pass diff --git a/altair/utils/core.py b/altair/utils/core.py index 4a375d254..1d4d6f17b 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -15,8 +15,10 @@ import jsonschema import pandas as pd import numpy as np +from pandas.core.interchange.dataframe_protocol import Column as PandasColumn from altair.utils.schemapi import SchemaBase +from altair.utils._dfi_types import Column, DtypeKind, DataFrame as DfiDataFrame if sys.version_info >= (3, 10): from typing import ParamSpec @@ -36,7 +38,7 @@ class _DataFrameLike(Protocol): - def __dataframe__(self, *args, **kwargs): + def __dataframe__(self, *args, **kwargs) -> DfiDataFrame: ... @@ -436,7 +438,7 @@ def sanitize_arrow_table(pa_table): def parse_shorthand( shorthand: Union[Dict[str, Any], str], - data: Optional[pd.DataFrame] = None, + data: Optional[Union[pd.DataFrame, _DataFrameLike]] = None, parse_aggregates: bool = True, parse_window_ops: bool = False, parse_timeunits: bool = True, @@ -516,6 +518,8 @@ def parse_shorthand( >>> parse_shorthand('count()', data) == {'aggregate': 'count', 'type': 'quantitative'} True """ + from altair.utils.data import pyarrow_available + if not shorthand: return {} @@ -574,14 +578,29 @@ def parse_shorthand( attrs["type"] = "temporal" # if data is specified and type is not, infer type from data - if isinstance(data, pd.DataFrame) and "type" not in attrs: - # Remove escape sequences so that types can be inferred for columns with special characters - if "field" in attrs and attrs["field"].replace("\\", "") in data.columns: - attrs["type"] = infer_vegalite_type(data[attrs["field"].replace("\\", "")]) - # ordered categorical dataframe columns return the type and sort order as a tuple - if isinstance(attrs["type"], tuple): - attrs["sort"] = attrs["type"][1] - attrs["type"] = attrs["type"][0] + if "type" not in attrs: + if pyarrow_available() and data is not None and hasattr(data, "__dataframe__"): + dfi = data.__dataframe__() + if "field" in attrs: + unescaped_field = attrs["field"].replace("\\", "") + if unescaped_field in dfi.column_names(): + column = dfi.get_column_by_name(unescaped_field) + attrs["type"] = infer_vegalite_type_for_dfi_column(column) + if isinstance(attrs["type"], tuple): + attrs["sort"] = attrs["type"][1] + attrs["type"] = attrs["type"][0] + elif isinstance(data, pd.DataFrame): + # Fallback if pyarrow is not installed or if pandas is older than 1.5 + # + # Remove escape sequences so that types can be inferred for columns with special characters + if "field" in attrs and attrs["field"].replace("\\", "") in data.columns: + attrs["type"] = infer_vegalite_type( + data[attrs["field"].replace("\\", "")] + ) + # ordered categorical dataframe columns return the type and sort order as a tuple + if isinstance(attrs["type"], tuple): + attrs["sort"] = attrs["type"][1] + attrs["type"] = attrs["type"][0] # If an unescaped colon is still present, it's often due to an incorrect data type specification # but could also be due to using a column name with ":" in it. 
@@ -602,6 +621,43 @@ def parse_shorthand( return attrs +def infer_vegalite_type_for_dfi_column( + column: Union[Column, PandasColumn], +) -> Union[_InferredVegaLiteType, Tuple[_InferredVegaLiteType, list]]: + from pyarrow.interchange.from_dataframe import column_to_array + + try: + kind = column.dtype[0] + except NotImplementedError as e: + # Edge case hack: + # dtype access fails for pandas column with datetime64[ns, UTC] type, + # but all we need to know is that its temporal, so check the + # error message for the presence of datetime64. + # + # See https://github.com/pandas-dev/pandas/issues/54239 + if "datetime64" in e.args[0]: + return "temporal" + raise e + + if ( + kind == DtypeKind.CATEGORICAL + and column.describe_categorical["is_ordered"] + and column.describe_categorical["categories"] is not None + ): + # Treat ordered categorical column as Vega-Lite ordinal + categories_column = column.describe_categorical["categories"] + categories_array = column_to_array(categories_column) + return "ordinal", categories_array.to_pylist() + if kind in (DtypeKind.STRING, DtypeKind.CATEGORICAL, DtypeKind.BOOL): + return "nominal" + elif kind in (DtypeKind.INT, DtypeKind.UINT, DtypeKind.FLOAT): + return "quantitative" + elif kind == DtypeKind.DATETIME: + return "temporal" + else: + raise ValueError(f"Unexpected DtypeKind: {kind}") + + def use_signature(Obj: Callable[_P, Any]): """Apply call signature and documentation of Obj to the decorated method""" diff --git a/altair/utils/data.py b/altair/utils/data.py index 6bce09e13..64bf782f4 100644 --- a/altair/utils/data.py +++ b/altair/utils/data.py @@ -368,3 +368,11 @@ def import_pyarrow_interchange() -> ModuleType: "The installed version of 'pyarrow' does not meet the minimum requirement of version 11.0.0. " "Please update 'pyarrow' to use the DataFrame Interchange Protocol." 
) from err + + +def pyarrow_available() -> bool: + try: + import_pyarrow_interchange() + return True + except ImportError: + return False diff --git a/doc/releases/changes.rst b/doc/releases/changes.rst index b98d01a11..cf0ec6f5b 100644 --- a/doc/releases/changes.rst +++ b/doc/releases/changes.rst @@ -8,6 +8,7 @@ Version 5.1.0 (unreleased month date, year) Enhancements ~~~~~~~~~~~~ + - Support field encoding inference for objects that support the DataFrame Interchange Protocol (#3114) Bug Fixes ~~~~~~~~~ diff --git a/tests/test_transformed_data.py b/tests/test_transformed_data.py index 709961a34..64675486e 100644 --- a/tests/test_transformed_data.py +++ b/tests/test_transformed_data.py @@ -5,7 +5,13 @@ import pkgutil import pytest +try: + import vegafusion as vf # type: ignore +except ImportError: + vf = None + +@pytest.mark.skipif(vf is None, reason="vegafusion not installed") # fmt: off @pytest.mark.parametrize("filename,rows,cols", [ ("annual_weather_heatmap.py", 366, ["monthdate_date_end", "max_temp_max"]), @@ -72,6 +78,7 @@ def test_primitive_chart_examples(filename, rows, cols, to_reconstruct): assert set(cols).issubset(set(df.columns)) +@pytest.mark.skipif(vf is None, reason="vegafusion not installed") # fmt: off @pytest.mark.parametrize("filename,all_rows,all_cols", [ ("errorbars_with_std.py", [10, 10], [["upper_yield"], ["extent_yield"]]), @@ -124,6 +131,7 @@ def test_compound_chart_examples(filename, all_rows, all_cols, to_reconstruct): assert set(cols).issubset(set(df.columns)) +@pytest.mark.skipif(vf is None, reason="vegafusion not installed") @pytest.mark.parametrize("to_reconstruct", [True, False]) def test_transformed_data_exclude(to_reconstruct): source = data.wheat() diff --git a/tests/utils/test_dataframe_interchange.py b/tests/utils/test_dataframe_interchange.py index 6fc3a393f..9803a0833 100644 --- a/tests/utils/test_dataframe_interchange.py +++ b/tests/utils/test_dataframe_interchange.py @@ -1,10 +1,14 @@ from datetime import datetime -import pyarrow as pa import pandas as pd import pytest import sys import os +try: + import pyarrow as pa +except ImportError: + pa = None + from altair.utils.data import to_values @@ -25,6 +29,7 @@ def windows_has_tzdata(): sys.platform == "win32" and not windows_has_tzdata(), reason="Timezone database is not installed on Windows", ) +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") def test_arrow_timestamp_conversion(): """Test that arrow timestamp values are converted to ISO-8601 strings""" data = { @@ -44,6 +49,7 @@ def test_arrow_timestamp_conversion(): assert values == expected_values +@pytest.mark.skipif(pa is None, reason="pyarrow not installed") def test_duration_raises(): td = pd.timedelta_range(0, periods=3, freq="h") df = pd.DataFrame(td).reset_index() diff --git a/tests/utils/test_mimebundle.py b/tests/utils/test_mimebundle.py index d26889cb0..541ac483f 100644 --- a/tests/utils/test_mimebundle.py +++ b/tests/utils/test_mimebundle.py @@ -14,6 +14,11 @@ except ImportError: vlc = None +try: + import vegafusion as vf # type: ignore +except ImportError: + vf = None + @pytest.fixture def vegalite_spec(): @@ -242,6 +247,7 @@ def check_pre_transformed_vega_spec(vega_spec): assert len(data_0.get("transform", [])) == 0 +@pytest.mark.skipif(vf is None, reason="vegafusion is not installed") def test_vegafusion_spec_to_vega_mime_bundle(vegalite_spec): with alt.data_transformers.enable("vegafusion"): bundle = spec_to_mimebundle( @@ -254,6 +260,7 @@ def test_vegafusion_spec_to_vega_mime_bundle(vegalite_spec): 
check_pre_transformed_vega_spec(vega_spec) +@pytest.mark.skipif(vf is None, reason="vegafusion is not installed") def test_vegafusion_chart_to_vega_mime_bundle(vegalite_spec): chart = alt.Chart.from_dict(vegalite_spec) with alt.data_transformers.enable("vegafusion"), alt.renderers.enable("json"):