diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py
index dc018e7816..5fd11225ba 100644
--- a/narwhals/_dask/group_by.py
+++ b/narwhals/_dask/group_by.py
@@ -26,10 +26,10 @@
 
 
 def n_unique() -> dd.Aggregation:
-    def chunk(s: pd.core.groupby.generic.SeriesGroupBy) -> int:
+    def chunk(s: pd.core.groupby.generic.SeriesGroupBy) -> pd.Series[Any]:
         return s.nunique(dropna=False)  # type: ignore[no-any-return]
 
-    def agg(s0: pd.core.groupby.generic.SeriesGroupBy) -> int:
+    def agg(s0: pd.core.groupby.generic.SeriesGroupBy) -> pd.Series[Any]:
         return s0.sum()  # type: ignore[no-any-return]
 
     return dd.Aggregation(
diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py
index cde2a7861f..f75b8bacb5 100644
--- a/narwhals/_dask/namespace.py
+++ b/narwhals/_dask/namespace.py
@@ -77,18 +77,12 @@ def nth(self: Self, *column_indices: int) -> DaskExpr:
 
     def lit(self: Self, value: Any, dtype: DType | None) -> DaskExpr:
         def func(df: DaskLazyFrame) -> list[dx.Series]:
-            return [
-                dd.from_pandas(
-                    pd.Series(
-                        [value],
-                        dtype=narwhals_to_native_dtype(dtype, self._version)
-                        if dtype is not None
-                        else None,
-                        name="literal",
-                    ),
-                    npartitions=df._native_frame.npartitions,
-                )
-            ]
+            if dtype is not None:  # keep name="literal" as before
+                native_dtype = narwhals_to_native_dtype(dtype, self._version)
+                s = pd.Series([value], dtype=native_dtype, name="literal")
+            else:
+                s = pd.Series([value], name="literal")
+            return [dd.from_pandas(s, npartitions=df._native_frame.npartitions)]
 
         return DaskExpr(
             func,
diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py
index 1eff70cefa..98f613e9fa 100644
--- a/narwhals/_pandas_like/utils.py
+++ b/narwhals/_pandas_like/utils.py
@@ -9,6 +9,7 @@
 from typing import Iterable
 from typing import Sequence
 from typing import TypeVar
+from typing import cast
 
 import pandas as pd
 
@@ -511,7 +512,11 @@ def get_dtype_backend(dtype: Any, implementation: Implementation) -> DTypeBacken
     if hasattr(pd, "ArrowDtype") and isinstance(dtype, pd.ArrowDtype):
         return "pyarrow"
     with suppress(AttributeError):
-        if isinstance(dtype, pd.core.dtypes.dtypes.BaseMaskedDtype):
+        sentinel = object()
+        if (
+            isinstance(dtype, pd.api.extensions.ExtensionDtype)
+            and getattr(dtype, "base", sentinel) is None
+        ):
             return "numpy_nullable"
     return None
 
@@ -722,8 +727,17 @@ def int_dtype_mapper(dtype: Any) -> str:
 def convert_str_slice_to_int_slice(
     str_slice: slice, columns: pd.Index
 ) -> tuple[int | None, int | None, int | None]:
-    start = columns.get_loc(str_slice.start) if str_slice.start is not None else None
-    stop = columns.get_loc(str_slice.stop) + 1 if str_slice.stop is not None else None
+    # We can safely cast to int because we know that `columns` doesn't contain duplicates.
+    start = (
+        cast(int, columns.get_loc(str_slice.start))
+        if str_slice.start is not None
+        else None
+    )
+    stop = (
+        cast(int, columns.get_loc(str_slice.stop)) + 1
+        if str_slice.stop is not None
+        else None
+    )
     step = str_slice.step
     return (start, stop, step)
 
diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py
index 27eab9d263..461932e5d1 100644
--- a/narwhals/_polars/series.py
+++ b/narwhals/_polars/series.py
@@ -483,7 +483,7 @@ def hist(
                 version=self._version,
             )
         elif (self._backend_version < (1, 15)) and self._native_series.count() < 1:
-            data_dict: dict[str, list[int | float] | pl.Series | pl.Expr]
+            data_dict: dict[str, Sequence[Any] | pl.Series]
             if bins is not None:
                 data_dict = {
                     "breakpoint": bins[1:],
diff --git a/narwhals/utils.py b/narwhals/utils.py
index 031c466f8a..0d0b50b76b 100644
--- a/narwhals/utils.py
+++ b/narwhals/utils.py
@@ -977,7 +977,7 @@ def is_ordered_categorical(series: Series[Any]) -> bool:
     if is_polars_series(native_series):
         return native_series.dtype.ordering == "physical"  # type: ignore[attr-defined, no-any-return]
     if is_pandas_series(native_series):
-        return native_series.cat.ordered  # type: ignore[no-any-return]
+        return bool(native_series.cat.ordered)
     if is_modin_series(native_series):  # pragma: no cover
         return native_series.cat.ordered  # type: ignore[no-any-return]
     if is_cudf_series(native_series):  # pragma: no cover
diff --git a/pyproject.toml b/pyproject.toml
index c339e04236..d1b78c72a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ tests = [
   "typing_extensions",
 ]
 typing = [
+  "pandas-stubs",
   "typing_extensions",
   "mypy~=1.15.0",
 ]
@@ -226,8 +227,6 @@ pretty = true
 
 [[tool.mypy.overrides]]
 module = [
-  # TODO: enable step by step when it makes sense
-  # e.g. the pandas API is just too inconsistent for type hinting to be useful.
   "cudf.*",
   "dask.*",
   "dask_expr.*",
@@ -235,7 +234,6 @@ module = [
   "ibis.*",
   "modin.*",
   "numpy.*",
-  "pandas.*",
   "pyspark.*",
   "sklearn.*",
   "sqlframe.*",
diff --git a/tests/conftest.py b/tests/conftest.py
index 02c0921f2a..c8c4895ccf 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -132,7 +132,7 @@ def dask_lazy_p2_constructor(obj: dict[str, list[Any]]) -> IntoFrame:  # pragma:
     return dd.from_dict(obj, npartitions=2)  # type: ignore[no-any-return]
 
 
-def pyarrow_table_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
+def pyarrow_table_constructor(obj: dict[str, Any]) -> IntoDataFrame:
     return pa.table(obj)  # type: ignore[no-any-return]
 
 
@@ -227,7 +227,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
             if x not in GPU_CONSTRUCTORS and x != "modin"  # too slow
         ]
     else:  # pragma: no cover
-        selected_constructors = metafunc.config.getoption("constructors").split(",")
+        selected_constructors = metafunc.config.getoption("constructors").split(",")  # pyright: ignore[reportAttributeAccessIssue]
 
     eager_constructors: list[Callable[[Any], IntoDataFrame]] = []
     eager_constructors_ids: list[str] = []
diff --git a/tests/dependencies/is_into_dataframe_test.py b/tests/dependencies/is_into_dataframe_test.py
index 80bd5edd8d..4efa7b2cc9 100644
--- a/tests/dependencies/is_into_dataframe_test.py
+++ b/tests/dependencies/is_into_dataframe_test.py
@@ -2,6 +2,7 @@
 
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import Mapping
 
 import numpy as np
 import pandas as pd
@@ -16,7 +17,7 @@
 
 
 class DictDataFrame:
-    def __init__(self: Self, data: dict[str, list[Any]]) -> None:
+    def __init__(self: Self, data: Mapping[str, Any]) -> None:
         self._data = data
 
     def __len__(self) -> int:  # pragma: no cover
@@ -27,7 +28,7 @@ def __narwhals_dataframe__(self) -> Self:  # pragma: no cover
 
 
 def test_is_into_dataframe() -> None:
-    data = {"a": [1, 2, 3], "b": [4, 5, 6]}
+    data: dict[str, Any] = {"a": [1, 2, 3], "b": [4, 5, 6]}
     assert is_into_dataframe(pa.table(data))
     assert is_into_dataframe(pl.DataFrame(data))
     assert is_into_dataframe(pd.DataFrame(data))
diff --git a/tests/dtypes_test.py b/tests/dtypes_test.py
index f6bd06de2e..ac69b0af77 100644
--- a/tests/dtypes_test.py
+++ b/tests/dtypes_test.py
@@ -15,8 +15,10 @@
 import narwhals.stable.v1 as nw
 from tests.utils import PANDAS_VERSION
 from tests.utils import POLARS_VERSION
+from tests.utils import PYARROW_VERSION
 
 if TYPE_CHECKING:
+    from narwhals.typing import IntoSeries
     from tests.utils import Constructor
 
 
@@ -135,6 +137,8 @@ def test_struct_hashes() -> None:
 def test_2d_array(constructor: Constructor, request: pytest.FixtureRequest) -> None:
     if any(x in str(constructor) for x in ("dask", "modin", "cudf", "pyspark")):
         request.applymarker(pytest.mark.xfail)
+    if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (14,):
+        request.applymarker(pytest.mark.xfail)
     data = {"a": [[[1, 2], [3, 4], [5, 6]]]}
     df = nw.from_native(constructor(data)).with_columns(
         a=nw.col("a").cast(nw.Array(nw.Int64(), (3, 2)))
@@ -144,13 +148,15 @@ def test_2d_array(constructor: Constructor, request: pytest.FixtureRequest) -> N
 
 
 def test_second_time_unit() -> None:
-    s = pd.Series(np.array([np.datetime64("2020-01-01", "s")]))
+    s: IntoSeries = pd.Series(np.array([np.datetime64("2020-01-01", "s")]))
     result = nw.from_native(s, series_only=True)
     if PANDAS_VERSION < (2,):  # pragma: no cover
         assert result.dtype == nw.Datetime("ns")
     else:
         assert result.dtype == nw.Datetime("s")
-    s = pa.chunked_array([pa.array([datetime(2020, 1, 1)], type=pa.timestamp("s"))])
+    ts_sec = pa.timestamp("s")
+    dur_sec = pa.duration("s")
+    s = pa.chunked_array([pa.array([datetime(2020, 1, 1)], type=ts_sec)], type=ts_sec)
     result = nw.from_native(s, series_only=True)
     assert result.dtype == nw.Datetime("s")
     s = pd.Series(np.array([np.timedelta64(1, "s")]))
@@ -159,7 +165,7 @@ def test_second_time_unit() -> None:
         assert result.dtype == nw.Duration("ns")
     else:
         assert result.dtype == nw.Duration("s")
-    s = pa.chunked_array([pa.array([timedelta(1)], type=pa.duration("s"))])
+    s = pa.chunked_array([pa.array([timedelta(1)], type=dur_sec)], type=dur_sec)
     result = nw.from_native(s, series_only=True)
     assert result.dtype == nw.Duration("s")
 
diff --git a/tests/expr_and_series/arithmetic_test.py b/tests/expr_and_series/arithmetic_test.py
index 4260e40880..fd535c7acc 100644
--- a/tests/expr_and_series/arithmetic_test.py
+++ b/tests/expr_and_series/arithmetic_test.py
@@ -166,9 +166,9 @@ def test_floordiv(left: int, right: int) -> None:
     # test is a bit manual unfortunately
     assume(right != 0)
     expected = {"a": [left // right]}
-    result = nw.from_native(pd.DataFrame({"a": [left]}), eager_only=True).select(
-        nw.col("a") // right
-    )
+    result: nw.DataFrame[Any] = nw.from_native(
+        pd.DataFrame({"a": [left]}), eager_only=True
+    ).select(nw.col("a") // right)
     assert_equal_data(result, expected)
     if PANDAS_VERSION < (2, 2):  # pragma: no cover
         # Bug in old version of pandas
@@ -201,9 +201,9 @@ def test_mod(left: int, right: int) -> None:
     # test is a bit manual unfortunately
     assume(right != 0)
     expected = {"a": [left % right]}
-    result = nw.from_native(pd.DataFrame({"a": [left]}), eager_only=True).select(
-        nw.col("a") % right
-    )
+    result: nw.DataFrame[Any] = nw.from_native(
+        pd.DataFrame({"a": [left]}), eager_only=True
+    ).select(nw.col("a") % right)
     assert_equal_data(result, expected)
     result = nw.from_native(
         pd.DataFrame({"a": [left]}).convert_dtypes(), eager_only=True
diff --git a/tests/expr_and_series/nth_test.py b/tests/expr_and_series/nth_test.py
index 442b478bae..0283f2381d 100644
--- a/tests/expr_and_series/nth_test.py
+++ b/tests/expr_and_series/nth_test.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+from typing import Any
+from typing import Mapping
+
 import polars as pl
 import pytest
 
@@ -8,7 +11,7 @@
 from tests.utils import Constructor
 from tests.utils import assert_equal_data
 
-data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8.0, 9.0]}
+data: Mapping[str, Any] = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8.0, 9.0]}
 
 
 @pytest.mark.parametrize(
diff --git a/tests/frame/getitem_test.py b/tests/frame/getitem_test.py
index afab298145..bf64979ae8 100644
--- a/tests/frame/getitem_test.py
+++ b/tests/frame/getitem_test.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
 from typing import Any
+from typing import cast
 
 import numpy as np
 import pandas as pd
@@ -12,7 +14,10 @@
 from tests.utils import ConstructorEager
 from tests.utils import assert_equal_data
 
-data = {
+if TYPE_CHECKING:
+    from narwhals.typing import _1DArray
+
+data: dict[str, Any] = {
     "a": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
     "b": [11, 12, 13, 14, 15, 16],
 }
@@ -74,7 +79,8 @@ def test_gather(constructor_eager: ConstructorEager) -> None:
         "b": [11, 14, 12],
     }
     assert_equal_data(result, expected)
-    result = df[np.array([0, 3, 1])]
+    arr = cast("_1DArray", np.array([0, 3, 1]))
+    result = df[arr]
     assert_equal_data(result, expected)
 
 
@@ -96,10 +102,10 @@ def test_gather_rows_cols(constructor_eager: ConstructorEager) -> None:
 
     expected = {"b": [11, 14, 12]}
 
-    result = {"b": df[[0, 3, 1], 1]}
+    result: Any = {"b": df[[0, 3, 1], 1]}
     assert_equal_data(result, expected)
-
-    result = {"b": df[np.array([0, 3, 1]), "b"]}
+    arr = cast("_1DArray", np.array([0, 3, 1]))
+    result = {"b": df[arr, "b"]}
     assert_equal_data(result, expected)
 
 
diff --git a/tests/frame/interchange_native_namespace_test.py b/tests/frame/interchange_native_namespace_test.py
index 67fba1c119..9d79194428 100644
--- a/tests/frame/interchange_native_namespace_test.py
+++ b/tests/frame/interchange_native_namespace_test.py
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
+from typing import Any
+from typing import Mapping
+
 import polars as pl
 import pytest
 
 import narwhals.stable.v1 as nw
 
-data = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]}
+data: Mapping[str, Any] = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]}
 
 
 def test_interchange() -> None:
diff --git a/tests/frame/interchange_select_test.py b/tests/frame/interchange_select_test.py
index 4e1400fd78..88234ff83f 100644
--- a/tests/frame/interchange_select_test.py
+++ b/tests/frame/interchange_select_test.py
@@ -2,6 +2,7 @@
 
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import Mapping
 
 import polars as pl
 import pytest
@@ -11,7 +12,7 @@
 if TYPE_CHECKING:
     from typing_extensions import Self
 
-data = {"a": [1, 2, 3], "b": [4.0, 5.0, 6.1], "z": ["x", "y", "z"]}
+data: Mapping[str, Any] = {"a": [1, 2, 3], "b": [4.0, 5.0, 6.1], "z": ["x", "y", "z"]}
 
 
 class InterchangeDataFrame:
diff --git a/tests/frame/interchange_to_arrow_test.py b/tests/frame/interchange_to_arrow_test.py
index 064d15834a..14172e5d57 100644
--- a/tests/frame/interchange_to_arrow_test.py
+++ b/tests/frame/interchange_to_arrow_test.py
@@ -1,12 +1,15 @@
 from __future__ import annotations
 
+from typing import Any
+from typing import Mapping
+
 import polars as pl
 import pyarrow as pa
 import pytest
 
 import narwhals.stable.v1 as nw
 
-data = {"a": [1, 2, 3], "b": [4.0, 5.0, 6.1], "z": ["x", "y", "z"]}
+data: Mapping[str, Any] = {"a": [1, 2, 3], "b": [4.0, 5.0, 6.1], "z": ["x", "y", "z"]}
 
 
 def test_interchange_to_arrow() -> None:
diff --git a/tests/frame/invalid_test.py b/tests/frame/invalid_test.py
index e5eee63b9a..9bdedbd4b9 100644
--- a/tests/frame/invalid_test.py
+++ b/tests/frame/invalid_test.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import pandas as pd
 import polars as pl
 import pyarrow as pa
@@ -8,10 +10,13 @@
 import narwhals.stable.v1 as nw
 from tests.utils import NUMPY_VERSION
 
+if TYPE_CHECKING:
+    from narwhals.typing import Frame
+
 
 def test_invalid() -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
-    df = nw.from_native(pa.table({"a": [1, 2], "b": [3, 4]}))
+    df: Frame = nw.from_native(pa.table({"a": [1, 2], "b": [3, 4]}))
     with pytest.raises(ValueError, match="Multi-output"):
         df.select(nw.all() + nw.all())
     df = nw.from_native(pd.DataFrame(data))
@@ -24,14 +29,14 @@ def test_invalid() -> None:
 
 
 def test_native_vs_non_native() -> None:
-    s = pd.Series([1, 2, 3])
-    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    s_pd = pd.Series([1, 2, 3])
+    df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
     with pytest.raises(TypeError, match="Perhaps you forgot"):
-        nw.from_native(df).filter(s > 1)
-    s = pl.Series([1, 2, 3])
-    df = pl.DataFrame({"a": [2, 2, 3], "b": [4, 5, 6]})
+        nw.from_native(df_pd).filter(s_pd > 1)  # type: ignore[arg-type]
+    s_pl = pl.Series([1, 2, 3])
+    df_pl = pl.DataFrame({"a": [2, 2, 3], "b": [4, 5, 6]})
     with pytest.raises(TypeError, match="Perhaps you\n- forgot"):
-        nw.from_native(df).filter(s > 1)
+        nw.from_native(df_pl).filter(s_pl > 1)
 
 
 def test_validate_laziness() -> None:
diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py
index 79c56886a1..fc2a422012 100644
--- a/tests/frame/schema_test.py
+++ b/tests/frame/schema_test.py
@@ -236,39 +236,39 @@ def test_validate_not_duplicated_columns_duckdb() -> None:
 )
 def test_nested_dtypes() -> None:
     duckdb = pytest.importorskip("duckdb")
-    df = pl.DataFrame(
+    df_pd = pl.DataFrame(
         {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]},
         schema_overrides={"b": pl.Array(pl.Int64, 2)},
     ).to_pandas(use_pyarrow_extension_array=True)
-    nwdf = nw.from_native(df)
+    nwdf: nw.DataFrame[Any] | nw.LazyFrame[Any] = nw.from_native(df_pd)
     assert nwdf.schema == {
         "a": nw.List(nw.Int64),
         "b": nw.Array(nw.Int64, 2),
         "c": nw.Struct({"a": nw.Int64}),
     }
-    df = pl.DataFrame(
+    df_pl = pl.DataFrame(
         {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]},
         schema_overrides={"b": pl.Array(pl.Int64, 2)},
     )
-    nwdf = nw.from_native(df)
+    nwdf = nw.from_native(df_pl)
     assert nwdf.schema == {
         "a": nw.List(nw.Int64),
         "b": nw.Array(nw.Int64, 2),
         "c": nw.Struct({"a": nw.Int64}),
     }
 
-    df = pl.DataFrame(
+    df_pa = pl.DataFrame(
         {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1, "b": "x", "c": 1.1}]},
         schema_overrides={"b": pl.Array(pl.Int64, 2)},
     ).to_arrow()
-    nwdf = nw.from_native(df)
+    nwdf = nw.from_native(df_pa)
     assert nwdf.schema == {
         "a": nw.List(nw.Int64),
         "b": nw.Array(nw.Int64, 2),
         "c": nw.Struct({"a": nw.Int64, "b": nw.String, "c": nw.Float64}),
     }
-    df = duckdb.sql("select * from df")
-    nwdf = nw.from_native(df)
+    rel = duckdb.sql("select * from df_pa")
+    nwdf = nw.from_native(rel)
     assert nwdf.schema == {
         "a": nw.List(nw.Int64),
         "b": nw.Array(nw.Int64, 2),
diff --git a/tests/frame/to_arrow_test.py b/tests/frame/to_arrow_test.py
index 651dfabad1..497b8c56b2 100644
--- a/tests/frame/to_arrow_test.py
+++ b/tests/frame/to_arrow_test.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
+from typing import Any
 
 import pyarrow as pa
 import pytest
@@ -21,7 +22,7 @@ def test_to_arrow(
         # pyarrow requires pandas>=1.0.0
         request.applymarker(pytest.mark.xfail)
 
-    data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8.0, 9.0]}
+    data: dict[str, Any] = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8.0, 9.0]}
     df_raw = constructor_eager(data)
     result = nw.from_native(df_raw, eager_only=True).to_arrow()
 
diff --git a/tests/frame/to_pandas_test.py b/tests/frame/to_pandas_test.py
index 07b35c5872..825602b2e8 100644
--- a/tests/frame/to_pandas_test.py
+++ b/tests/frame/to_pandas_test.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
+from typing import cast
 
 import pandas as pd
 import pytest
@@ -22,10 +23,10 @@ def test_convert_pandas(
 ) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
     df_raw = constructor_eager(data)
-    result = nw.from_native(df_raw).to_pandas()  # type: ignore[union-attr]
+    result = nw.from_native(df_raw, eager_only=True).to_pandas()
 
     if constructor_eager.__name__.startswith("pandas"):
-        expected = constructor_eager(data)
+        expected = cast(pd.DataFrame, constructor_eager(data))
     elif "modin_pyarrow" in str(constructor_eager):
         expected = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow")
     else:
diff --git a/tests/frame/to_polars_test.py b/tests/frame/to_polars_test.py
index cc2f860d99..e09c3625f7 100644
--- a/tests/frame/to_polars_test.py
+++ b/tests/frame/to_polars_test.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
+from typing import Any
+from typing import Mapping
 
 import polars as pl
 import pytest
@@ -14,7 +16,7 @@
 
 @pytest.mark.filterwarnings("ignore:.*Passing a BlockManager.*:DeprecationWarning")
 def test_convert_polars(constructor_eager: ConstructorEager) -> None:
-    data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8.0, 9.0]}
+    data: Mapping[str, Any] = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8.0, 9.0]}
     df_raw = constructor_eager(data)
     result = nw.from_native(df_raw).to_polars()  # type: ignore[union-attr]
 
diff --git a/tests/group_by_test.py b/tests/group_by_test.py
index 09be5cab13..3ec5ecda11 100644
--- a/tests/group_by_test.py
+++ b/tests/group_by_test.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 from contextlib import nullcontext
+from typing import Any
+from typing import Mapping
 
 import pandas as pd
 import polars as pl
@@ -14,7 +16,7 @@
 from tests.utils import ConstructorEager
 from tests.utils import assert_equal_data
 
-data = {"a": [1, 1, 3], "b": [4, 4, 6], "c": [7.0, 8.0, 9.0]}
+data: Mapping[str, Any] = {"a": [1, 1, 3], "b": [4, 4, 6], "c": [7.0, 8.0, 9.0]}
 
 df_pandas = pd.DataFrame(data)
 df_lazy = pl.LazyFrame(data)
@@ -25,14 +27,14 @@ def test_group_by_complex() -> None:
 
     df = nw.from_native(df_pandas)
     with pytest.warns(UserWarning, match="complex group-by"):
-        result = nw.to_native(
+        result_pd = nw.to_native(
             df.group_by("a").agg((nw.col("b") - nw.col("c").mean()).mean()).sort("a")
         )
-    assert_equal_data(result, expected)
+    assert_equal_data(result_pd, expected)
 
     lf = nw.from_native(df_lazy).lazy()
-    result = lf.group_by("a").agg((nw.col("b") - nw.col("c").mean()).mean()).sort("a")
-    assert_equal_data(result, expected)
+    result_pl = lf.group_by("a").agg((nw.col("b") - nw.col("c").mean()).mean()).sort("a")
+    assert_equal_data(result_pl, expected)
 
 
 def test_invalid_group_by_dask() -> None:
diff --git a/tests/hypothesis/basic_arithmetic_test.py b/tests/hypothesis/basic_arithmetic_test.py
index 5e9b18703f..6e16b71296 100644
--- a/tests/hypothesis/basic_arithmetic_test.py
+++ b/tests/hypothesis/basic_arithmetic_test.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+from typing import Any
+from typing import Mapping
+
 import pandas as pd
 import polars as pl
 import pytest
@@ -26,8 +29,9 @@
 def test_mean(
     integer: st.SearchStrategy[list[int]], floats: st.SearchStrategy[float]
 ) -> None:
-    df_pandas = pd.DataFrame({"integer": integer, "floats": floats})
-    df_polars = pl.DataFrame({"integer": integer, "floats": floats})
+    data: Mapping[str, Any] = {"integer": integer, "floats": floats}
+    df_pandas = pd.DataFrame(data)
+    df_polars = pl.DataFrame(data)
     df_nw1 = nw.from_native(df_pandas, eager_only=True)
     df_nw2 = nw.from_native(df_polars, eager_only=True)
 
diff --git a/tests/hypothesis/getitem_test.py b/tests/hypothesis/getitem_test.py
index f6cfd45897..05abdc9db2 100644
--- a/tests/hypothesis/getitem_test.py
+++ b/tests/hypothesis/getitem_test.py
@@ -3,6 +3,7 @@
 from typing import TYPE_CHECKING
 from typing import Any
 from typing import Callable
+from typing import cast
 
 import hypothesis.strategies as st
 import numpy as np
@@ -233,7 +234,7 @@ def test_getitem(
         return
 
     df_other = nw.from_native(pandas_or_pyarrow_constructor(TEST_DATA))
-    result_other = df_other[selector]
+    result_other = df_other[cast("Any", selector)]
 
     if isinstance(result_polars, nw.Series):
         assert_equal_data({"a": result_other}, {"a": result_polars.to_list()})
diff --git a/tests/hypothesis/join_test.py b/tests/hypothesis/join_test.py
index 97830ab0ac..879d54ca05 100644
--- a/tests/hypothesis/join_test.py
+++ b/tests/hypothesis/join_test.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from typing import Any
+from typing import Mapping
 from typing import cast
 
 import pandas as pd
@@ -49,7 +51,7 @@ def test_join(  # pragma: no cover
     floats: st.SearchStrategy[list[float]],
     cols: st.SearchStrategy[list[str]],
 ) -> None:
-    data = {"a": integers, "b": other_integers, "c": floats}
+    data: Mapping[str, Any] = {"a": integers, "b": other_integers, "c": floats}
     join_cols = cast(list[str], cols)
 
     df_polars = pl.DataFrame(data)
@@ -96,7 +98,7 @@ def test_cross_join(  # pragma: no cover
     integers: st.SearchStrategy[list[int]],
     other_integers: st.SearchStrategy[list[int]],
 ) -> None:
-    data = {"a": integers, "b": other_integers}
+    data: Mapping[str, Any] = {"a": integers, "b": other_integers}
 
     df_polars = pl.DataFrame(data)
     df_polars2 = pl.DataFrame(data)
@@ -149,8 +151,12 @@ def test_left_join(  # pragma: no cover
     right_key: list[str],
 ) -> None:
     assume(len(left_key) == len(right_key))
-    data_left = {"a": a_left_data, "b": b_left_data, "c": c_left_data}
-    data_right = {"a": a_right_data, "b": b_right_data, "d": d_right_data}
+    data_left: dict[str, Any] = {"a": a_left_data, "b": b_left_data, "c": c_left_data}
+    data_right: dict[str, Any] = {
+        "a": a_right_data,
+        "b": b_right_data,
+        "d": d_right_data,
+    }
     result_pd = nw.from_native(pd.DataFrame(data_left), eager_only=True).join(
         nw.from_native(pd.DataFrame(data_right), eager_only=True),
         how="left",
diff --git a/tests/pickle_test.py b/tests/pickle_test.py
index 78c0debf95..677f6084d1 100644
--- a/tests/pickle_test.py
+++ b/tests/pickle_test.py
@@ -17,7 +17,7 @@ class Foo:
         a: Sequence[int]
 
     # dry-run to check that none of these error
-    asdict(Foo(pd.Series([1, 2, 3])))
+    asdict(Foo(pd.Series([1, 2, 3])))  # type: ignore[arg-type]
     asdict(Foo(pl.Series([1, 2, 3])))  # type: ignore[arg-type]
     asdict(Foo(nw.from_native(pl.Series([1, 2, 3]), series_only=True)))  # type: ignore[arg-type]
     asdict(Foo(nw.from_native(pd.Series([1, 2, 3]), series_only=True)))  # type: ignore[arg-type]
diff --git a/tests/read_scan_test.py b/tests/read_scan_test.py
index 55869b46b1..46c6d7324c 100644
--- a/tests/read_scan_test.py
+++ b/tests/read_scan_test.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+from typing import Any
+from typing import Mapping
+
 import pandas as pd
 import polars as pl
 import pytest
@@ -11,7 +14,7 @@
 from tests.utils import ConstructorEager
 from tests.utils import assert_equal_data
 
-data = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]}
+data: Mapping[str, Any] = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]}
 
 
 def test_read_csv(
diff --git a/tests/series_only/cast_test.py b/tests/series_only/cast_test.py
index b4051e503c..1bedf4874f 100644
--- a/tests/series_only/cast_test.py
+++ b/tests/series_only/cast_test.py
@@ -108,13 +108,13 @@ def test_cast_to_enum() -> None:
     # we don't yet support metadata in dtypes, so for now disallow this
     # seems like a very niche use case anyway, and allowing it later wouldn't be
     # backwards-incompatible
-    df = pl.DataFrame({"a": ["a", "b"]}, schema={"a": pl.Categorical})
+    df_pl = pl.DataFrame({"a": ["a", "b"]}, schema={"a": pl.Categorical})
     with pytest.raises(
         NotImplementedError, match=r"Converting to Enum is not \(yet\) supported"
     ):
-        nw.from_native(df).select(nw.col("a").cast(nw.Enum))
-    df = pd.DataFrame({"a": ["a", "b"]}, dtype="category")
+        nw.from_native(df_pl).select(nw.col("a").cast(nw.Enum))
+    df_pd = pd.DataFrame({"a": ["a", "b"]}, dtype="category")
     with pytest.raises(
         NotImplementedError, match=r"Converting to Enum is not \(yet\) supported"
     ):
-        nw.from_native(df).select(nw.col("a").cast(nw.Enum))
+        nw.from_native(df_pd).select(nw.col("a").cast(nw.Enum))
diff --git a/tests/tpch_q1_test.py b/tests/tpch_q1_test.py
index 498ed2349a..f736921c5a 100644
--- a/tests/tpch_q1_test.py
+++ b/tests/tpch_q1_test.py
@@ -2,6 +2,7 @@
 
 import os
 from datetime import datetime
+from typing import TYPE_CHECKING
 from unittest import mock
 
 import pandas as pd
@@ -14,6 +15,9 @@
 from tests.utils import PANDAS_VERSION
 from tests.utils import assert_equal_data
 
+if TYPE_CHECKING:
+    from narwhals.stable.v1.typing import IntoFrame
+
 
 @pytest.mark.parametrize(
     "library",
@@ -26,7 +30,7 @@ def test_q1(library: str, request: pytest.FixtureRequest) -> None:
     if library == "pandas" and PANDAS_VERSION < (1, 5):
         request.applymarker(pytest.mark.xfail)
     elif library == "pandas":
-        df_raw = pd.read_csv("tests/data/lineitem.csv")
+        df_raw: IntoFrame = pd.read_csv("tests/data/lineitem.csv")
     elif library == "polars":
         df_raw = pl.scan_csv("tests/data/lineitem.csv")
     elif library == "dask":
@@ -105,7 +109,7 @@ def test_q1_w_generic_funcs(library: str, request: pytest.FixtureRequest) -> Non
     if library == "pandas" and PANDAS_VERSION < (1, 5):
         request.applymarker(pytest.mark.xfail)
     elif library == "pandas":
-        df_raw = pd.read_csv("tests/data/lineitem.csv")
+        df_raw: IntoFrame = pd.read_csv("tests/data/lineitem.csv")
     else:
         df_raw = pl.read_csv("tests/data/lineitem.csv")
     var_1 = datetime(1998, 9, 2)
diff --git a/tests/translate/from_native_test.py b/tests/translate/from_native_test.py
index b3269b8383..6bfb2314a6 100644
--- a/tests/translate/from_native_test.py
+++ b/tests/translate/from_native_test.py
@@ -19,7 +19,7 @@
 
     from narwhals.utils import Version
 
-data = {"a": [1, 2, 3]}
+data: dict[str, Any] = {"a": [1, 2, 3]}
 
 df_pd = pd.DataFrame(data)
 df_pl = pl.DataFrame(data)
diff --git a/tests/translate/get_native_namespace_test.py b/tests/translate/get_native_namespace_test.py
index b269caef75..be10e8257f 100644
--- a/tests/translate/get_native_namespace_test.py
+++ b/tests/translate/get_native_namespace_test.py
@@ -19,11 +19,11 @@ def test_native_namespace() -> None:
     assert nw.get_native_namespace(df.to_native()) is pl
     assert nw.get_native_namespace(df.lazy().to_native()) is pl
     assert nw.get_native_namespace(df["a"].to_native()) is pl
-    df = nw.from_native(pd.DataFrame({"a": [1, 2, 3]}))
+    df = nw.from_native(pd.DataFrame({"a": [1, 2, 3]}), eager_only=True)
     assert nw.get_native_namespace(df) is pd
     assert nw.get_native_namespace(df.to_native()) is pd
     assert nw.get_native_namespace(df["a"].to_native()) is pd
-    df = nw.from_native(pa.table({"a": [1, 2, 3]}))
+    df = nw.from_native(pa.table({"a": [1, 2, 3]}), eager_only=True)
     assert nw.get_native_namespace(df) is pa
     assert nw.get_native_namespace(df.to_native()) is pa
     assert nw.get_native_namespace(df["a"].to_native()) is pa
diff --git a/tests/utils.py b/tests/utils.py
index 947db4edbb..295729c310 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -7,6 +7,7 @@
 from typing import Any
 from typing import Callable
 from typing import Iterator
+from typing import Mapping
 from typing import Sequence
 
 import pandas as pd
@@ -70,7 +71,7 @@ def _to_comparable_list(column_values: Any) -> Any:
 
 
 def _sort_dict_by_key(
-    data_dict: dict[str, list[Any]], key: str
+    data_dict: Mapping[str, list[Any]], key: str
 ) -> dict[str, list[Any]]:  # pragma: no cover
     sort_list = data_dict[key]
     sorted_indices = sorted(
@@ -84,7 +85,7 @@ def _sort_dict_by_key(
     return {key: [value[i] for i in sorted_indices] for key, value in data_dict.items()}
 
 
-def assert_equal_data(result: Any, expected: dict[str, Any]) -> None:
+def assert_equal_data(result: Any, expected: Mapping[str, Any]) -> None:
     is_pyspark = (
         hasattr(result, "_compliant_frame")
         and result.implementation is Implementation.PYSPARK
diff --git a/tests/utils_test.py b/tests/utils_test.py
index aec7c652a4..dd15d92063 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -4,6 +4,8 @@
 import string
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
+from typing import Any
+from typing import cast
 
 import hypothesis.strategies as st
 import pandas as pd
@@ -23,7 +25,7 @@
 
 if TYPE_CHECKING:
     from narwhals.series import Series
-    from narwhals.typing import IntoSeriesT
+    from narwhals.typing import IntoSeries
     from narwhals.utils import _SupportsVersion
 
 
@@ -123,17 +125,17 @@ def test_maybe_set_index_polars_column_names(
     ],
 )
 def test_maybe_set_index_pandas_direct_index(
-    narwhals_index: Series[IntoSeriesT] | list[Series[IntoSeriesT]] | None,
-    pandas_index: pd.Series | list[pd.Series] | None,
+    narwhals_index: Series[IntoSeries] | list[Series[IntoSeries]],
+    pandas_index: pd.Series | list[pd.Series],
     native_df_or_series: pd.DataFrame | pd.Series,
 ) -> None:
     df = nw.from_native(native_df_or_series, allow_series=True)
     result = nw.maybe_set_index(df, index=narwhals_index)
     if isinstance(native_df_or_series, pd.Series):
-        native_df_or_series.index = pandas_index
+        native_df_or_series.index = pandas_index  # type: ignore[assignment]
         assert_series_equal(nw.to_native(result), native_df_or_series)
     else:
-        expected = native_df_or_series.set_index(pandas_index)
+        expected = native_df_or_series.set_index(pandas_index)  # type: ignore[type-var]
         assert_frame_equal(nw.to_native(result), expected)
 
 
@@ -148,7 +150,7 @@ def test_maybe_set_index_pandas_direct_index(
     ],
 )
 def test_maybe_set_index_polars_direct_index(
-    index: Series[IntoSeriesT] | list[Series[IntoSeriesT]] | None,
+    index: Series[IntoSeries] | list[Series[IntoSeries]] | None,
 ) -> None:
     df = nw.from_native(pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}))
     result = nw.maybe_set_index(df, index=index)
@@ -179,10 +181,13 @@ def test_maybe_set_index_pandas_either_index_or_column_names() -> None:
 
 def test_maybe_get_index_pandas() -> None:
     pandas_df = pd.DataFrame({"a": [1, 2, 3]}, index=[1, 2, 0])
-    result = nw.maybe_get_index(nw.from_native(pandas_df))
+    result = cast("pd.Index[Any]", nw.maybe_get_index(nw.from_native(pandas_df)))
     assert_index_equal(result, pandas_df.index)
     pandas_series = pd.Series([1, 2, 3], index=[1, 2, 0])
-    result_s = nw.maybe_get_index(nw.from_native(pandas_series, series_only=True))
+    result_s = cast(
+        "pd.Index[Any]",
+        nw.maybe_get_index(nw.from_native(pandas_series, series_only=True)),
+    )
     assert_index_equal(result_s, pandas_series.index)