Skip to content

Commit 0f9ff18

Browse files
meeseeksmachineilan-goldflying-sheep
authored
Backport PR #3048: (feat): raising errors where backed is not supported (#3072)
Co-authored-by: Ilan Gold <[email protected]> Co-authored-by: Philipp A <[email protected]>
1 parent 30aa230 commit 0f9ff18

File tree

14 files changed

+229
-10
lines changed

14 files changed

+229
-10
lines changed

docs/release-notes/1.10.2.md

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
```
1818

1919
* Compatibility with `matplotlib` 3.9 {pr}`2999` {smaller}`I Virshup`
20+
* Add clear errors where `backed`-mode-like matrices (i.e., from `sparse_dataset`) are not supported {pr}`3048` {smaller}`I Gold`
2021
* Fix deprecated use of `.A` with sparse matrices {pr}`3084` {smaller}`P Angerer`
2122
* Fix zappy support {pr}`3089` {smaller}`P Angerer`
2223

scanpy/_utils/__init__.py

+19
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from typing import TYPE_CHECKING, overload
2323
from weakref import WeakSet
2424

25+
import h5py
2526
import numpy as np
2627
from anndata import __version__ as anndata_version
2728
from packaging.version import Version
@@ -33,6 +34,13 @@
3334
from .._settings import settings
3435
from .compute.is_constant import is_constant # noqa: F401
3536

37+
if Version(anndata_version) >= Version("0.10.0"):
38+
from anndata._core.sparse_dataset import (
39+
BaseCompressedSparseDataset as SparseDataset,
40+
)
41+
else:
42+
from anndata._core.sparse_dataset import SparseDataset
43+
3644
if TYPE_CHECKING:
3745
from collections.abc import Mapping
3846
from pathlib import Path
@@ -1085,3 +1093,14 @@ def _resolve_axis(
10851093
if axis in {1, "var"}:
10861094
return (1, "var")
10871095
raise ValueError(f"`axis` must be either 0, 1, 'obs', or 'var', was {axis!r}")
1096+
1097+
1098+
def is_backed_type(X: object) -> bool:
    """Tell whether ``X`` is an instance of one of the backed container types.

    The types checked are ``SparseDataset`` (as imported at the top of this
    module), ``h5py.File`` and ``h5py.Dataset``.

    Parameters
    ----------
    X
        Any object, typically the ``.X`` matrix or a layer of an AnnData.

    Returns
    -------
    ``True`` if ``X`` is an instance of any of the types above, else ``False``.
    """
    backed_types = (SparseDataset, h5py.File, h5py.Dataset)
    return isinstance(X, backed_types)
1100+
1101+
1102+
def raise_not_implemented_error_if_backed_type(X: object, method_name: str) -> None:
    """Reject backed matrices on behalf of a method that cannot handle them.

    Parameters
    ----------
    X
        The object to check with :func:`is_backed_type`.
    method_name
        Name of the calling method; it is placed at the start of the error
        message.

    Raises
    ------
    NotImplementedError
        If :func:`is_backed_type` returns ``True`` for ``X``.
    """
    # Guard clause: anything that is not a backed type passes through silently.
    if not is_backed_type(X):
        return
    msg = f"{method_name} is not implemented for matrices of type {type(X)}"
    raise NotImplementedError(msg)

scanpy/preprocessing/_pca.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from .. import logging as logg
1717
from .._compat import DaskArray, pkg_version
1818
from .._settings import settings
19-
from .._utils import _doc_params, _empty
19+
from .._utils import _doc_params, _empty, is_backed_type
2020
from ..get import _check_mask, _get_obs_rep
2121
from ._docs import doc_mask_var_hvg
2222
from ._utils import _get_mean_var
@@ -173,6 +173,10 @@ def pca(
173173
)
174174
data_is_AnnData = isinstance(data, AnnData)
175175
if data_is_AnnData:
176+
if layer is None and not chunked and is_backed_type(data.X):
177+
raise NotImplementedError(
178+
f"PCA is not implemented for matrices of type {type(data.X)} with chunked as False"
179+
)
176180
adata = data.copy() if copy else data
177181
else:
178182
if pkg_version("anndata") < Version("0.8.0rc1"):
@@ -195,7 +199,10 @@ def pca(
195199
logg.info(f" with n_comps={n_comps}")
196200

197201
X = _get_obs_rep(adata_comp, layer=layer)
198-
202+
if is_backed_type(X) and layer is not None:
203+
raise NotImplementedError(
204+
f"PCA is not implemented for matrices of type {type(X)} from layers"
205+
)
199206
# See: https://github.com/scverse/scanpy/pull/2816#issuecomment-1932650529
200207
if (
201208
Version(ad.__version__) < Version("0.9")

scanpy/preprocessing/_scale.py

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .._utils import (
1616
_check_array_function_arguments,
1717
axis_mul_or_truediv,
18+
raise_not_implemented_error_if_backed_type,
1819
renamed_arg,
1920
view_to_actual,
2021
)
@@ -298,6 +299,7 @@ def scale_anndata(
298299
mask_obs = _check_mask(adata, mask_obs, "obs")
299300
view_to_actual(adata)
300301
X = _get_obs_rep(adata, layer=layer, obsm=obsm)
302+
raise_not_implemented_error_if_backed_type(X, "scale")
301303
X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale(
302304
X,
303305
zero_center=zero_center,

scanpy/preprocessing/_simple.py

+15
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from .._utils import (
2424
_check_array_function_arguments,
2525
axis_sum,
26+
is_backed_type,
27+
raise_not_implemented_error_if_backed_type,
2628
renamed_arg,
2729
sanitize_anndata,
2830
view_to_actual,
@@ -145,6 +147,7 @@ def filter_cells(
145147
"`min_genes`, `max_counts`, `max_genes` per call."
146148
)
147149
if isinstance(data, AnnData):
150+
raise_not_implemented_error_if_backed_type(data.X, "filter_cells")
148151
adata = data.copy() if copy else data
149152
cell_subset, number = materialize_as_ndarray(
150153
filter_cells(
@@ -260,6 +263,7 @@ def filter_genes(
260263
)
261264

262265
if isinstance(data, AnnData):
266+
raise_not_implemented_error_if_backed_type(data.X, "filter_genes")
263267
adata = data.copy() if copy else data
264268
gene_subset, number = materialize_as_ndarray(
265269
filter_genes(
@@ -405,10 +409,19 @@ def log1p_anndata(
405409
raise NotImplementedError(
406410
"Currently cannot perform chunked operations on arrays not stored in X."
407411
)
412+
if adata.isbacked and adata.file._filemode != "r+":
413+
raise NotImplementedError(
414+
"log1p is not implemented for backed AnnData with backed mode not r+"
415+
)
408416
for chunk, start, end in adata.chunked_X(chunk_size):
409417
adata.X[start:end] = log1p(chunk, base=base, copy=False)
410418
else:
411419
X = _get_obs_rep(adata, layer=layer, obsm=obsm)
420+
if is_backed_type(X):
421+
msg = f"log1p is not implemented for matrices of type {type(X)}"
422+
if layer is not None:
423+
raise NotImplementedError(f"{msg} from layers")
424+
raise NotImplementedError(f"{msg} without `chunked=True`")
412425
X = log1p(X, copy=False, base=base)
413426
_set_obs_rep(adata, X, layer=layer, obsm=obsm)
414427

@@ -647,6 +660,7 @@ def regress_out(
647660
keys = [keys]
648661

649662
X = _get_obs_rep(adata, layer=layer)
663+
raise_not_implemented_error_if_backed_type(X, "regress_out")
650664

651665
if issparse(X):
652666
logg.info(" sparse input is densified and may " "lead to high memory use")
@@ -855,6 +869,7 @@ def downsample_counts(
855869
`adata.X` : :class:`numpy.ndarray` | :class:`scipy.sparse.spmatrix` (dtype `float`)
856870
Downsampled counts matrix.
857871
"""
872+
raise_not_implemented_error_if_backed_type(adata.X, "downsample_counts")
858873
# This logic is all dispatch
859874
total_counts_call = total_counts is not None
860875
counts_per_cell_call = counts_per_cell is not None

scanpy/tests/test_backed.py

+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from __future__ import annotations
2+
3+
from functools import partial
4+
5+
import pytest
6+
from anndata import read_h5ad
7+
8+
import scanpy as sc
9+
10+
11+
@pytest.mark.parametrize(
    ("name", "func", "msg"),
    [
        pytest.param("PCA", sc.pp.pca, " with chunked as False", id="pca"),
        pytest.param(
            "PCA", partial(sc.pp.pca, layer="X_copy"), " from layers", id="pca_layer"
        ),
        pytest.param(
            "regress_out",
            partial(sc.pp.regress_out, keys=["n_counts", "percent_mito"]),
            "",
            id="regress_out",
        ),
        pytest.param(
            "dendrogram", partial(sc.tl.dendrogram, groupby="cat"), "", id="dendrogram"
        ),
        pytest.param("tsne", sc.tl.tsne, "", id="tsne"),
        pytest.param("scale", sc.pp.scale, "", id="scale"),
        pytest.param(
            "downsample_counts",
            partial(sc.pp.downsample_counts, counts_per_cell=1000),
            "",
            id="downsample_counts",
        ),
        pytest.param(
            "filter_genes",
            partial(sc.pp.filter_genes, max_cells=1000),
            "",
            id="filter_genes",
        ),
        pytest.param(
            "filter_cells",
            partial(sc.pp.filter_cells, max_genes=1000),
            "",
            id="filter_cells",
        ),
        pytest.param(
            "rank_genes_groups",
            partial(sc.tl.rank_genes_groups, groupby="cat"),
            "",
            id="rank_genes_groups",
        ),
        pytest.param(
            "score_genes",
            # A concrete list instead of a one-shot `map` iterator: the
            # parameter object is created once at collection time and would
            # be empty if it were ever consumed a second time.
            partial(sc.tl.score_genes, gene_list=[str(i) for i in range(100)]),
            "",
            id="score_genes",
        ),
    ],
)
def test_backed_error(backed_adata, name, func, msg):
    """Check that each function raises ``NotImplementedError`` for backed data.

    Parameters
    ----------
    backed_adata
        Fixture providing the AnnData object passed to ``func``.
    name
        Method name expected at the start of the error message.
    func
        Callable under test; invoked with ``backed_adata`` as sole argument.
    msg
        Suffix expected after the matrix type in the error message.
    """
    # Function-scope import so this block does not depend on the module header.
    import re

    expected = (
        f"{name} is not implemented for matrices of type "
        f"{type(backed_adata.X)}{msg}"
    )
    # `match` is applied with `re.search`; the type repr contains `.`, so the
    # message is escaped to require a literal match instead of a regex one.
    with pytest.raises(NotImplementedError, match=re.escape(expected)):
        func(backed_adata)
67+
68+
69+
def test_log1p_backed_errors(backed_adata):
    """Check the three ``NotImplementedError`` paths of ``sc.pp.log1p``.

    1. ``chunked=True`` on the fixture as provided (expected to fail with the
       "backed mode not r+" message, so the fixture is presumably not opened
       in ``r+`` mode).
    2. After re-opening the same file with ``backed="r+"``: a call without
       ``chunked=True``.
    3. Same object: a call on a layer that holds the backed matrix.
    """
    # Function-scope import so this block does not depend on the module header.
    import re

    # `match` is a regex: unescaped, the trailing "r+" would mean "one or more
    # r" and the literal "+" in the message would never be checked.
    with pytest.raises(
        NotImplementedError,
        match=re.escape(
            "log1p is not implemented for backed AnnData with backed mode not r+"
        ),
    ):
        sc.pp.log1p(backed_adata, chunked=True)
    backed_adata.file.close()

    backed_adata = read_h5ad(backed_adata.filename, backed="r+")
    try:
        with pytest.raises(
            NotImplementedError,
            match=re.escape(
                f"log1p is not implemented for matrices of type {type(backed_adata.X)} without `chunked=True`"
            ),
        ):
            sc.pp.log1p(backed_adata)
        backed_adata.layers["X_copy"] = backed_adata.X
        layer_type = type(backed_adata.layers["X_copy"])
        with pytest.raises(
            NotImplementedError,
            match=re.escape(
                f"log1p is not implemented for matrices of type {layer_type} from layers"
            ),
        ):
            sc.pp.log1p(backed_adata, layer="X_copy")
    finally:
        # Close the re-opened handle even when one of the checks above fails.
        backed_adata.file.close()
90+
91+
92+
def test_scatter_backed(backed_adata):
    """Smoke test: chunked PCA followed by a scatter plot on the PCA basis.

    Passes as long as neither call raises on the ``backed_adata`` fixture.
    """
    sc.pp.pca(backed_adata, chunked=True)
    scatter_kwargs = dict(color="0", basis="pca")
    sc.pl.scatter(backed_adata, **scatter_kwargs)
95+
96+
97+
def test_dotplot_backed(backed_adata):
    """Smoke test: a dot plot grouped by ``"cat"`` on the ``backed_adata`` fixture.

    Passes as long as the plotting call does not raise.
    """
    var_names = ["0", "1", "2", "3"]
    sc.pl.dotplot(backed_adata, var_names, groupby="cat")

scanpy/tests/test_ingest.py

+17
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import anndata
34
import numpy as np
45
import pytest
56
from sklearn.neighbors import KDTree
@@ -153,3 +154,19 @@ def test_ingest_map_embedding_umap():
153154
umap_transformed_t = reducer.transform(T)
154155

155156
assert np.allclose(ing._obsm["X_umap"], umap_transformed_t)
157+
158+
159+
def test_ingest_backed(adatas, tmp_path):
    """Check that ``Ingest.fit`` raises ``NotImplementedError`` for backed input.

    The second AnnData from the ``adatas`` fixture is written to ``tmp_path``
    and read back with ``backed="r"`` before being passed to ``fit``.
    """
    # Function-scope import so this block does not depend on the module header.
    import re

    adata_ref = adatas[0].copy()
    adata_new = adatas[1].copy()

    adata_new.write_h5ad(f"{tmp_path}/new.h5ad")

    adata_new = anndata.read_h5ad(f"{tmp_path}/new.h5ad", backed="r")

    ing = sc.tl.Ingest(adata_ref)
    expected = (
        f"Ingest.fit is not implemented for matrices of type {type(adata_new.X)}"
    )
    # `match` is a regex; "Ingest.fit" and the type repr both contain `.`, so
    # escape the message to require a literal match.
    with pytest.raises(NotImplementedError, match=re.escape(expected)):
        ing.fit(adata_new)

scanpy/tools/_dendrogram.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from .. import logging as logg
1313
from .._compat import old_positionals
14-
from .._utils import _doc_params
14+
from .._utils import _doc_params, raise_not_implemented_error_if_backed_type
1515
from ..neighbors._doc import doc_n_pcs, doc_use_rep
1616
from ._utils import _choose_representation
1717

@@ -117,6 +117,8 @@ def dendrogram(
117117
>>> markers = ['C1QA', 'PSAP', 'CD79A', 'CD79B', 'CST3', 'LYZ']
118118
>>> sc.pl.dotplot(adata, markers, groupby='bulk_labels', dendrogram=True)
119119
"""
120+
121+
raise_not_implemented_error_if_backed_type(adata.X, "dendrogram")
120122
if isinstance(groupby, str):
121123
# if not a list, turn into a list
122124
groupby = [groupby]

scanpy/tools/_ingest.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from .. import logging as logg
1313
from .._compat import old_positionals, pkg_version
1414
from .._settings import settings
15-
from .._utils import NeighborsView
15+
from .._utils import NeighborsView, raise_not_implemented_error_if_backed_type
1616
from .._utils._doctests import doctest_skip
1717
from ..neighbors import FlatTree
1818

@@ -392,6 +392,7 @@ def fit(self, adata_new):
392392
`adata` refers to the :class:`~anndata.AnnData` object
393393
that is passed during the initialization of an Ingest instance.
394394
"""
395+
raise_not_implemented_error_if_backed_type(adata_new.X, "Ingest.fit")
395396
ref_var_names = self._adata_ref.var_names.str.upper()
396397
new_var_names = adata_new.var_names.str.upper()
397398

scanpy/tools/_rank_genes_groups.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@
1212
from .. import _utils
1313
from .. import logging as logg
1414
from .._compat import old_positionals
15-
from .._utils import check_nonnegative_integers
15+
from .._utils import (
16+
check_nonnegative_integers,
17+
raise_not_implemented_error_if_backed_type,
18+
)
1619
from ..get import _check_mask
1720
from ..preprocessing._utils import _get_mean_var
1821

@@ -134,6 +137,7 @@ def __init__(
134137
if use_raw and adata.raw is not None:
135138
adata_comp = adata.raw
136139
X = adata_comp.X
140+
raise_not_implemented_error_if_backed_type(X, "rank_genes_groups")
137141

138142
# for correct getnnz calculation
139143
if issparse(X):
@@ -594,7 +598,6 @@ def rank_genes_groups(
594598
>>> # to visualize the results
595599
>>> sc.pl.rank_genes_groups(adata)
596600
"""
597-
598601
if mask_var is not None:
599602
mask_var = _check_mask(adata, mask_var, "var")
600603

scanpy/tools/_score_genes.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import pandas as pd
99
from scipy.sparse import issparse
1010

11-
from scanpy._utils import _check_use_raw
11+
from scanpy._utils import _check_use_raw, is_backed_type
1212

1313
from .. import logging as logg
1414
from .._compat import old_positionals
@@ -115,6 +115,10 @@ def score_genes(
115115
start = logg.info(f"computing score {score_name!r}")
116116
adata = adata.copy() if copy else adata
117117
use_raw = _check_use_raw(adata, use_raw)
118+
if is_backed_type(adata.X) and not use_raw:
119+
raise NotImplementedError(
120+
f"score_genes is not implemented for matrices of type {type(adata.X)}"
121+
)
118122

119123
if random_state is not None:
120124
np.random.seed(random_state)

scanpy/tools/_tsne.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from .. import logging as logg
99
from .._compat import old_positionals
1010
from .._settings import settings
11-
from .._utils import _doc_params
11+
from .._utils import _doc_params, raise_not_implemented_error_if_backed_type
1212
from ..neighbors._doc import doc_n_pcs, doc_use_rep
1313
from ._utils import _choose_representation
1414

@@ -106,6 +106,7 @@ def tsne(
106106
start = logg.info("computing tSNE")
107107
adata = adata.copy() if copy else adata
108108
X = _choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs)
109+
raise_not_implemented_error_if_backed_type(X, "tsne")
109110
# params for sklearn
110111
n_jobs = settings.n_jobs if n_jobs is None else n_jobs
111112
params_sklearn = dict(

src/testing/scanpy/_pytest/fixtures/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from .data import (
1515
_pbmc3ks_parametrized_session,
16+
backed_adata,
1617
pbmc3k_parametrized,
1718
pbmc3k_parametrized_small,
1819
)
@@ -27,6 +28,7 @@
2728
"_pbmc3ks_parametrized_session",
2829
"pbmc3k_parametrized",
2930
"pbmc3k_parametrized_small",
31+
"backed_adata",
3032
]
3133

3234

0 commit comments

Comments
 (0)