diff --git a/docs/release-notes/3351.bugfix.md b/docs/release-notes/3351.bugfix.md new file mode 100644 index 0000000000..ba7d435bff --- /dev/null +++ b/docs/release-notes/3351.bugfix.md @@ -0,0 +1 @@ +Fix zappy compatibility for clip_array {smaller}`P Angerer` diff --git a/src/scanpy/preprocessing/_scale.py b/src/scanpy/preprocessing/_scale.py index 4e64c7ba9b..bbdd95dfac 100644 --- a/src/scanpy/preprocessing/_scale.py +++ b/src/scanpy/preprocessing/_scale.py @@ -10,7 +10,7 @@ from anndata import AnnData from .. import logging as logg -from .._compat import CSBase, CSCBase, DaskArray, njit, old_positionals +from .._compat import CSBase, CSCBase, CSRBase, DaskArray, njit, old_positionals from .._utils import ( _check_array_function_arguments, axis_mul_or_truediv, @@ -28,18 +28,29 @@ da = None if TYPE_CHECKING: - from numpy.typing import NDArray + from typing import TypeVar + from numpy.typing import ArrayLike, NDArray -@njit -def _scale_sparse_numba(indptr, indices, data, *, std, mask_obs, clip): - for i in numba.prange(len(indptr) - 1): - if mask_obs[i]: - for j in range(indptr[i], indptr[i + 1]): - if clip: - data[j] = min(clip, data[j] / std[indices[j]]) - else: - data[j] /= std[indices[j]] + _A = TypeVar("_A", bound=CSBase | np.ndarray | DaskArray) + + +@singledispatch +def clip(x: ArrayLike | _A, *, max_value: float, zero_center: bool = True) -> _A: + return clip_array(x, max_value=max_value, zero_center=zero_center) + + +@clip.register(CSBase) +def _(x: CSBase, *, max_value: float, zero_center: bool = True) -> CSBase: + x.data = clip(x.data, max_value=max_value, zero_center=zero_center) + return x + + +@clip.register(DaskArray) +def _(x: DaskArray, *, max_value: float, zero_center: bool = True) -> DaskArray: + return x.map_blocks( + clip, max_value=max_value, zero_center=zero_center, dtype=x.dtype, meta=x._meta + ) @njit @@ -62,19 +73,11 @@ def clip_array( return X -def clip_set(x: CSBase, *, max_value: float, zero_center: bool = True) -> CSBase: - x = x.copy() - x[x > max_value] = max_value - if zero_center: - x[x < -max_value] = -max_value - return x - - @renamed_arg("X", "data", pos_0=True) @old_positionals("zero_center", "max_value", "copy", "layer", "obsm") @singledispatch def scale( - data: AnnData | CSBase | np.ndarray | DaskArray, + data: AnnData | _A, *, zero_center: bool = True, max_value: float | None = None, @@ -82,7 +85,7 @@ def scale( layer: str | None = None, obsm: str | None = None, mask_obs: NDArray[np.bool_] | str | None = None, -) -> AnnData | CSBase | np.ndarray | DaskArray | None: +) -> AnnData | _A | None: """Scale data to unit variance and zero mean. .. note:: @@ -117,7 +120,7 @@ def scale( ------- Returns `None` if `copy=False`, else returns an updated `AnnData` object. Sets the following fields: - `adata.X` | `adata.layers[layer]` : :class:`numpy.ndarray` | :class:`scipy.sparse._csr.csr_matrix` (dtype `float`) + `adata.X` | `adata.layers[layer]` : :class:`numpy.ndarray` | :class:`scipy.sparse.csr_matrix` (dtype `float`) Scaled count data matrix. `adata.var['mean']` : :class:`pandas.Series` (dtype `float`) Means per gene before scaling. @@ -141,8 +144,9 @@ def scale( @scale.register(np.ndarray) @scale.register(DaskArray) -def scale_array( # noqa: PLR0912 - X: np.ndarray | DaskArray, +@scale.register(CSBase) +def scale_array( + x: _A, *, zero_center: bool = True, max_value: float | None = None, @@ -150,141 +154,133 @@ def scale_array( # noqa: PLR0912 return_mean_std: bool = False, mask_obs: NDArray[np.bool_] | None = None, ) -> ( - np.ndarray - | DaskArray + _A | tuple[ - np.ndarray | DaskArray, NDArray[np.float64] | DaskArray, NDArray[np.float64] + _A, + NDArray[np.float64] | DaskArray, + NDArray[np.float64], ] ): if copy: - X = X.copy() - mask_obs = _check_mask(X, mask_obs, "obs") - if mask_obs is not None: - scale_rv = scale_array( - X[mask_obs, :], - zero_center=zero_center, - max_value=max_value, - copy=False, - return_mean_std=return_mean_std, - mask_obs=None, - ) - - if return_mean_std: - X[mask_obs, :], mean, std = scale_rv - return X, mean, std - else: - X[mask_obs, :] = scale_rv - return X + x = x.copy() if not zero_center and max_value is not None: logg.info( # Be careful of what? This should be more specific "... be careful when using `max_value` without `zero_center`." ) - if np.issubdtype(X.dtype, np.integer): + if np.issubdtype(x.dtype, np.integer): logg.info( "... as scaling leads to float results, integer " "input is cast to float, returning copy." ) - X = X.astype(float) + x = x.astype(np.float64) + + mask_obs = ( + # For CSR matrices, default to a set mask to take the `scale_array_masked` path. + # This is faster than the maskless `axis_mul_or_truediv` path. + np.ones(x.shape[0], dtype=np.bool_) + if isinstance(x, CSRBase) and mask_obs is None and not zero_center + else _check_mask(x, mask_obs, "obs") + ) + if mask_obs is not None: + return scale_array_masked( + x, + mask_obs, + zero_center=zero_center, + max_value=max_value, + return_mean_std=return_mean_std, + ) - mean, var = _get_mean_var(X) + mean, var = _get_mean_var(x) std = np.sqrt(var) std[std == 0] = 1 if zero_center: - if isinstance(X, DaskArray) and isinstance(X._meta, CSBase): - warnings.warn( - "zero-center being used with `DaskArray` sparse chunks. " - "This can be bad if you have large chunks or intend to eventually read the whole data into memory.", - UserWarning, - stacklevel=2, - ) - X -= mean - - out = X if isinstance(X, np.ndarray | CSBase) else None - X = axis_mul_or_truediv(X, std, op=truediv, out=out, axis=1) + if isinstance(x, CSBase) or ( + isinstance(x, DaskArray) and isinstance(x._meta, CSBase) + ): + msg = "zero-centering a sparse array/matrix densifies it." + warnings.warn(msg, UserWarning, stacklevel=2) + x -= mean + + x = axis_mul_or_truediv( + x, + std, + op=truediv, + out=x if isinstance(x, np.ndarray | CSBase) else None, + axis=1, + ) # do the clipping if max_value is not None: - logg.debug(f"... clipping at max_value {max_value}") - if isinstance(X, DaskArray): - clip = clip_set if isinstance(X._meta, CSBase) else clip_array - X = X.map_blocks(clip, max_value=max_value, zero_center=zero_center) - elif isinstance(X, CSBase): - X.data = clip_array(X.data, max_value=max_value, zero_center=False) - else: - X = clip_array(X, max_value=max_value, zero_center=zero_center) + x = clip(x, max_value=max_value, zero_center=zero_center) if return_mean_std: - return X, mean, std + return x, mean, std else: - return X + return x -@scale.register(CSBase) -def scale_sparse( - X: CSBase, +def scale_array_masked( + x: _A, + mask_obs: NDArray[np.bool_], *, zero_center: bool = True, max_value: float | None = None, - copy: bool = False, return_mean_std: bool = False, - mask_obs: NDArray[np.bool_] | None = None, -) -> np.ndarray | tuple[np.ndarray, NDArray[np.float64], NDArray[np.float64]]: - # need to add the following here to make inplace logic work - if zero_center: - logg.info( - "... as `zero_center=True`, sparse input is " - "densified and may lead to large memory consumption" - ) - X = X.toarray() - copy = False # Since the data has been copied - return scale_array( - X, - zero_center=zero_center, - copy=copy, - max_value=max_value, - return_mean_std=return_mean_std, +) -> ( + _A + | tuple[ + _A, + NDArray[np.float64] | DaskArray, + NDArray[np.float64], + ] +): + if isinstance(x, CSBase) and not zero_center: + if isinstance(x, CSCBase): + x = x.tocsr() + mean, var = _get_mean_var(x[mask_obs, :]) + std = np.sqrt(var) + std[std == 0] = 1 + + scale_and_clip_csr( + x.indptr, + x.indices, + x.data, + std=std, mask_obs=mask_obs, + max_value=max_value, ) - elif mask_obs is None: - return scale_array( - X, + else: + x[mask_obs, :], mean, std = scale_array( + x[mask_obs, :], zero_center=zero_center, - copy=copy, max_value=max_value, - return_mean_std=return_mean_std, - mask_obs=mask_obs, + return_mean_std=True, ) - else: - if isinstance(X, CSCBase): - X = X.tocsr() - elif copy: - X = X.copy() - - if mask_obs is not None: - mask_obs = _check_mask(X, mask_obs, "obs") - - mean, var = _get_mean_var(X[mask_obs, :]) - - std = np.sqrt(var) - std[std == 0] = 1 - - if max_value is None: - max_value = 0 - - _scale_sparse_numba( - X.indptr, - X.indices, - X.data, - std=std.astype(X.dtype), - mask_obs=mask_obs, - clip=max_value, - ) if return_mean_std: - return X, mean, std + return x, mean, std else: - return X + return x + + +@njit +def scale_and_clip_csr( + indptr: NDArray[np.integer], + indices: NDArray[np.integer], + data: NDArray[np.floating], + *, + std: NDArray[np.floating], + mask_obs: NDArray[np.bool_], + max_value: float | None, +) -> None: + for i in numba.prange(len(indptr) - 1): + if mask_obs[i]: + for j in range(indptr[i], indptr[i + 1]): + if max_value is not None: + data[j] = min(max_value, data[j] / std[indices[j]]) + else: + data[j] /= std[indices[j]] @scale.register(AnnData) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 75312ee01b..2473efdd4c 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -23,7 +23,7 @@ maybe_dask_process_context, ) from testing.scanpy._helpers.data import pbmc3k, pbmc68k_reduced -from testing.scanpy._pytest.params import ARRAY_TYPES +from testing.scanpy._pytest.params import ARRAY_TYPES, ARRAY_TYPES_SPARSE if TYPE_CHECKING: from collections.abc import Callable @@ -318,18 +318,13 @@ def test_scale_matrix_types(array_type, zero_center, max_value): assert_allclose(X, adata.X, rtol=1e-5, atol=1e-5) -ARRAY_TYPES_DASK_SPARSE = [ - a for a in ARRAY_TYPES if "sparse" in a.id and "dask" in a.id -] - - -@pytest.mark.parametrize("array_type", ARRAY_TYPES_DASK_SPARSE) +@pytest.mark.parametrize("array_type", ARRAY_TYPES_SPARSE) def test_scale_zero_center_warns_dask_sparse(array_type): adata = pbmc68k_reduced() adata.X = adata.raw.X adata_casted = adata.copy() adata_casted.X = array_type(adata_casted.raw.X) - with pytest.warns(UserWarning, match="zero-center being used with `DaskArray`*"): + with pytest.warns(UserWarning, match="zero-center.*sparse"): sc.pp.scale(adata_casted) sc.pp.scale(adata) assert_allclose(adata_casted.X, adata.X, rtol=1e-5, atol=1e-5) diff --git a/tests/test_scaling.py b/tests/test_scaling.py index 1de9073955..cc7078c986 100644 --- a/tests/test_scaling.py +++ b/tests/test_scaling.py @@ -73,46 +73,36 @@ [np.array, sparse.csr_matrix, sparse.csc_matrix], # noqa: TID251 ids=lambda x: x.__name__, ) -@pytest.mark.parametrize("dtype", ["float32", "int64"]) +@pytest.mark.parametrize("container", ["anndata", "array"]) +@pytest.mark.parametrize("dtype", [np.float32, np.int64]) +@pytest.mark.parametrize("zero_center", [True, False], ids=["center", "no_center"]) @pytest.mark.parametrize( - ("mask_obs", "X", "X_centered", "X_scaled"), + ("mask_obs", "x", "x_centered", "x_scaled"), [ - (None, X_original, X_centered_original, X_scaled_original), - ( + pytest.param( + None, X_original, X_centered_original, X_scaled_original, id="no_mask" + ), + pytest.param( np.array((0, 0, 1, 1, 1, 0, 0), dtype=bool), X_for_mask, X_centered_for_mask, X_scaled_for_mask, + id="mask", ), ], ) -def test_scale(*, typ, dtype, mask_obs, X, X_centered, X_scaled): - # test AnnData arguments - # test scaling with default zero_center == True - adata0 = AnnData(typ(X).astype(dtype)) - sc.pp.scale(adata0, mask_obs=mask_obs) - assert np.allclose(sparse.csr_matrix(adata0.X).toarray(), X_centered) # noqa: TID251 - # test scaling with explicit zero_center == True - adata1 = AnnData(typ(X).astype(dtype)) - sc.pp.scale(adata1, zero_center=True, mask_obs=mask_obs) - assert np.allclose(sparse.csr_matrix(adata1.X).toarray(), X_centered) # noqa: TID251 - # test scaling with explicit zero_center == False - adata2 = AnnData(typ(X).astype(dtype)) - sc.pp.scale(adata2, zero_center=False, mask_obs=mask_obs) - assert np.allclose(sparse.csr_matrix(adata2.X).toarray(), X_scaled) # noqa: TID251 - # test bare count arguments, for simplicity only with explicit copy=True - # test scaling with default zero_center == True - data0 = typ(X, dtype=dtype) - cdata0 = sc.pp.scale(data0, copy=True, mask_obs=mask_obs) - assert np.allclose(sparse.csr_matrix(cdata0).toarray(), X_centered) # noqa: TID251 - # test scaling with explicit zero_center == True - data1 = typ(X, dtype=dtype) - cdata1 = sc.pp.scale(data1, zero_center=True, copy=True, mask_obs=mask_obs) - assert np.allclose(sparse.csr_matrix(cdata1).toarray(), X_centered) # noqa: TID251 - # test scaling with explicit zero_center == False - data2 = typ(X, dtype=dtype) - cdata2 = sc.pp.scale(data2, zero_center=False, copy=True, mask_obs=mask_obs) - assert np.allclose(sparse.csr_matrix(cdata2).toarray(), X_scaled) # noqa: TID251 +def test_scale( + *, typ, container, zero_center, dtype, mask_obs, x, x_centered, x_scaled +): + x = AnnData(typ(x, dtype=dtype)) if container == "anndata" else typ(x, dtype=dtype) + scaled = sc.pp.scale( + x, zero_center=zero_center, copy=container == "array", mask_obs=mask_obs + ) + received = sparse.csr_matrix( # noqa: TID251 + x.X if scaled is None else scaled + ).toarray() + expected = x_centered if zero_center else x_scaled + assert np.allclose(received, expected) def test_mask_string():