diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index 1ec082c832..a5971f0a76 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -199,7 +199,7 @@ def _highly_variable_pearson_residuals( sums_genes = np.array(X_batch.sum(axis=0)).ravel() sums_cells = np.array(X_batch.sum(axis=1)).ravel() - sum_total = np.sum(sums_genes).ravel() + sum_total = np.sum(sums_genes) residual_gene_var = calculate_res( sums_genes=sums_genes, diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 587f24cff6..5dd28c8748 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -73,7 +73,7 @@ def _highly_variable_genes_seurat_v3( if batch_key is None: batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int)) else: - batch_info = adata.obs[batch_key].values + batch_info = adata.obs[batch_key].to_numpy() norm_gene_vars = [] for b in np.unique(batch_info): @@ -159,24 +159,24 @@ def _highly_variable_genes_seurat_v3( " 'variances', float vector (adata.var)\n" " 'variances_norm', float vector (adata.var)" ) - adata.var["highly_variable"] = df["highly_variable"].values - adata.var["highly_variable_rank"] = df["highly_variable_rank"].values - adata.var["means"] = df["means"].values - adata.var["variances"] = df["variances"].values - adata.var["variances_norm"] = df["variances_norm"].values.astype( - "float64", copy=False + adata.var["highly_variable"] = df["highly_variable"].to_numpy() + adata.var["highly_variable_rank"] = df["highly_variable_rank"].to_numpy() + adata.var["means"] = df["means"].to_numpy() + adata.var["variances"] = df["variances"].to_numpy() + adata.var["variances_norm"] = ( + df["variances_norm"].to_numpy().astype("float64", copy=False) ) if batch_key is not None: adata.var["highly_variable_nbatches"] = df[ "highly_variable_nbatches" - ].values + ].to_numpy() if subset: - adata._inplace_subset_var(df["highly_variable"].values) + adata._inplace_subset_var(df["highly_variable"].to_numpy()) else: if batch_key is None: df = df.drop(["highly_variable_nbatches"], axis=1) if subset: - df = df.iloc[df.highly_variable.values, :] + df = df.iloc[df["highly_variable"].to_numpy(), :] return df @@ -233,7 +233,7 @@ def _highly_variable_genes_single_batch( # only a single gene fell in the bin and implicitly set them to have # a normalized disperion of 1 one_gene_per_bin = disp_std_bin.isnull() - gen_indices = np.where(one_gene_per_bin[df["mean_bin"].values])[0].tolist() + gen_indices = np.where(one_gene_per_bin[df["mean_bin"].to_numpy()])[0].tolist() if len(gen_indices) > 0: logg.debug( f"Gene indices {gen_indices} fell into a single bin: their " @@ -242,15 +242,15 @@ def _highly_variable_genes_single_batch( ) # Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32, # but there’s still a dtype error without “.value”. - disp_std_bin[one_gene_per_bin.values] = disp_mean_bin[ - one_gene_per_bin.values - ].values - disp_mean_bin[one_gene_per_bin.values] = 0 + disp_std_bin[one_gene_per_bin.to_numpy()] = disp_mean_bin[ + one_gene_per_bin.to_numpy() + ].to_numpy() + disp_mean_bin[one_gene_per_bin.to_numpy()] = 0 # actually do the normalization df["dispersions_norm"] = ( - df["dispersions"].values # use values here as index differs - - disp_mean_bin[df["mean_bin"].values].values - ) / disp_std_bin[df["mean_bin"].values].values + df["dispersions"].to_numpy() # use values here as index differs + - disp_mean_bin[df["mean_bin"].to_numpy()].to_numpy() + ) / disp_std_bin[df["mean_bin"].to_numpy()].to_numpy() elif flavor == "cell_ranger": from statsmodels import robust @@ -265,11 +265,12 @@ def _highly_variable_genes_single_batch( warnings.simplefilter("ignore") disp_mad_bin = disp_grouped.apply(robust.mad) df["dispersions_norm"] = ( - df["dispersions"].values - disp_median_bin[df["mean_bin"].values].values - ) / disp_mad_bin[df["mean_bin"].values].values + df["dispersions"].to_numpy() + - disp_median_bin[df["mean_bin"].to_numpy()].to_numpy() + ) / disp_mad_bin[df["mean_bin"].to_numpy()].to_numpy() else: raise ValueError('`flavor` needs to be "seurat" or "cell_ranger"') - dispersion_norm = df["dispersions_norm"].values + dispersion_norm = df["dispersions_norm"].to_numpy() if n_top_genes is not None: dispersion_norm = dispersion_norm[~np.isnan(dispersion_norm)] dispersion_norm[ @@ -285,7 +286,7 @@ def _highly_variable_genes_single_batch( ) n_top_genes = dispersion_norm.size disp_cut_off = dispersion_norm[n_top_genes - 1] - gene_subset = np.nan_to_num(df["dispersions_norm"].values) >= disp_cut_off + gene_subset = np.nan_to_num(df["dispersions_norm"].to_numpy()) >= disp_cut_off logg.debug( f"the {n_top_genes} top genes correspond to a " f"normalized dispersion cutoff of {disp_cut_off}" @@ -508,7 +509,7 @@ def highly_variable_genes( flavor=flavor, ) - hvg["gene"] = adata_subset.var_names.values + hvg["gene"] = adata_subset.var_names.to_numpy() if (n_removed := np.sum(~filt)) > 0: # Add 0 values for genes that were filtered out missing_hvg = pd.DataFrame( @@ -559,14 +560,14 @@ def highly_variable_genes( df = df.loc[adata.var_names, :] else: df = df.loc[adata.var_names] - dispersion_norm = df.dispersions_norm.values + dispersion_norm = df["dispersions_norm"].to_numpy() dispersion_norm[np.isnan(dispersion_norm)] = 0 # similar to Seurat gene_subset = np.logical_and.reduce( ( - df.means > min_mean, - df.means < max_mean, - df.dispersions_norm > min_disp, - df.dispersions_norm < max_disp, + df["means"] > min_mean, + df["means"] < max_mean, + df["dispersions_norm"] > min_disp, + df["dispersions_norm"] < max_disp, ) ) df["highly_variable"] = gene_subset @@ -582,25 +583,25 @@ def highly_variable_genes( " 'dispersions', float vector (adata.var)\n" " 'dispersions_norm', float vector (adata.var)" ) - adata.var["highly_variable"] = df["highly_variable"].values - adata.var["means"] = df["means"].values - adata.var["dispersions"] = df["dispersions"].values - adata.var["dispersions_norm"] = df["dispersions_norm"].values.astype( - "float32", copy=False + adata.var["highly_variable"] = df["highly_variable"].to_numpy() + adata.var["means"] = df["means"].to_numpy() + adata.var["dispersions"] = df["dispersions"].to_numpy() + adata.var["dispersions_norm"] = ( + df["dispersions_norm"].to_numpy().astype("float32", copy=False) ) if batch_key is not None: adata.var["highly_variable_nbatches"] = df[ "highly_variable_nbatches" - ].values + ].to_numpy() adata.var["highly_variable_intersection"] = df[ "highly_variable_intersection" - ].values + ].to_numpy() if subset: - adata._inplace_subset_var(df["highly_variable"].values) + adata._inplace_subset_var(df["highly_variable"].to_numpy()) else: if subset: - df = df.iloc[df.highly_variable.values, :] + df = df.loc[df["highly_variable"]] return df diff --git a/scanpy/testing/_pytest/fixtures/data.py b/scanpy/testing/_pytest/fixtures/data.py index 58523e9208..ec62ae7d74 100644 --- a/scanpy/testing/_pytest/fixtures/data.py +++ b/scanpy/testing/_pytest/fixtures/data.py @@ -66,7 +66,7 @@ def _prepare_pbmc_testdata( import scanpy as sc if small: - adata = adata[:1000, :500] + adata = adata[:1000, :500].copy() sc.pp.filter_cells(adata, min_genes=1) np.random.seed(42) adata.obs["batch"] = np.random.randint(0, 3, size=adata.shape[0]) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 9320753cdd..0ef36643cc 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pytest +from scipy import sparse import scanpy as sc from scanpy.testing._helpers import _check_check_values_warnings @@ -16,32 +17,39 @@ FILE_V3_BATCH = Path(__file__).parent / Path("_scripts/seurat_hvg_v3_batch.csv") -def test_highly_variable_genes_basic(): +def test_highly_variable_genes_runs(): adata = sc.datasets.blobs() sc.pp.highly_variable_genes(adata) + +def test_highly_variable_genes_supports_batch(): adata = sc.datasets.blobs() - np.random.seed(0) - adata.obs["batch"] = np.random.binomial(3, 0.5, size=(adata.n_obs)) - adata.obs["batch"] = adata.obs["batch"].astype("category") + gen = np.random.default_rng(0) + adata.obs["batch"] = pd.array( + gen.binomial(3, 0.5, size=adata.n_obs), dtype="category" + ) sc.pp.highly_variable_genes(adata, batch_key="batch") assert "highly_variable_nbatches" in adata.var.columns assert "highly_variable_intersection" in adata.var.columns + +def test_highly_variable_genes_supports_layers(): adata = sc.datasets.blobs() - batch = np.random.binomial(4, 0.5, size=(adata.n_obs)) - adata.obs["batch"] = batch - adata.obs["batch"] = adata.obs["batch"].astype("category") + gen = np.random.default_rng(0) + adata.obs["batch"] = pd.array( + gen.binomial(4, 0.5, size=adata.n_obs), dtype="category" + ) sc.pp.highly_variable_genes(adata, batch_key="batch", n_top_genes=3) assert "highly_variable_nbatches" in adata.var.columns assert adata.var["highly_variable"].sum() == 3 highly_var_first_layer = adata.var["highly_variable"] adata = sc.datasets.blobs() + assert isinstance(adata.X, np.ndarray) new_layer = adata.X.copy() - np.random.shuffle(new_layer) + gen.shuffle(new_layer) adata.layers["test_layer"] = new_layer - adata.obs["batch"] = batch + adata.obs["batch"] = gen.binomial(4, 0.5, size=(adata.n_obs)) adata.obs["batch"] = adata.obs["batch"].astype("category") sc.pp.highly_variable_genes( adata, batch_key="batch", n_top_genes=3, layer="test_layer" @@ -50,15 +58,23 @@ def test_highly_variable_genes_basic(): assert adata.var["highly_variable"].sum() == 3 assert (highly_var_first_layer != adata.var["highly_variable"]).any() + +def test_highly_variable_genes_no_batch_matches_batch(): + adata = sc.datasets.blobs() sc.pp.highly_variable_genes(adata) - no_batch_hvg = adata.var.highly_variable.copy() + no_batch_hvg = adata.var["highly_variable"].copy() assert no_batch_hvg.any() adata.obs["batch"] = "batch" adata.obs["batch"] = adata.obs["batch"].astype("category") sc.pp.highly_variable_genes(adata, batch_key="batch") - assert np.all(no_batch_hvg == adata.var.highly_variable) - assert np.all(adata.var.highly_variable_intersection == adata.var.highly_variable) + assert np.all(no_batch_hvg == adata.var["highly_variable"]) + assert np.all( + adata.var["highly_variable_intersection"] == adata.var["highly_variable"] + ) + +def test_highly_variable_genes_(): + adata = sc.datasets.blobs() adata.obs["batch"] = np.tile(["a", "b"], adata.shape[0] // 2) sc.pp.highly_variable_genes(adata, batch_key="batch") assert adata.var["highly_variable"].any() @@ -72,6 +88,7 @@ def test_highly_variable_genes_basic(): "highly_variable", ] hvg_df = sc.pp.highly_variable_genes(adata, batch_key="batch", inplace=False) + assert hvg_df is not None assert np.all(np.isin(colnames, hvg_df.columns)) @@ -83,6 +100,7 @@ def test_highly_variable_genes_keep_layer(base, flavor): sc.pp.filter_genes(adata, min_counts=1) sc.pp.log1p(adata, base=base) + assert isinstance(adata.X, sparse.csr_matrix) X_orig = adata.X.copy() if flavor == "seurat": @@ -95,13 +113,13 @@ def test_highly_variable_genes_keep_layer(base, flavor): assert np.allclose(X_orig.A, adata.X.A) -def _check_pearson_hvg_columns(output_df, n_top_genes): +def _check_pearson_hvg_columns(output_df: pd.DataFrame, n_top_genes: int): assert pd.api.types.is_float_dtype(output_df["residual_variances"].dtype) - assert output_df["highly_variable"].values.dtype is np.dtype("bool") + assert output_df["highly_variable"].to_numpy().dtype is np.dtype("bool") assert np.sum(output_df["highly_variable"]) == n_top_genes - assert np.nanmax(output_df["highly_variable_rank"].values) <= n_top_genes - 1 + assert np.nanmax(output_df["highly_variable_rank"].to_numpy()) <= n_top_genes - 1 def test_highly_variable_genes_pearson_residuals_inputchecks(pbmc3k_parametrized_small): @@ -138,10 +156,12 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(pbmc3k_parametrized ) -@pytest.mark.parametrize("subset", [True, False]) -@pytest.mark.parametrize("clip", [None, np.Inf, 30]) -@pytest.mark.parametrize("theta", [100, np.Inf]) -@pytest.mark.parametrize("n_top_genes", [100, 200]) +@pytest.mark.parametrize("subset", [True, False], ids=["subset", "full"]) +@pytest.mark.parametrize( + "clip", [None, np.Inf, 30], ids=["noclip", "infclip", "30clip"] +) +@pytest.mark.parametrize("theta", [100, np.Inf], ids=["100theta", "inftheta"]) +@pytest.mark.parametrize("n_top_genes", [100, 200], ids=["100n", "200n"]) def test_highly_variable_genes_pearson_residuals_general( pbmc3k_parametrized_small, subset, clip, theta, n_top_genes ): @@ -150,10 +170,11 @@ def test_highly_variable_genes_pearson_residuals_general( del adata.var # compute reference output - residuals_reference = sc.experimental.pp.normalize_pearson_residuals( + residuals_res = sc.experimental.pp.normalize_pearson_residuals( adata, clip=clip, theta=theta, inplace=False - )["X"] - residual_variances_reference = np.var(residuals_reference, axis=0) + ) + assert isinstance(residuals_res, dict) + residual_variances_reference = np.var(residuals_res["X"], axis=0) if subset: # lazyly sort by residual variance and take top N @@ -171,6 +192,7 @@ def test_highly_variable_genes_pearson_residuals_general( clip=clip, theta=theta, ) + assert output_df is not None sc.experimental.pp.highly_variable_genes( adata, @@ -198,32 +220,32 @@ def test_highly_variable_genes_pearson_residuals_general( # check consistency with normalization method if subset: # sort values before comparing as reference is sorted as well for subset case - sort_output_idx = np.argsort(-output_df["residual_variances"].values) + sort_output_idx = np.argsort(-output_df["residual_variances"].to_numpy()) assert np.allclose( - output_df["residual_variances"].values[sort_output_idx], + output_df["residual_variances"].to_numpy()[sort_output_idx], residual_variances_reference, ) else: assert np.allclose( - output_df["residual_variances"].values, residual_variances_reference + output_df["residual_variances"].to_numpy(), residual_variances_reference ) # check hvg flag hvg_idx = np.where(output_df["highly_variable"])[0] topn_idx = np.sort( - np.argsort(-output_df["residual_variances"].values)[:n_top_genes] + np.argsort(-output_df["residual_variances"].to_numpy())[:n_top_genes] ) assert np.all(hvg_idx == topn_idx) # check ranks - assert np.nanmin(output_df["highly_variable_rank"].values) == 0 + assert np.nanmin(output_df["highly_variable_rank"].to_numpy()) == 0 # more general checks on ranks, hvg flag and residual variance _check_pearson_hvg_columns(output_df, n_top_genes) -@pytest.mark.parametrize("subset", [True, False]) -@pytest.mark.parametrize("n_top_genes", [100, 200]) +@pytest.mark.parametrize("subset", [True, False], ids=["subset", "full"]) +@pytest.mark.parametrize("n_top_genes", [100, 200], ids=["100n", "200n"]) def test_highly_variable_genes_pearson_residuals_batch( pbmc3k_parametrized_small, subset, n_top_genes ): @@ -240,6 +262,7 @@ def test_highly_variable_genes_pearson_residuals_batch( subset=subset, inplace=False, ) + assert output_df is not None sc.experimental.pp.highly_variable_genes( adata, @@ -270,7 +293,9 @@ def test_highly_variable_genes_pearson_residuals_batch( # check intersection flag nbatches = len(np.unique(adata.obs["batch"])) - assert output_df["highly_variable_intersection"].values.dtype is np.dtype("bool") + assert output_df["highly_variable_intersection"].to_numpy().dtype is np.dtype( + "bool" + ) assert np.sum(output_df["highly_variable_intersection"]) <= n_top_genes * nbatches assert np.all(output_df["highly_variable"][output_df.highly_variable_intersection]) @@ -278,9 +303,9 @@ def test_highly_variable_genes_pearson_residuals_batch( assert pd.api.types.is_float_dtype(output_df["highly_variable_rank"].dtype) # check nbatches - assert output_df["highly_variable_nbatches"].values.dtype is np.dtype("int") - assert np.min(output_df["highly_variable_nbatches"].values) >= 0 - assert np.max(output_df["highly_variable_nbatches"].values) <= nbatches + assert output_df["highly_variable_nbatches"].to_numpy().dtype is np.dtype("int") + assert np.min(output_df["highly_variable_nbatches"].to_numpy()) >= 0 + assert np.max(output_df["highly_variable_nbatches"].to_numpy()) <= nbatches # check subsetting if subset: @@ -289,7 +314,7 @@ def test_highly_variable_genes_pearson_residuals_batch( assert len(output_df) == n_genes -def test_higly_variable_genes_compare_to_seurat(): +def test_highly_variable_genes_compare_to_seurat(): seurat_hvg_info = pd.read_csv(FILE, sep=" ") pbmc = pbmc68k_reduced() @@ -329,7 +354,7 @@ def test_higly_variable_genes_compare_to_seurat(): @needs.skmisc -def test_higly_variable_genes_compare_to_seurat_v3(): +def test_highly_variable_genes_compare_to_seurat_v3(): seurat_hvg_info = pd.read_csv( FILE_V3, sep=" ", dtype={"variances_norm": np.float64} ) @@ -371,6 +396,7 @@ def test_higly_variable_genes_compare_to_seurat_v3(): df = sc.pp.highly_variable_genes( pbmc, n_top_genes=4000, flavor="seurat_v3", batch_key="batch", inplace=False ) + assert df is not None df.sort_values( ["highly_variable_nbatches", "highly_variable_rank"], ascending=[False, True], @@ -383,12 +409,12 @@ def test_higly_variable_genes_compare_to_seurat_v3(): ) # ranks might be slightly different due to many genes having same normalized var - seu = pd.Index(seurat_hvg_info_batch["x"].values) + seu = pd.Index(seurat_hvg_info_batch["x"].to_numpy()) assert len(seu.intersection(df.index)) / 4000 > 0.95 @needs.skmisc -def test_higly_variable_genes_seurat_v3_warning(): +def test_highly_variable_genes_seurat_v3_warning(): pbmc = pbmc3k()[:200].copy() sc.pp.log1p(pbmc) with pytest.warns( @@ -462,9 +488,11 @@ def test_highly_variable_genes_batches(): hvg1 = sc.pp.highly_variable_genes( adata_1, flavor="cell_ranger", n_top_genes=200, inplace=False ) + assert hvg1 is not None hvg2 = sc.pp.highly_variable_genes( adata_2, flavor="cell_ranger", n_top_genes=200, inplace=False ) + assert hvg2 is not None np.testing.assert_allclose( adata.var["dispersions_norm"].iat[100], @@ -529,13 +557,9 @@ def test_cellranger_n_top_genes_warning(): @pytest.mark.parametrize("flavor", ["seurat", "cell_ranger"]) -@pytest.mark.parametrize("subset", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_highly_variable_genes_subset_inplace_consistency( - flavor, - subset, - inplace, -): +@pytest.mark.parametrize("subset", [True, False], ids=["subset", "full"]) +@pytest.mark.parametrize("inplace", [True, False], ids=["inplace", "copy"]) +def test_highly_variable_genes_subset_inplace_consistency(flavor, subset, inplace): adata = sc.datasets.blobs(n_observations=20, n_variables=80, random_state=0) adata.X = np.abs(adata.X).astype(int)