From 81daa092e0fbb741d1144df8c93deeadf3355ba9 Mon Sep 17 00:00:00 2001
From: Lukas Heumos
Date: Thu, 18 Mar 2021 10:14:27 +0100
Subject: [PATCH 1/3] Add pre-commit changes

---
 .flake8                                     | 37 +++++++++++
 .pre-commit-config.yaml                     |  9 +++
 docs/dev/code.rst                           |  7 ++-
 scanpy/_utils.py                            |  6 +-
 scanpy/external/pl.py                       | 14 ++---
 scanpy/external/pp/_scrublet.py             | 62 +++++++++----------
 scanpy/external/tl/_trimap.py               |  2 +-
 scanpy/get/get.py                           |  6 +-
 scanpy/neighbors/__init__.py                | 34 +++++-----
 scanpy/plotting/__init__.py                 |  2 +-
 scanpy/plotting/_anndata.py                 |  6 +-
 scanpy/plotting/_baseplot_class.py          |  4 +-
 scanpy/plotting/_matrixplot.py              |  2 +-
 scanpy/plotting/_tools/__init__.py          |  6 +-
 scanpy/plotting/_tools/paga.py              |  6 +-
 scanpy/plotting/_tools/scatterplots.py      |  8 +--
 scanpy/plotting/_utils.py                   | 14 ++---
 scanpy/plotting/palettes.py                 |  4 +-
 .../_deprecated/highly_variable_genes.py    |  3 +-
 .../preprocessing/_highly_variable_genes.py |  4 +-
 scanpy/preprocessing/_pca.py                |  2 +-
 scanpy/preprocessing/_simple.py             |  8 ++-
 scanpy/readwrite.py                         |  8 +--
 scanpy/tests/conftest.py                    |  1 +
 scanpy/tests/external/test_scrublet.py      |  6 +-
 scanpy/tests/test_embedding_plots.py        |  2 +-
 scanpy/tests/test_logging.py                | 38 ++++++------
 scanpy/tests/test_pca.py                    |  2 -
 scanpy/tests/test_plotting.py               |  6 +-
 .../tests/test_preprocessing_distributed.py |  6 +-
 scanpy/tests/test_scaling.py                |  4 +-
 scanpy/tools/_diffmap.py                    |  7 +++
 scanpy/tools/_dpt.py                        | 14 +++--
 scanpy/tools/_paga.py                       |  1 -
 scanpy/tools/_sim.py                        | 29 +++++----
 scanpy/tools/_top_genes.py                  | 33 +++++-----
 36 files changed, 231 insertions(+), 172 deletions(-)
 create mode 100644 .flake8

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000..63a2344286
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,37 @@
+# Can't yet be moved to the pyproject.toml due to https://gitlab.com/pycqa/flake8/-/issues/428#note_251982786
+[flake8]
+max-line-length = 88
+ignore =
+    # module imported but unused -> required for Scanpy's API
+    F401,
+    # line break before a binary operator -> black does not adhere to PEP8
+    W503,
+    # line break occurred after a binary operator -> black does not adhere to PEP8
+    W504,
+    # line too long -> we accept long comment lines; black gets rid of long code lines
+    E501,
+    # whitespace before : -> black does not adhere to PEP8
+    E203,
+    # missing whitespace after ',', ';', or ':' -> black does not adhere to PEP8
+    E231,
+    # module level import not at top of file -> required to circumvent circular imports for Scanpy's API
+    E402,
+    # continuation line over-indented for hanging indent -> black does not adhere to PEP8
+    E126,
+    # inline comment should start with '# ' -> Scanpy allows them for specific explanations
+    E262,
+    # too many leading '#' for block comment -> Scanpy allows them for comments into sections
+    E266,
+    # do not assign a lambda expression, use a def -> Scanpy allows lambda expression assignments
+    E731,
+    # allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation
+    E741
+per-file-ignores =
+    # F811 Redefinition of unused name from line, does not play nice with pytest fixtures
+    tests/test*.py: F811
+exclude =
+    .git,
+    __pycache__,
+    build,
+    docs/_build,
+    dist,
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 93ae9e8fd7..95a96ae781 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,3 +3,12 @@ repos:
   rev: 20.8b1
   hooks:
   - id: black
+- repo: https://gitlab.com/pycqa/flake8
+  rev: 3.8.4
+  hooks:
+  - id: flake8
+- repo: https://github.com/pre-commit/mirrors-autopep8
+  rev: v1.5.5
+  hooks:
+  - id: autopep8
+    args: ["-i"]
diff --git a/docs/dev/code.rst b/docs/dev/code.rst
index 218e73b4ae..4fc0a474f2 100644
--- a/docs/dev/code.rst
+++ b/docs/dev/code.rst
@@ -17,6 +17,11 @@ Code style

 New code should follow
 `Black `__
-and Scanpy’s
+and
+`flake8 `__.
+We ignore a couple of flake8 checks, which are documented in the .flake8 file in the root of this repository.
+To learn how to ignore checks per line, please read
+`flake8 violations `__.
+Additionally, we use Scanpy’s
 `EditorConfig `__,
 so using an editor/IDE with support for both is helpful.
diff --git a/scanpy/_utils.py b/scanpy/_utils.py
index 207f40f0b6..1aac2e925b 100644
--- a/scanpy/_utils.py
+++ b/scanpy/_utils.py
@@ -209,7 +209,7 @@ def get_igraph_from_adjacency(adjacency, directed=None):
     g.add_edges(list(zip(sources, targets)))
     try:
         g.es['weight'] = weights
-    except:
+    except KeyError:
         pass
     if g.vcount() != adjacency.shape[0]:
         logg.warning(
@@ -551,7 +551,9 @@ def warn_with_traceback(message, category, filename, lineno, file=None, line=Non
     import traceback

     traceback.print_stack()
-    log = file if hasattr(file, 'write') else sys.stderr
+    log = (  # noqa: F841  # TODO Does this need fixing?
+        file if hasattr(file, 'write') else sys.stderr
+    )
     settings.write(warnings.formatwarning(message, category, filename, lineno, line))
diff --git a/scanpy/external/pl.py b/scanpy/external/pl.py
index b35310591e..3a59e99b61 100644
--- a/scanpy/external/pl.py
+++ b/scanpy/external/pl.py
@@ -332,15 +332,15 @@ def scrublet_score_distribution(
     figsize: Optional[Tuple[float, float]] = (8, 3),
 ):
     """\
-    Plot histogram of doublet scores for observed transcriptomes and simulated doublets. 
+    Plot histogram of doublet scores for observed transcriptomes and simulated doublets.
+
+    The histogram for simulated doublets is useful for determining the correct doublet
+    score threshold.

-    The histogram for simulated doublets is useful for determining the correct doublet 
-    score threshold. 
-
     Parameters
     ----------
     adata
-        An annData object resulting from func:`~scanpy.external.scrublet`. 
+        An AnnData object resulting from :func:`~scanpy.external.scrublet`.
     scale_hist_obs
         Set y axis scale transformation in matplotlib for the plot of observed
         transcriptomes (e.g. "linear", "log", "symlog", "logit")
     scale_hist_sim
         Set y axis scale transformation in matplotlib for the plot of simulated
         doublets (e.g. "linear", "log", "symlog", "logit")
     figsize
         width, height

     See also
     --------
     :func:`~scanpy.external.pp.scrublet`: Main way of running Scrublet, runs
-        preprocessing, doublet simulation (this function) and calling. 
+        preprocessing, doublet simulation (this function) and calling.
     :func:`~scanpy.external.pp.scrublet_simulate_doublets`: Run Scrublet's doublet
-        simulation separately for advanced usage. 
+        simulation separately for advanced usage.
     """

     threshold = adata.uns['scrublet']['threshold']
diff --git a/scanpy/external/pp/_scrublet.py b/scanpy/external/pp/_scrublet.py
index 374f5b4fe9..6611fccac5 100644
--- a/scanpy/external/pp/_scrublet.py
+++ b/scanpy/external/pp/_scrublet.py
@@ -1,5 +1,5 @@
 from anndata import AnnData
-from typing import Collection, Tuple, Optional, Union
+from typing import Optional

 import numpy as np
 from scipy import sparse
@@ -40,7 +40,7 @@ def scrublet(
     and directly call functions of Scrublet(). You may also undertake your own
     preprocessing, simulate doublets with
     scanpy.external.pp.scrublet_simulate_doublets(), and run the core scrublet
-    function scanpy.external.pp.scrublet.scrublet(). 
+    function scanpy.external.pp.scrublet.scrublet().

     .. note::
         More information and bug reports `here `__.

     Parameters
     ----------
     adata
         The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows
         correspond to cells and columns to genes.
     adata_sim
         (Advanced use case) Optional AnnData object generated by
         sc.external.pp.scrublet_simulate_doublets(), with same number of vars
         as adata.
         This should have been built from adata_obs after filtering genes and
         cells and selecting highly-variable genes.
     sim_doublet_ratio
-        Number of doublets to simulate relative to the number of observed 
+        Number of doublets to simulate relative to the number of observed
         transcriptomes.
     expected_doublet_rate
         Where adata_sim is not supplied, the estimated doublet rate for the
         experiment.
     synthetic_doublet_umi_subsampling
         Rate for sampling UMIs when creating
         synthetic doublets. If 1.0, each doublet is created by simply adding
         the UMI counts from two randomly sampled observed transcriptomes. For
         values less than 1, the UMI counts are added and then randomly sampled
-        at the specified rate. 
-    knn_dist_metric 
+        at the specified rate.
+    knn_dist_metric
         Distance metric used when finding nearest neighbors. For list of
         valid values, see the documentation for annoy (if `use_approx_neighbors`
         is True) or sklearn.neighbors.NearestNeighbors (if `use_approx_neighbors`
         is False).
     log_transform
         Whether to use :func:`~scanpy.pp.log1p` to log-transform the data
         prior to PCA.
     mean_center
         If True, center the data such that each gene has a mean of 0.
         `sklearn.decomposition.PCA` will be used for dimensionality
         reduction.
-    n_prin_comps 
+    n_prin_comps
         Number of principal components used to embed the transcriptomes prior
-        to k-nearest-neighbor graph construction. 
+        to k-nearest-neighbor graph construction.
     use_approx_neighbors
-        Use approximate nearest neighbor method (annoy) for the KNN 
+        Use approximate nearest neighbor method (annoy) for the KNN
         classifier.
     get_doublet_neighbor_parents
         If True, return (in .uns) the parent transcriptomes that generated the
         doublet neighbors of each observed transcriptome. This information can
-        be used to infer the cell states that generated a given doublet state. 
+        be used to infer the cell states that generated a given doublet state.
     n_neighbors
         Number of neighbors used to construct the KNN graph of observed
         transcriptomes and simulated doublets. If ``None``, this is
@@ -133,7 +133,7 @@ def scrublet(
     ``adata.uns['scrublet']['doublet_scores_sim']``
         Doublet scores for each simulated doublet transcriptome

-    ``adata.uns['scrublet']['doublet_parents']`` 
+    ``adata.uns['scrublet']['doublet_parents']``
         Pairs of ``.obs_names`` used to generate each simulated doublet
         transcriptome

     ``uns['scrublet']['parameters']``
         Dictionary of Scrublet parameters

     See also
     --------
     :func:`~scanpy.external.pp.scrublet_simulate_doublets`: Run Scrublet's doublet
-        simulation separately for advanced usage. 
+        simulation separately for advanced usage.
     :func:`~scanpy.external.pl.scrublet_score_distribution`: Plot histogram of doublet
-        scores for observed transcriptomes and simulated doublets. 
+        scores for observed transcriptomes and simulated doublets.
     """
     try:
         import scrublet as sl
@@ -185,7 +185,7 @@ def scrublet(
             pp.highly_variable_genes(adata_obs, subset=True)
         else:
             logged = pp.log1p(adata_obs, copy=True)
-            hvg = pp.highly_variable_genes(logged)
+            _ = pp.highly_variable_genes(logged)
             adata_obs = adata_obs[:, logged.var['highly_variable']]

     # Simulate the doublets based on the raw expressions from the normalised
     # and filtered object.
@@ -257,7 +257,7 @@ def _scrublet_call_doublets(
     transcriptomes and simulated doublets. This is a wrapper around the core
     functions of `Scrublet `__ to allow for flexibility
     in applying Scanpy filtering operations upstream. Unless
-    you know what you're doing you should use the main scrublet() function. 
+    you know what you're doing, you should use the main scrublet() function.

     .. note::
         More information and bug reports `here `__.

     Parameters
     ----------
     mean_center
         If True, center the data such that each gene has a mean of 0.
         `sklearn.decomposition.PCA` will be used for dimensionality
         reduction, unless `mean_center` is True.
     n_prin_comps
         Number of principal components used to embed the transcriptomes prior
-        to k-nearest-neighbor graph construction. 
+        to k-nearest-neighbor graph construction.
     use_approx_neighbors
-        Use approximate nearest neighbor method (annoy) for the KNN 
+        Use approximate nearest neighbor method (annoy) for the KNN
         classifier.
     knn_dist_metric
         Distance metric used when finding nearest neighbors. For list of
         valid values, see the documentation for annoy (if `use_approx_neighbors`
         is True) or sklearn.neighbors.NearestNeighbors (if `use_approx_neighbors`
         is False).
     get_doublet_neighbor_parents
-        If True, return the parent transcriptomes that generated the 
-        doublet neighbors of each observed transcriptome. This information can 
-        be used to infer the cell states that generated a given 
-        doublet state. 
+        If True, return the parent transcriptomes that generated the
+        doublet neighbors of each observed transcriptome. This information can
+        be used to infer the cell states that generated a given
+        doublet state.
     threshold
         Doublet score threshold for calling a transcriptome a doublet. If
         `None`, this is set automatically by looking for the minimum between
         the two modes of the `doublet_scores_sim_` histogram. It is best
         practice to check the threshold visually using the
         `doublet_scores_sim_` histogram and/or based on co-localization of
         predicted doublets in a 2-D embedding.
     random_state
         Initial state for doublet simulation and nearest neighbors.
-    verbose 
+    verbose
         If True, print progress updates.

     Returns
     -------
     adata : anndata.AnnData
         if ``copy=True`` it returns or else adds fields to ``adata``:

     ``adata.obs['doublet_score']``
         Doublet scores for each observed transcriptome

     ``adata.uns['scrublet']['doublet_scores_sim']``
         Doublet scores for each simulated doublet transcriptome

-    ``adata.uns['scrublet']['doublet_parents']`` 
+    ``adata.uns['scrublet']['doublet_parents']``
         Pairs of ``.obs_names`` used to generate each simulated doublet
         transcriptome

     ``uns['scrublet']['parameters']``
         Dictionary of Scrublet parameters
@@ -453,16 +453,16 @@ def scrublet_simulate_doublets(
         The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows
         correspond to cells and columns to genes. Genes should have been
         filtered for expression and variability, and the object should contain
-        raw expression of the same dimensions. 
+        raw expression of the same dimensions.
     layer
-        Layer of adata where raw values are stored, or 'X' if values are in .X. 
+        Layer of adata where raw values are stored, or 'X' if values are in .X.
     sim_doublet_ratio
-        Number of doublets to simulate relative to the number of observed 
+        Number of doublets to simulate relative to the number of observed
         transcriptomes. If `None`, self.sim_doublet_ratio is used.
     synthetic_doublet_umi_subsampling
-        Rate for sampling UMIs when creating synthetic doublets. If 1.0, 
-        each doublet is created by simply adding the UMIs from two randomly 
-        sampled observed transcriptomes. For values less than 1, the 
+        Rate for sampling UMIs when creating synthetic doublets. If 1.0,
+        each doublet is created by simply adding the UMIs from two randomly
+        sampled observed transcriptomes. For values less than 1, the
         UMI counts are added and then randomly sampled at the specified
         rate.

     Returns
     -------
     adata : anndata.AnnData with simulated doublets in .X
         if ``copy=True`` it returns or else adds fields to ``adata``:

-    ``adata.uns['scrublet']['doublet_parents']`` 
+    ``adata.uns['scrublet']['doublet_parents']``
         Pairs of ``.obs_names`` used to generate each simulated doublet
         transcriptome

     ``uns['scrublet']['parameters']``
         Dictionary of Scrublet parameters

     See also
     --------
     :func:`~scanpy.external.pp.scrublet`: Main way of running Scrublet, runs
-        preprocessing, doublet simulation (this function) and calling. 
+        preprocessing, doublet simulation (this function) and calling.
     :func:`~scanpy.external.pl.scrublet_score_distribution`: Plot histogram of doublet
-        scores for observed transcriptomes and simulated doublets. 
+        scores for observed transcriptomes and simulated doublets.
     """
     try:
         import scrublet as sl
diff --git a/scanpy/external/tl/_trimap.py b/scanpy/external/tl/_trimap.py
index 5935d81f5a..5334e29fda 100644
--- a/scanpy/external/tl/_trimap.py
+++ b/scanpy/external/tl/_trimap.py
@@ -76,7 +76,7 @@ def trimap(

     Example
     -------
-    
+
     >>> import scanpy as sc
     >>> import scanpy.external as sce
     >>> pbmc = sc.datasets.pbmc68k_reduced()
diff --git a/scanpy/get/get.py b/scanpy/get/get.py
index 45c081687e..a90b84b70c 100644
--- a/scanpy/get/get.py
+++ b/scanpy/get/get.py
@@ -6,7 +6,7 @@
 from scipy.sparse import spmatrix
 from anndata import AnnData

-import warnings
+from .._compat import Literal

 # --------------------------------------------------------------------------------
 # Plotting data helpers
 # --------------------------------------------------------------------------------
@@ -96,7 +96,7 @@ def rank_genes_groups_df(
 def _check_indices(
     dim_df: pd.DataFrame,
     alt_index: pd.Index,
-    dim: "Literal['obs', 'var']",
+    dim: Literal['obs', 'var'],
     keys: List[str],
     alias_index: Optional[pd.Index] = None,
     use_raw: bool = False,
@@ -176,7 +176,7 @@ def _get_array_values(
     X,
     dim_names: pd.Index,
     keys: List[str],
-    axis: "Literal[0, 1]",
+    axis: Literal[0, 1],
     backed: bool,
 ):
     # TODO: This should be made easier on the anndata side
diff --git a/scanpy/neighbors/__init__.py b/scanpy/neighbors/__init__.py
index 3e6d09c465..4caebbee8a 100644
--- a/scanpy/neighbors/__init__.py
+++ b/scanpy/neighbors/__init__.py
@@ -15,13 +15,11 @@
 from ..tools._utils import _choose_representation, doc_use_rep, doc_n_pcs
 from .. import settings

-
 N_DCS = 15  # default number of diffusion components
 N_PCS = (
     settings.N_PCS
 )  # Backwards compat, constants should be defined in only one place.

-
 _Method = Literal['umap', 'gauss', 'rapids']
 _MetricFn = Callable[[np.ndarray, np.ndarray], float]
 # from sklearn.metrics.pairwise_distances.__doc__:
@@ -126,6 +124,12 @@ def neighbors(
     **distances** : sparse matrix of dtype `float32`.
         Instead of decaying weights, this stores distances for each pair of
         neighbors.
+
+    Notes
+    -----
+    If `method='umap'`, it's highly recommended to install pynndescent (``pip install pynndescent``).
+    Installing `pynndescent` can significantly increase performance,
+    and in later versions it will become a hard dependency.
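+
+    [Editor's note] For orientation, a minimal usage sketch of the `neighbors` API
+    documented above; the dataset and parameter values are illustrative only and
+    are not part of this patch:
+
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> sc.pp.neighbors(adata, n_neighbors=15, method='umap')  # benefits from pynndescent
+    >>> adata.obsp['distances']       # sparse distances to each cell's neighbors
+    >>> adata.obsp['connectivities']  # weighted adjacency used by downstream tools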
""" start = logg.info('computing neighbors') adata = adata.copy() if copy else adata @@ -765,9 +769,7 @@ def compute_neighbors( self.knn = knn X = _choose_representation(self._adata, use_rep=use_rep, n_pcs=n_pcs) # neighbor search - use_dense_distances = ( - metric == 'euclidean' and X.shape[0] < 8192 - ) or knn == False + use_dense_distances = (metric == 'euclidean' and X.shape[0] < 8192) or not knn if use_dense_distances: _distances = pairwise_distances(X, metric=metric, **metric_kwds) knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( @@ -793,7 +795,7 @@ def compute_neighbors( try: if forest: self._rp_forest = _make_forest_dict(forest) - except: + except Exception: # TODO catch the correct exception pass # write indices as attributes if write_knn_indices: @@ -858,7 +860,7 @@ def _compute_connectivities_diffmap(self, density_normalize=True): # make the weight matrix sparse if not self.knn: mask = W > 1e-14 - W[mask == False] = 0 + W[~mask] = 0 else: # restrict number of neighbors to ~k # build a symmetric mask @@ -870,7 +872,7 @@ def _compute_connectivities_diffmap(self, density_normalize=True): W[j, i] = W[i, j] mask[j, i] = True # set all entries that are not nearest neighbors to zero - W[mask == False] = 0 + W[~mask] = 0 else: W = ( Dsq.copy() @@ -1021,14 +1023,14 @@ def _get_dpt_row(self, i): mask = self._connected_components[1] == label row = sum( ( - self.eigen_values[l] - / (1 - self.eigen_values[l]) - * (self.eigen_basis[i, l] - self.eigen_basis[:, l]) + self.eigen_values[j] + / (1 - self.eigen_values[j]) + * (self.eigen_basis[i, j] - self.eigen_basis[:, j]) ) ** 2 # account for float32 precision - for l in range(0, self.eigen_values.size) - if self.eigen_values[l] < 0.9994 + for j in range(0, self.eigen_values.size) + if self.eigen_values[j] < 0.9994 ) # thanks to Marius Lange for pointing Alex to this: # we will likely remove the contributions from the stationary state below when making @@ -1036,9 +1038,9 @@ def _get_dpt_row(self, i): # they never seem to have deteriorated results, but also other distance measures (see e.g. # PAGA paper) don't have it, which makes sense row += sum( - (self.eigen_basis[i, l] - self.eigen_basis[:, l]) ** 2 - for l in range(0, self.eigen_values.size) - if self.eigen_values[l] >= 0.9994 + (self.eigen_basis[i, k] - self.eigen_basis[:, k]) ** 2 + for k in range(0, self.eigen_values.size) + if self.eigen_values[k] >= 0.9994 ) if mask is not None: row[~mask] = np.inf diff --git a/scanpy/plotting/__init__.py b/scanpy/plotting/__init__.py index 4ce775e44f..ef3aaecede 100644 --- a/scanpy/plotting/__init__.py +++ b/scanpy/plotting/__init__.py @@ -77,7 +77,7 @@ Classes ------- -These classes allow fine tuning of visual parameters. +These classes allow fine tuning of visual parameters. .. autosummary:: :toctree: . 
diff --git a/scanpy/plotting/_anndata.py b/scanpy/plotting/_anndata.py
index 9e56069759..0ac8ba0e9b 100755
--- a/scanpy/plotting/_anndata.py
+++ b/scanpy/plotting/_anndata.py
@@ -552,7 +552,7 @@ def ranking(
         n_rows, n_cols = 1, n_panels
     else:
         n_rows, n_cols = 2, int(n_panels / 2 + 0.5)
-    fig = pl.figure(
+    _ = pl.figure(
         figsize=(
             n_cols * rcParams['figure.figsize'][0],
             n_rows * rcParams['figure.figsize'][1],
@@ -1474,7 +1474,7 @@ def tracksplot(
         ymin, ymax = ax.get_ylim()
         ymax = int(ymax)
         ax.set_yticks([ymax])
-        tt = ax.set_yticklabels([str(ymax)], ha='left', va='top')
+        ax.set_yticklabels([str(ymax)], ha='left', va='top')
         ax.spines['right'].set_position(('axes', 1.01))
         ax.tick_params(
             axis='y',
@@ -1972,7 +1972,7 @@ def _plot_gene_groups_brackets(
                     va='bottom',
                     rotation=rotation,
                 )
-        except:
+        except Exception:  # TODO catch the correct exception
             pass
     else:
         top = left
diff --git a/scanpy/plotting/_baseplot_class.py b/scanpy/plotting/_baseplot_class.py
index 38a1a76da1..f8edc819dd 100644
--- a/scanpy/plotting/_baseplot_class.py
+++ b/scanpy/plotting/_baseplot_class.py
@@ -125,9 +125,9 @@ def __init__(
                     "the `order` parameter match the categories that "
                     "want to be reordered.\n\n"
                     "Mismatch: "
-                    f"{set(obs_tidy.index.categories).difference(categories_order)}\n\n"
+                    f"{set(self.obs_tidy.index.categories).difference(categories_order)}\n\n"
                     f"Given order categories: {categories_order}\n\n"
-                    f"{groupby} categories: {list(obs_tidy.index.categories)}\n"
+                    f"{groupby} categories: {list(self.obs_tidy.index.categories)}\n"
                 )
                 return
diff --git a/scanpy/plotting/_matrixplot.py b/scanpy/plotting/_matrixplot.py
index 6a225a440c..d95678ba06 100644
--- a/scanpy/plotting/_matrixplot.py
+++ b/scanpy/plotting/_matrixplot.py
@@ -219,7 +219,7 @@ def _mainplot(self, ax):
             linewidth=self.edge_lw,
             norm=normalize,
         )
-        __ = ax.pcolor(_color_df, **kwds)
+        _ = ax.pcolor(_color_df, **kwds)

         y_labels = _color_df.index
         x_labels = _color_df.columns
diff --git a/scanpy/plotting/_tools/__init__.py b/scanpy/plotting/_tools/__init__.py
index 7ff590a33a..c0d0723df7 100644
--- a/scanpy/plotting/_tools/__init__.py
+++ b/scanpy/plotting/_tools/__init__.py
@@ -55,7 +55,7 @@ def pca_overview(adata: AnnData, **params):
     show = params['show'] if 'show' in params else None
     if 'show' in params:
         del params['show']
-    scatterplots.pca(adata, **params, show=False)
+    pca(adata, **params, show=False)
     pca_loadings(adata, show=False)
     pca_variance_ratio(adata, show=show)
@@ -358,7 +358,7 @@ def _fig_show_save_or_axes(plot_obj, return_fig, show, save):
         plot_obj.make_figure()
         savefig_or_show(plot_obj.DEFAULT_SAVE_PREFIX, show=show, save=save)
         show = settings.autoshow if show is None else show
-        if not show:
+        if show is False:
             return plot_obj.get_axes()
@@ -960,7 +960,7 @@ def rank_genes_groups_violin(
         )
         savefig_or_show(writekey, show=show, save=save)
         axs.append(_ax)
-    if show == False:
+    if show is False:
         return axs
diff --git a/scanpy/plotting/_tools/paga.py b/scanpy/plotting/_tools/paga.py
index f5a0cfffe8..7eda05fe8f 100644
--- a/scanpy/plotting/_tools/paga.py
+++ b/scanpy/plotting/_tools/paga.py
@@ -147,7 +147,7 @@ def paga_compare(
     if suptitle is not None:
         pl.suptitle(suptitle)
     _utils.savefig_or_show('paga_compare', show=show, save=save)
-    if show == False:
+    if show is False:
         return axs
@@ -168,7 +168,7 @@ def _compute_pos(
     if layout == 'fa':
         try:
             from fa2 import ForceAtlas2
-        except:
+        except ImportError:
             logg.warning(
                 "Package 'fa2' is not installed, falling back to layout 'fr'."
                 'To use the faster and better ForceAtlas2 layout, '
@@ -578,7 +578,7 @@ def is_flat(x):
             else:
                 ax_cb = cax[icolor]

-            cb = pl.colorbar(
+            _ = pl.colorbar(
                 sct,
                 format=ticker.FuncFormatter(_utils.ticks_formatter),
                 cax=ax_cb,
diff --git a/scanpy/plotting/_tools/scatterplots.py b/scanpy/plotting/_tools/scatterplots.py
index 8772a6fb82..52d290d150 100644
--- a/scanpy/plotting/_tools/scatterplots.py
+++ b/scanpy/plotting/_tools/scatterplots.py
@@ -220,7 +220,6 @@ def embedding(
     else:
         size = 120000 / adata.shape[0]

-    ###
     # make the plots
     axs = []
     import itertools
@@ -252,7 +251,7 @@ def embedding(
             na_color=na_color,
         )

-        ### Order points
+        # Order points
         order = slice(None)
         if sort_order is True and value_to_plot is not None and categorical is False:
             # Higher values plotted on top, null values on bottom
@@ -470,7 +469,6 @@ def _get_vmin_vmax(
     index: int,
     color_vector: Sequence[float],
 ) -> Tuple[Union[float, None], Union[float, None]]:
-
     """
     Evaluates the value of vmin and vmax, which could be a
     str in which case is interpreted as a percentile and should
@@ -730,7 +728,7 @@ def pca(
         )

     else:
-        if 'pca' not in adata.obsm.keys() and f"X_pca" not in adata.obsm.keys():
+        if 'pca' not in adata.obsm.keys() and 'X_pca' not in adata.obsm.keys():
             raise KeyError(
                 f"Could not find entry in `obsm` for 'pca'.\n"
                 f"Available keys are: {list(adata.obsm.keys())}."
@@ -977,7 +975,7 @@ def _get_data_points(
             data_points = []
             for comp in components_list:
                 data_points.append(adata.obsm[basis_key][:, comp])
-        except:
+        except Exception:  # TODO catch the correct exception
             raise ValueError(
                 "Given components: '{}' are not valid. Please check. "
                 "A valid example is `components='2,3'`"
diff --git a/scanpy/plotting/_utils.py b/scanpy/plotting/_utils.py
index f6cb8886ba..29f55d28d3 100644
--- a/scanpy/plotting/_utils.py
+++ b/scanpy/plotting/_utils.py
@@ -126,7 +126,7 @@ def timeseries_subplot(
     else:
         levels, _ = np.unique(color, return_inverse=True)
         colors = np.array(palette[: len(levels)].by_key()['color'])
-        subsets = [(x_range[color == l], X[color == l, :]) for l in levels]
+        subsets = [(x_range[color == level], X[color == level, :]) for level in levels]

     if ax is None:
         ax = pl.subplot()
@@ -619,7 +619,9 @@ def setup_axes(
     figure_width = width_without_offsets + left_offset + right_offset
     draw_region_width_frac = draw_region_width / figure_width
     left_offset_frac = left_offset / figure_width
-    right_offset_frac = 1 - (len(panels) - 1) * left_offset_frac
+    right_offset_frac = (  # noqa: F841  # TODO Does this need fixing?
+        1 - (len(panels) - 1) * left_offset_frac
+    )

     if ax is None:
         pl.figure(
@@ -707,9 +709,7 @@ def scatter_base(
     )
     for icolor, color in enumerate(colors):
         ax = axs[icolor]
-        left = panel_pos[2][2 * icolor]
         bottom = panel_pos[0][0]
-        width = draw_region_width / figure_width
         height = panel_pos[1][0] - bottom
         Y_sort = Y
         if not is_color_like(color) and sort_order:
@@ -742,7 +742,7 @@ def scatter_base(
             rectangle = [left, bottom, width, height]
             fig = pl.gcf()
             ax_cb = fig.add_axes(rectangle)
-            cb = pl.colorbar(
+            _ = pl.colorbar(
                 sct, format=ticker.FuncFormatter(ticks_formatter), cax=ax_cb
             )
         # set the title
@@ -939,8 +939,8 @@ def make_pos(pos, node=root, currentLevel=0, parent=None, vert_loc=0):
     if levels is None:
         levels = make_levels({})
     else:
-        levels = {l: {TOTAL: levels[l], CURRENT: 0} for l in levels}
-    vert_gap = height / (max([l for l in levels]) + 1)
+        levels = {k: {TOTAL: v, CURRENT: 0} for k, v in levels.items()}
+    vert_gap = height / (max(levels.keys()) + 1)
     return make_pos({})
diff --git a/scanpy/plotting/palettes.py b/scanpy/plotting/palettes.py
index 7086549a4c..3ea963de99 100644
--- a/scanpy/plotting/palettes.py
+++ b/scanpy/plotting/palettes.py
@@ -1,5 +1,6 @@
 """Color palettes in addition to matplotlib's palettes."""

+from typing import Mapping, Sequence
 from matplotlib import cm, colors

 # Colorblindness adjusted vega_10
@@ -180,9 +181,6 @@
 default_102 = godsnot_102

-from typing import Mapping, Sequence
-
-
 def _plot_color_cycle(clists: Mapping[str, Sequence[str]]):
     import numpy as np
     import matplotlib.pyplot as plt
diff --git a/scanpy/preprocessing/_deprecated/highly_variable_genes.py b/scanpy/preprocessing/_deprecated/highly_variable_genes.py
index 570a5f9f25..88454f08f8 100644
--- a/scanpy/preprocessing/_deprecated/highly_variable_genes.py
+++ b/scanpy/preprocessing/_deprecated/highly_variable_genes.py
@@ -170,7 +170,8 @@ def filter_genes_dispersion(
         disp_mean_bin[one_gene_per_bin] = 0
         # actually do the normalization
         df['dispersion_norm'] = (
-            df['dispersion'].values  # use values here as index differs
+            # use values here as index differs
+            df['dispersion'].values
             - disp_mean_bin[df['mean_bin'].values].values
         ) / disp_std_bin[df['mean_bin'].values].values
     elif flavor == 'cell_ranger':
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index dec4449534..17a36d6520 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -57,6 +57,7 @@ def _highly_variable_genes_seurat_v3(
     )

     X = adata.layers[layer] if layer is not None else adata.X
+
     if check_nonnegative_integers(X) is False:
         raise ValueError(
             "`pp.highly_variable_genes` with `flavor='seurat_v3'` expects "
@@ -194,7 +195,6 @@ def _highly_variable_genes_single_batch(
     A DataFrame that contains the columns
     `highly_variable`, `means`, `dispersions`, and `dispersions_norm`.
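[Editor's note] The reflowed `dispersion_norm` expression in the
`filter_genes_dispersion` hunk above is, in informal notation, a per-bin z-score;
this paraphrase is mine, not text from the patch:

    \hat{d}_g = \frac{d_g - \mu_{b(g)}}{\sigma_{b(g)}}

i.e. each gene's dispersion d_g is centered and scaled by the mean and standard
deviation of the dispersions of all genes falling in the same mean-expression bin b(g).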
""" - X = adata.layers[layer] if layer is not None else adata.X if flavor == 'seurat': if 'log1p' in adata.uns_keys() and adata.uns['log1p']['base'] is not None: @@ -265,7 +265,7 @@ def _highly_variable_genes_single_batch( ::-1 ].sort() # interestingly, np.argpartition is slightly slower if n_top_genes > adata.n_vars: - logg.info(f'`n_top_genes` > `adata.n_var`, returning all genes.') + logg.info('`n_top_genes` > `adata.n_var`, returning all genes.') n_top_genes = adata.n_vars disp_cut_off = dispersion_norm[n_top_genes - 1] gene_subset = np.nan_to_num(df['dispersions_norm'].values) >= disp_cut_off diff --git a/scanpy/preprocessing/_pca.py b/scanpy/preprocessing/_pca.py index 9ab94cfae2..fc4ec679cc 100644 --- a/scanpy/preprocessing/_pca.py +++ b/scanpy/preprocessing/_pca.py @@ -113,7 +113,7 @@ def pca( Explained variance, equivalent to the eigenvalues of the covariance matrix. """ - logg_start = logg.info(f'computing PCA') + logg_start = logg.info('computing PCA') # chunked calculation is not randomized, anyways if svd_solver in {'auto', 'randomized'} and not chunked: diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index b10f0bfc84..866e8e3183 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -551,7 +551,7 @@ def normalize_per_cell( # proceed with data matrix X = data.copy() if copy else data if counts_per_cell is None: - if copy == False: + if not copy: raise ValueError('Can only be run with copy=True') cell_subset, counts_per_cell = filter_cells(X, min_counts=min_counts) X = X[cell_subset] @@ -752,7 +752,11 @@ def scale( annotated with `'mean'` and `'std'` in `adata.var`. """ _check_array_function_arguments(layer=layer, obsm=obsm) - return scale_array(data, zero_center=zero_center, max_value=max_value, copy=copy) + if layer is not None: + raise ValueError(f"`layer` argument inappropriate for value of type {type(X)}") + if obsm is not None: + raise ValueError(f"`obsm` argument inappropriate for value of type {type(X)}") + return scale_array(X, zero_center=zero_center, max_value=max_value, copy=copy) @scale.register(np.ndarray) diff --git a/scanpy/readwrite.py b/scanpy/readwrite.py index 96c9730f67..87d2055fc4 100644 --- a/scanpy/readwrite.py +++ b/scanpy/readwrite.py @@ -815,9 +815,9 @@ def _read_softgz(filename: Union[str, bytes, Path, BinaryIO]) -> AnnData: # Next line is the column headers (sample id's) sample_names = file.readline().strip().split("\t") # The column indices that contain gene expression data - I = [i for i, x in enumerate(sample_names) if x.startswith("GSM")] + indices = [i for i, x in enumerate(sample_names) if x.startswith("GSM")] # Restrict the column headers to those that we keep - sample_names = [sample_names[i] for i in I] + sample_names = [sample_names[i] for i in indices] # Get a list of sample labels groups = [samples_info[k] for k in sample_names] # Read the gene expression data as a list of lists, also get the gene @@ -831,7 +831,7 @@ def _read_softgz(filename: Union[str, bytes, Path, BinaryIO]) -> AnnData: V = line.split("\t") # Extract the values that correspond to gene expression measures # and convert the strings to numbers - x = [float(V[i]) for i in I] + x = [float(V[i]) for i in indices] X.append(x) gene_names.append(V[1]) # Convert the Python list of lists to a Numpy array and transpose to match @@ -914,7 +914,7 @@ def get_used_files(): filenames.append(nt.path) # This catches a race condition where a process ends # before we can examine its files - except psutil.NoSuchProcess as err: + 
except psutil.NoSuchProcess: pass return set(filenames) diff --git a/scanpy/tests/conftest.py b/scanpy/tests/conftest.py index a948488373..667354c591 100644 --- a/scanpy/tests/conftest.py +++ b/scanpy/tests/conftest.py @@ -10,6 +10,7 @@ import scanpy + scanpy.settings.verbosity = "hint" # define this after importing scanpy but before running tests diff --git a/scanpy/tests/external/test_scrublet.py b/scanpy/tests/external/test_scrublet.py index 83e15d9ca4..e09a3702c0 100644 --- a/scanpy/tests/external/test_scrublet.py +++ b/scanpy/tests/external/test_scrublet.py @@ -16,8 +16,6 @@ def test_scrublet(): adata = sc.datasets.pbmc3k() sce.pp.scrublet(adata, use_approx_neighbors=False) - errors = [] - # replace assertions by conditions assert "predicted_doublet" in adata.obs.columns assert "doublet_score" in adata.obs.columns @@ -36,8 +34,6 @@ def test_scrublet_dense(): adata = sc.datasets.paul15()[:500].copy() sce.pp.scrublet(adata, use_approx_neighbors=False) - errors = [] - # replace assertions by conditions assert "predicted_doublet" in adata.obs.columns assert "doublet_score" in adata.obs.columns @@ -102,7 +98,7 @@ def test_scrublet_simulate_doublets(): sc.pp.normalize_total(adata_obs) logged = sc.pp.log1p(adata_obs, copy=True) - hvg = sc.pp.highly_variable_genes(logged) + _ = sc.pp.highly_variable_genes(logged) adata_obs = adata_obs[:, logged.var['highly_variable']] adata_sim = sce.pp.scrublet_simulate_doublets(adata_obs, layer='raw') diff --git a/scanpy/tests/test_embedding_plots.py b/scanpy/tests/test_embedding_plots.py index 841ff63a8c..49746e45c3 100644 --- a/scanpy/tests/test_embedding_plots.py +++ b/scanpy/tests/test_embedding_plots.py @@ -200,7 +200,7 @@ def test_enumerated_palettes(fixture_request, adata, tmpdir, plotfunc): check_images(dict_pth, list_pth, tol=15) -## Spatial specific +# Spatial specific def test_visium_circles(image_comparer): # standard visium data diff --git a/scanpy/tests/test_logging.py b/scanpy/tests/test_logging.py index a954d85421..0da37040bc 100644 --- a/scanpy/tests/test_logging.py +++ b/scanpy/tests/test_logging.py @@ -5,7 +5,7 @@ import pytest -from scanpy import Verbosity, settings as s, logging as l +from scanpy import Verbosity, settings as s, logging as log import scanpy as sc @@ -24,29 +24,29 @@ def test_defaults(): def test_formats(capsys, logging_state): s.logfile = sys.stderr s.verbosity = Verbosity.debug - l.error('0') + log.error('0') assert capsys.readouterr().err == 'ERROR: 0\n' - l.warning('1') + log.warning('1') assert capsys.readouterr().err == 'WARNING: 1\n' - l.info('2') + log.info('2') assert capsys.readouterr().err == '2\n' - l.hint('3') + log.hint('3') assert capsys.readouterr().err == '--> 3\n' - l.debug('4') + log.debug('4') assert capsys.readouterr().err == ' 4\n' def test_deep(capsys, logging_state): s.logfile = sys.stderr s.verbosity = Verbosity.hint - l.hint('0') + log.hint('0') assert capsys.readouterr().err == '--> 0\n' - l.hint('1', deep='1!') + log.hint('1', deep='1!') assert capsys.readouterr().err == '--> 1\n' s.verbosity = Verbosity.debug - l.hint('2') + log.hint('2') assert capsys.readouterr().err == '--> 2\n' - l.hint('3', deep='3!') + log.hint('3', deep='3!') assert capsys.readouterr().err == '--> 3: 3!\n' @@ -57,15 +57,15 @@ def test_logfile(tmp_path, logging_state): s.logfile = io assert s.logfile is io assert s.logpath is None - l.error('test!') + log.error('test!') assert io.getvalue() == 'ERROR: test!\n' p = tmp_path / 'test.log' s.logpath = p assert s.logpath == p assert s.logfile.name == str(p) - l.hint('test2') 
-    l.debug('invisible')
+    log.hint('test2')
+    log.debug('invisible')
     assert s.logpath.read_text() == '--> test2\n'
@@ -80,18 +80,18 @@ def test_timing(monkeypatch, capsys, logging_state):
         def now(tz):
             counter += 1
             return datetime(2000, 1, 1, second=counter, microsecond=counter, tzinfo=tz)

-    monkeypatch.setattr(l, 'datetime', IncTime)
+    monkeypatch.setattr(log, 'datetime', IncTime)
     s.verbosity = Verbosity.debug

-    l.hint('1')
+    log.hint('1')
     assert counter == 1 and capsys.readouterr().err == '--> 1\n'
-    start = l.info('2')
+    start = log.info('2')
     assert counter == 2 and capsys.readouterr().err == '2\n'
-    l.hint('3')
+    log.hint('3')
     assert counter == 3 and capsys.readouterr().err == '--> 3\n'
-    l.info('4', time=start)
+    log.info('4', time=start)
     assert counter == 4 and capsys.readouterr().err == '4 (0:00:02)\n'
-    l.info('5 {time_passed}', time=start)
+    log.info('5 {time_passed}', time=start)
     assert counter == 5 and capsys.readouterr().err == '5 0:00:03\n'
diff --git a/scanpy/tests/test_pca.py b/scanpy/tests/test_pca.py
index 36b6fab362..7d7837a9a0 100644
--- a/scanpy/tests/test_pca.py
+++ b/scanpy/tests/test_pca.py
@@ -1,8 +1,6 @@
 import pytest
 import numpy as np
 from anndata import AnnData
-from scipy.sparse import csr_matrix
-from scipy import sparse

 import scanpy as sc
 from scanpy.tests.fixtures import array_type, float_dtype
diff --git a/scanpy/tests/test_plotting.py b/scanpy/tests/test_plotting.py
index 366402e23c..4773817fe1 100644
--- a/scanpy/tests/test_plotting.py
+++ b/scanpy/tests/test_plotting.py
@@ -435,7 +435,7 @@ def test_multiple_plots(image_comparer):
     fig, (ax1, ax2, ax3) = plt.subplots(
         1, 3, figsize=(20, 5), gridspec_kw={'wspace': 0.7}
     )
-    __ = sc.pl.stacked_violin(
+    _ = sc.pl.stacked_violin(
         adata,
         markers,
         groupby='bulk_labels',
         ax=ax1,
         title='stacked_violin',
         dendrogram=True,
         show=False,
     )
-    __ = sc.pl.dotplot(
+    _ = sc.pl.dotplot(
         adata,
         markers,
         groupby='bulk_labels',
         ax=ax2,
         title='dotplot',
         dendrogram=True,
         show=False,
     )
-    __ = sc.pl.matrixplot(
+    _ = sc.pl.matrixplot(
         adata,
         markers,
         groupby='bulk_labels',
diff --git a/scanpy/tests/test_preprocessing_distributed.py b/scanpy/tests/test_preprocessing_distributed.py
index b7d91a063e..871656c1ce 100644
--- a/scanpy/tests/test_preprocessing_distributed.py
+++ b/scanpy/tests/test_preprocessing_distributed.py
@@ -5,9 +5,9 @@
 import numpy.testing as npt
 import pytest

-from scanpy.preprocessing import *
-from scanpy.preprocessing._simple import materialize_as_ndarray
-
+from scanpy.preprocessing import normalize_total, filter_genes
+from scanpy.preprocessing import log1p, normalize_per_cell, filter_cells
+from scanpy.preprocessing._distributed import materialize_as_ndarray

 HERE = Path(__file__).parent / Path('_data/')
 input_file = str(Path(HERE, "10x-10k-subset.zarr"))
diff --git a/scanpy/tests/test_scaling.py b/scanpy/tests/test_scaling.py
index 9a0bf835b2..4eae8bae94 100644
--- a/scanpy/tests/test_scaling.py
+++ b/scanpy/tests/test_scaling.py
@@ -26,7 +26,7 @@
 @pytest.mark.parametrize('typ', [np.array, csr_matrix], ids=lambda x: x.__name__)
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
 def test_scale(typ, dtype):
-    ## test AnnData arguments
+    # test AnnData arguments
     # test scaling with default zero_center == True
     adata0 = AnnData(typ(X), dtype=dtype)
     sc.pp.scale(adata0)
     adata2 = AnnData(typ(X), dtype=dtype)
     sc.pp.scale(adata2, zero_center=False)
     assert np.allclose(csr_matrix(adata2.X).toarray(), X_scaled)
-    ## test bare count arguments, for simplicity only with explicit copy=True
+    # test bare count arguments, for simplicity only with explicit copy=True
     # test scaling with default zero_center == True
     data0 = typ(X, dtype=dtype)
     cdata0 = sc.pp.scale(data0, copy=True)
diff --git a/scanpy/tools/_diffmap.py b/scanpy/tools/_diffmap.py
index 9fda35e2a7..e5b5b8b8f4 100644
--- a/scanpy/tools/_diffmap.py
+++ b/scanpy/tools/_diffmap.py
@@ -52,6 +52,13 @@ def diffmap(
     `diffmap_evals` : :class:`numpy.ndarray` (`adata.uns`)
         Array of size (number of eigen vectors).
         Eigenvalues of transition matrix.
+
+    Notes
+    -----
+    The 0-th column in `adata.obsm["X_diffmap"]` is the steady-state solution,
+    which is non-informative in diffusion maps.
+    Therefore, the first diffusion component is at index 1,
+    e.g. `adata.obsm["X_diffmap"][:,1]`
     """
     if neighbors_key is None:
         neighbors_key = 'neighbors'
diff --git a/scanpy/tools/_dpt.py b/scanpy/tools/_dpt.py
index f2060eeb9b..716987f726 100644
--- a/scanpy/tools/_dpt.py
+++ b/scanpy/tools/_dpt.py
@@ -342,7 +342,9 @@ def check_adjacency(self):
         for n_edges in range(1, np.max(n_edges_per_seg) + 1):
             for iseg in range(self.segs_adjacency.shape[0]):
                 if n_edges_per_seg[iseg] == n_edges:
-                    neighbor_segs = self.segs_adjacency[iseg].todense().A1
+                    neighbor_segs = (  # noqa: F841 TODO Evaluate whether to assign the variable or not
+                        self.segs_adjacency[iseg].todense().A1
+                    )
                     closest_points_other_segs = [
                         seg[np.argmin(self.distances_dpt[self.segs_tips[iseg][0], seg])]
                         for seg in self.segs
@@ -593,7 +595,9 @@ def detect_branching(
             for iseg, seg_connects in enumerate(ssegs_connects)
             if iseg != trunk
         ]
-        prev_connecting_points = segs_connects[iseg]
+        prev_connecting_points = segs_connects[  # noqa: F841 TODO Evaluate whether to assign the variable or not
+            iseg
+        ]
         for jseg_cnt, jseg in enumerate(prev_connecting_segments):
             iseg_cnt = 0
             for iseg_new, seg_new in enumerate(ssegs):
@@ -904,8 +908,8 @@ def _detect_branching_single_haghverdi16(self, Dseg, tips):
         ps = [
             [0, 1, 2],  # start by computing distances from the first tip
             [1, 2, 0],  # -"- second tip
-            [2, 0, 1],
-        ]  # -"- third tip
+            [2, 0, 1],  # -"- third tip
+        ]
         for i, p in enumerate(ps):
             ssegs.append(self.__detect_branching_haghverdi16(Dseg, tips[p]))
         return ssegs
@@ -984,7 +988,7 @@ def __detect_branching_haghverdi16(
             Dseg[tips[0]][idcs] + Dseg[tips[1]][idcs] + Dseg[tips[2]][idcs]
         )
         # init list to store new segments
-        ssegs = []
+        ssegs = []  # noqa: F841  # TODO Look into this
         # first new segment: all points until, but excluding the branching point
         # increasing the following slightly from imax is a more conservative choice
         # as the criterion based on normalized distances, which follows below,
diff --git a/scanpy/tools/_paga.py b/scanpy/tools/_paga.py
index dea2f6f552..135c0ec6f9 100644
--- a/scanpy/tools/_paga.py
+++ b/scanpy/tools/_paga.py
@@ -1,4 +1,3 @@
-from collections import namedtuple
 from typing import List, Optional, NamedTuple

 import numpy as np
diff --git a/scanpy/tools/_sim.py b/scanpy/tools/_sim.py
index 98e88f7d70..32f7099fa4 100644
--- a/scanpy/tools/_sim.py
+++ b/scanpy/tools/_sim.py
@@ -207,7 +207,6 @@ def sample_dynamic_data(**params):
     step = 5
     grnsim = GRNsim(dim=dim, initType=initType, model=model_key, params=params)

-    curr_nrSamples = 0
     Xsamples = []
     for sample in range(maxNrSamples):
         # choose initial conditions such that branchings result
@@ -410,9 +409,8 @@ def __init__(
         # checks
         if initType not in ['branch', 'random']:
             raise RuntimeError('initType must be either: branch, random')
-        read = False
         if model not in self.availModels.keys():
-            message = 'model not among predefined models \n'
+            message = 'model not among predefined models \n'  # noqa: F841  # TODO FIX
             # read from file
             from .. import sim_models
@@ -772,13 +770,12 @@ def branch_init_model1(self, tmax=100):
         if np.min(X0mean) < 0.025 or np.max(X0mean) > 0.975:
             settings.m(0, '... initial point is too close to bounds')
             return None

-        # if self.show and self.verbosity > 1:
-        pl.figure()
-        pl.plot(XbackUp[:, 0], '.b', XbackUp[:, 1], '.g')
-        pl.plot(XbackDo[:, 0], '.b', XbackDo[:, 1], '.g')
-        pl.plot(Xup[:, 0], 'b', Xup[:, 1], 'g')
-        pl.plot(Xdo[:, 0], 'b', Xdo[:, 1], 'g')
+        pl.figure()  # noqa: F821 TODO Fix me
+        pl.plot(XbackUp[:, 0], '.b', XbackUp[:, 1], '.g')  # noqa: F821 TODO Fix me
+        pl.plot(XbackDo[:, 0], '.b', XbackDo[:, 1], '.g')  # noqa: F821 TODO Fix me
+        pl.plot(Xup[:, 0], 'b', Xup[:, 1], 'g')  # noqa: F821 TODO Fix me
+        pl.plot(Xdo[:, 0], 'b', Xdo[:, 1], 'g')  # noqa: F821 TODO Fix me
         return X0mean

     def parents_from_boolRule(self, rule):
@@ -845,7 +842,7 @@ def build_boolCoeff(self):
             if self.verbosity > 1:
                 settings.m(0, '...' + key)
                 settings.m(0, rule)
-                settings.m(0, rule_pa)
+                settings.m(0, rule_pa)  # noqa: F821
             # now evaluate coefficients
             for tuple in list(
                 itertools.product([False, True], repeat=len(self.pas[key]))
@@ -1078,7 +1075,7 @@ def sim_givenAdj(self, Adj: np.ndarray, model='line'):
             Data array of shape (n_samples,dim).
         """
         # nice examples
-        examples = [
+        examples = [  # noqa: F841 TODO We are really unsure whether this is needed.
             dict(
                 func='sawtooth',
                 gdist='uniform',
@@ -1174,11 +1171,13 @@ def sim_combi(self):
         # AND type / horizontal
         X[:, 2] = func(X[:, 0]) * sp.stats.norm.cdf(X[:, 1], 1, 0.2)

-        pl.scatter(X[:, 0], X[:, 1], c=X[:, 2], edgecolor='face')
-        pl.show()
+        pl.scatter(  # noqa: F821 TODO Fix me
+            X[:, 0], X[:, 1], c=X[:, 2], edgecolor='face'
+        )
+        pl.show()  # noqa: F821 TODO Fix me

-        pl.plot(X[:, 1], X[:, 2], '.')
-        pl.show()
+        pl.plot(X[:, 1], X[:, 2], '.')  # noqa: F821 TODO Fix me
+        pl.show()  # noqa: F821 TODO Fix me
         return X
diff --git a/scanpy/tools/_top_genes.py b/scanpy/tools/_top_genes.py
index 439181b081..28a9390590 100644
--- a/scanpy/tools/_top_genes.py
+++ b/scanpy/tools/_top_genes.py
@@ -68,8 +68,8 @@ def correlation_matrix(

     # TODO: At the moment, only works for int identifiers

-    ### If no genes are passed, selects ranked genes from sample annotation.
-    ### At the moment, only calculate one table (Think about what comes next)
+    # If no genes are passed, selects ranked genes from sample annotation.
+    # At the moment, only calculate one table (Think about what comes next)
     if name_list is None:
         name_list = list()
         for j, k in enumerate(adata.uns['rank_genes_groups_gene_names']):
@@ -128,7 +128,7 @@ def ROC_AUC_analysis(
     Calculate correlation matrix.

     Calculate a correlation matrix for genes stored in sample annotation
-    
+
     Parameters
     ----------
     adata
@@ -159,7 +159,6 @@ def ROC_AUC_analysis(
     groups_order, groups_masks = select_groups(adata, groups, groupby)

     # Use usual convention, better for looping later.
-    imask = group
     mask = groups_masks[group]

     # TODO: Allow for sample weighting requires better mask access... later
@@ -192,11 +191,11 @@ def ROC_AUC_analysis(

 def subsampled_estimates(mask, mask_rest=None, precision=0.01, probability=0.99):
-    ## Simple method that can be called by rank_gene_group. It uses masks that have been passed to the function and
-    ## calculates how much has to be subsampled in order to reach a certain precision with a certain probability
-    ## Then it subsamples for mask, mask rest
-    ## Since convergence speed varies, we take the slower one, i.e. the variance. This might have future speed-up
-    ## potential
+    # Simple method that can be called by rank_gene_group. It uses masks that have been passed to the function and
+    # calculates how much has to be subsampled in order to reach a certain precision with a certain probability
+    # Then it subsamples for mask, mask rest
+    # Since convergence speed varies, we take the slower one, i.e. the variance. This might have future speed-up
+    # potential
     if mask_rest is None:
         mask_rest = ~mask
     # TODO: DO precision calculation for mean variance shared

 def dominated_ROC_elimination(adata, grouby):
-    ## This tool has the purpose to take a set of genes (possibly already pre-selected) and analyze AUC.
-    ## Those and only those are eliminated who are dominated completely
-    ## TODO: Potentially (But not till tomorrow), this can be adapted to only consider the AUC in the given
-    ## TODO: optimization frame
+    # This tool has the purpose to take a set of genes (possibly already pre-selected) and analyze AUC.
+    # Those and only those are eliminated that are dominated completely
+    # TODO: Potentially (But not till tomorrow), this can be adapted to only consider the AUC in the given
+    # TODO: optimization frame
     pass

 def _gene_preselection(adata, mask, thresholds):
-    ## This tool serves to
-    ## It is not thought to be addressed directly but rather using rank_genes_group or ROC analysis or comparable
-    ## TODO: Pass back a truncated adata object with only those genes that fullfill thresholding criterias
-    ## This function should be accessible by both rank_genes_groups and ROC_curve analysis
+    # This tool serves to
+    # It is not thought to be addressed directly but rather using rank_genes_group or ROC analysis or comparable
+    # TODO: Pass back a truncated adata object with only those genes that fulfill thresholding criteria
+    # This function should be accessible by both rank_genes_groups and ROC_curve analysis
     pass

From 2088ed3efb1ee16d5a45e2c10519cc1aedd125e2 Mon Sep 17 00:00:00 2001
From: Jan Lause <34481813+jlause@users.noreply.github.com>
Date: Fri, 19 Mar 2021 07:22:02 +0100
Subject: [PATCH 2/3] bugfix for #1725 (#1732)

* fixes #1725

* removed unneeded mean_fulldataset/var_fulldataset variables

* release note and tests

* making variance dtype explicit in test

* Minor cleanup -- ivirshup

Co-authored-by: Isaac Virshup
---
 docs/release-notes/1.7.2.rst                |  1 +
 .../preprocessing/_highly_variable_genes.py | 42 +++++++++----------
 scanpy/tests/test_highly_variable_genes.py  | 22 ++++++++++
 3 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/docs/release-notes/1.7.2.rst b/docs/release-notes/1.7.2.rst
index 4432dcf9d6..034da2fb61 100644
--- a/docs/release-notes/1.7.2.rst
+++ b/docs/release-notes/1.7.2.rst
@@ -7,6 +7,7 @@

 - :func:`scanpy.logging.print_versions` now works when `python<3.8` :pr:`1691` :smaller:`I Virshup`
 - :func:`scanpy.pp.regress_out` now uses `joblib` as the parallel backend, and should stop oversubscribing threads :pr:`1694` :smaller:`I Virshup`
+- :func:`scanpy.pp.highly_variable_genes` with `flavor="seurat_v3"` now returns correct gene means and variances when used with `batch_key` :pr:`1732` :smaller:`J Lause`
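
[Editor's note] A minimal sketch of the call this release note refers to, mirroring
the test added below in this patch; the dataset, batch split, and `n_top_genes` are
illustrative, and `flavor='seurat_v3'` additionally requires the scikit-misc package:

>>> import numpy as np
>>> import scanpy as sc
>>> pbmc = sc.datasets.pbmc3k()
>>> pbmc.obs['batch'] = np.where(np.arange(pbmc.n_obs) < 1500, '0', '1')
>>> df = sc.pp.highly_variable_genes(
...     pbmc, flavor='seurat_v3', n_top_genes=2000, batch_key='batch', inplace=False
... )
>>> df[['means', 'variances']]  # now computed on the full data, not the last batch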
 .. rubric:: Deprecations

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 17a36d6520..4618f9beca 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -55,7 +55,7 @@ def _highly_variable_genes_seurat_v3(
         raise ImportError(
             'Please install skmisc package via `pip install --user scikit-misc'
         )
-
+    df = pd.DataFrame(index=adata.var_names)
     X = adata.layers[layer] if layer is not None else adata.X

     if check_nonnegative_integers(X) is False:
         raise ValueError(
             "`pp.highly_variable_genes` with `flavor='seurat_v3'` expects "
             "raw count data."
         )

+    df['means'], df['variances'] = _get_mean_var(X)
+
     if batch_key is None:
         batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int))
     else:
@@ -71,13 +73,11 @@ def _highly_variable_genes_seurat_v3(
     norm_gene_vars = []
     for b in np.unique(batch_info):
+        X_batch = X[batch_info == b]

-        ad = adata[batch_info == b]
-        X = ad.layers[layer] if layer is not None else ad.X
-
-        mean, var = _get_mean_var(X)
+        mean, var = _get_mean_var(X_batch)
         not_const = var > 0
-        estimat_var = np.zeros(adata.shape[1], dtype=np.float64)
+        estimat_var = np.zeros(X.shape[1], dtype=np.float64)

         y = np.log10(var[not_const])
         x = np.log10(mean[not_const])
@@ -86,15 +86,18 @@ def _highly_variable_genes_seurat_v3(
         estimat_var[not_const] = model.outputs.fitted_values
         reg_std = np.sqrt(10 ** estimat_var)

-        batch_counts = X.astype(np.float64).copy()
+        batch_counts = X_batch.astype(np.float64).copy()
         # clip large values as in Seurat
-        N = np.sum(batch_info == b)
+        N = X_batch.shape[0]
         vmax = np.sqrt(N)
         clip_val = reg_std * vmax + mean
         if sp_sparse.issparse(batch_counts):
             batch_counts = sp_sparse.csr_matrix(batch_counts)
             mask = batch_counts.data > clip_val[batch_counts.indices]
             batch_counts.data[mask] = clip_val[batch_counts.indices[mask]]
+
+            squared_batch_counts_sum = np.array(batch_counts.power(2).sum(axis=0))
+            batch_counts_sum = np.array(batch_counts.sum(axis=0))
         else:
             clip_val_broad = np.broadcast_to(clip_val, batch_counts.shape)
             np.putmask(
@@ -103,10 +106,6 @@ def _highly_variable_genes_seurat_v3(
                 clip_val_broad,
             )

-        if sp_sparse.issparse(batch_counts):
-            squared_batch_counts_sum = np.array(batch_counts.power(2).sum(axis=0))
-            batch_counts_sum = np.array(batch_counts.sum(axis=0))
-        else:
             squared_batch_counts_sum = np.square(batch_counts).sum(axis=0)
             batch_counts_sum = batch_counts.sum(axis=0)
@@ -130,22 +129,21 @@ def _highly_variable_genes_seurat_v3(
     ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars)
     median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan)

-    df = pd.DataFrame(index=np.array(adata.var_names))
     df['highly_variable_nbatches'] = num_batches_high_var
     df['highly_variable_rank'] = median_ranked
     df['variances_norm'] = np.mean(norm_gene_vars, axis=0)
-    df['means'] = mean
-    df['variances'] = var

-    df.sort_values(
-        ['highly_variable_rank', 'highly_variable_nbatches'],
-        ascending=[True, False],
-        na_position='last',
-        inplace=True,
+    sorted_index = (
+        df[['highly_variable_rank', 'highly_variable_nbatches']]
+        .sort_values(
+            ['highly_variable_rank', 'highly_variable_nbatches'],
+            ascending=[True, False],
+            na_position='last',
+        )
+        .index
     )
     df['highly_variable'] = False
-    df.loc[: int(n_top_genes), 'highly_variable'] = True
-    df = df.loc[adata.var_names]
+    df.loc[sorted_index[: int(n_top_genes)], 'highly_variable'] = True

     if inplace or subset:
         adata.uns['hvg'] = {'flavor': 'seurat_v3'}
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index 8b47d5357e..dfabf267db 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -238,3 +238,25 @@ def test_highly_variable_genes_batches():
     ]

     assert np.all(np.isin(colnames, hvg1.columns))
+
+
+from scanpy.preprocessing._utils import _get_mean_var
+
+
+def test_seurat_v3_mean_var_output_with_batchkey():
+    pbmc = sc.datasets.pbmc3k()
+    pbmc.var_names_make_unique()
+    n_cells = pbmc.shape[0]
+    batch = np.zeros((n_cells), dtype=int)
+    batch[1500:] = 1
+    pbmc.obs["batch"] = batch
+
+    # true_mean, true_var = _get_mean_var(pbmc.X)
+    true_mean = np.mean(pbmc.X.toarray(), axis=0)
+    true_var = np.var(pbmc.X.toarray(), axis=0, dtype=np.float64, ddof=1)
+
+    result_df = sc.pp.highly_variable_genes(
+        pbmc, batch_key='batch', flavor='seurat_v3', n_top_genes=4000, inplace=False
+    )
+
+    np.testing.assert_allclose(true_mean, result_df['means'], rtol=2e-05, atol=2e-05)
+    np.testing.assert_allclose(true_var, result_df['variances'], rtol=2e-05, atol=2e-05)

From 9b0d551df3f45b4f0f724ba92e89fc0e08f7c6da Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 19 Mar 2021 18:29:53 +1100
Subject: [PATCH 3/3] Run pre-commit over everything

---
 .flake8                     |  1 +
 scanpy/external/pp/_scvi.py | 16 ++++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.flake8 b/.flake8
index 63a2344286..555b6aecc2 100644
--- a/.flake8
+++ b/.flake8
@@ -34,4 +34,5 @@ exclude =
     build,
     docs/_build,
     dist,
+    scanpy/api/*,
diff --git a/scanpy/external/pp/_scvi.py b/scanpy/external/pp/_scvi.py
index baf47e684d..f4f75b0607 100644
--- a/scanpy/external/pp/_scvi.py
+++ b/scanpy/external/pp/_scvi.py
@@ -33,12 +33,12 @@ def scvi(
     Fits scVI model onto raw count data given an AnnData object

-    scVI uses stochastic optimization and deep neural networks to aggregate information 
+    scVI uses stochastic optimization and deep neural networks to aggregate information
     across similar cells and genes and to approximate the distributions that underlie
     observed expression values, while accounting for batch effects and limited sensitivity.

     To use a linear-decoded Variational AutoEncoder model (an implementation of [Svensson20]_),
-    set linear_decoded = True. Compared to standard VAE, this model is less powerful, but can 
+    set `linear_decoder = True`. Compared to standard VAE, this model is less powerful, but can
     be used to inspect which genes contribute to variation in the dataset. It may also be used
     for all scVI tasks, like differential expression, batch correction, imputation, etc.
     However, batch correction may be less powerful as it assumes a linear model.
@@ -69,13 +69,13 @@ def scvi(
     train_size
         The train size, either a float between 0 and 1 or an integer for the number of training samples to use
     batch_key
-        Column name in anndata.obs for batches. 
+        Column name in anndata.obs for batches.
         If None, no batch correction is performed
         If not None, batch correction is performed per batch category
     use_highly_variable_genes
         If true, uses only the genes in anndata.var["highly_variable"]
     subset_genes
-        Optional list of indices or gene names to subset anndata. 
+        Optional list of indices or gene names to subset anndata.
         If not None, use_highly_variable_genes is ignored
     linear_decoder
         If true, uses LDVAE model, which is an implementation of [Svensson20]_.
@@ -89,18 +89,18 @@ def scvi(
     trainer_kwargs
         Extra arguments for UnsupervisedTrainer
     model_kwargs
        Extra arguments for VAE or LDVAE model
-    
+
     Returns
     -------
     If `copy` is true, anndata is returned.
     If `return_posterior` is true, the posterior object is returned
-    If both `copy` and `return_posterior` are true, 
-    a tuple of anndata and the posterior are returned in that order. 
+    If both `copy` and `return_posterior` are true,
+    a tuple of anndata and the posterior is returned in that order.

     `adata.obsm['X_scvi']` stores the latent representations
     `adata.obsm['X_scvi_denoised']` stores the normalized mean of the negative binomial
     `adata.obsm['X_scvi_sample_rate']` stores the mean of the negative binomial
-    
+
     If linear_decoder is true:
     `adata.uns['ldvae_loadings']` stores the per-gene weights in the linear decoder as a
     genes by n_latent matrix.
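
[Editor's note] The scvi docstring above reads more easily next to a usage sketch.
This is hypothetical: it assumes the external `scvi` package is installed, uses only
parameters and output fields documented in the hunk above, and leaves all other
arguments at their defaults:

>>> import scanpy as sc
>>> import scanpy.external as sce
>>> adata = sc.datasets.pbmc3k()  # raw counts, as the model expects
>>> sce.pp.scvi(adata, batch_key=None, use_highly_variable_genes=False)
>>> adata.obsm['X_scvi']           # latent representation
>>> adata.obsm['X_scvi_denoised']  # normalized mean of the negative binomial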