From 81daa092e0fbb741d1144df8c93deeadf3355ba9 Mon Sep 17 00:00:00 2001
From: Lukas Heumos
Date: Thu, 18 Mar 2021 10:14:27 +0100
Subject: [PATCH 1/3] Add pre-commit changes

---
 .flake8                                     | 37 +++++++++++
 .pre-commit-config.yaml                     |  9 +++
 docs/dev/code.rst                           |  7 ++-
 scanpy/_utils.py                            |  6 +-
 scanpy/external/pl.py                       | 14 ++---
 scanpy/external/pp/_scrublet.py             | 62 +++++++++----------
 scanpy/external/tl/_trimap.py               |  2 +-
 scanpy/get/get.py                           |  6 +-
 scanpy/neighbors/__init__.py                | 34 +++++-----
 scanpy/plotting/__init__.py                 |  2 +-
 scanpy/plotting/_anndata.py                 |  6 +-
 scanpy/plotting/_baseplot_class.py          |  4 +-
 scanpy/plotting/_matrixplot.py              |  2 +-
 scanpy/plotting/_tools/__init__.py          |  6 +-
 scanpy/plotting/_tools/paga.py              |  6 +-
 scanpy/plotting/_tools/scatterplots.py      |  8 +--
 scanpy/plotting/_utils.py                   | 14 ++---
 scanpy/plotting/palettes.py                 |  4 +-
 .../_deprecated/highly_variable_genes.py    |  3 +-
 .../preprocessing/_highly_variable_genes.py |  4 +-
 scanpy/preprocessing/_pca.py                |  2 +-
 scanpy/preprocessing/_simple.py             |  8 ++-
 scanpy/readwrite.py                         |  8 +--
 scanpy/tests/conftest.py                    |  1 +
 scanpy/tests/external/test_scrublet.py      |  6 +-
 scanpy/tests/test_embedding_plots.py        |  2 +-
 scanpy/tests/test_logging.py                | 38 ++++++------
 scanpy/tests/test_pca.py                    |  2 -
 scanpy/tests/test_plotting.py               |  6 +-
 .../tests/test_preprocessing_distributed.py |  6 +-
 scanpy/tests/test_scaling.py                |  4 +-
 scanpy/tools/_diffmap.py                    |  7 +++
 scanpy/tools/_dpt.py                        | 14 +++--
 scanpy/tools/_paga.py                       |  1 -
 scanpy/tools/_sim.py                        | 29 +++++----
 scanpy/tools/_top_genes.py                  | 33 +++++-----
 36 files changed, 231 insertions(+), 172 deletions(-)
 create mode 100644 .flake8

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000..63a2344286
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,37 @@
+# Can't yet be moved to the pyproject.toml due to https://gitlab.com/pycqa/flake8/-/issues/428#note_251982786
+[flake8]
+max-line-length = 88
+ignore =
+    # module imported but unused -> required for Scanpy's API
+    F401,
+    # line break before a binary operator -> black does not adhere to PEP8
+    W503,
+    # line break occurred after a binary operator -> black does not adhere to PEP8
+    W504,
+    # line too long -> we accept long comment lines; black gets rid of long code lines
+    E501,
+    # whitespace before : -> black does not adhere to PEP8
+    E203,
+    # missing whitespace after ',', ';', or ':' -> black does not adhere to PEP8
+    E231,
+    # module level import not at top of file -> required to circumvent circular imports for Scanpy's API
+    E402,
+    # continuation line over-indented for hanging indent -> black does not adhere to PEP8
+    E126,
+    # inline comment should start with '# ' -> Scanpy allows them for specific explanations
+    E262,
+    # too many leading '#' for block comment -> Scanpy allows them for comments into sections
+    E266,
+    # do not assign a lambda expression, use a def -> Scanpy allows lambda expression assignments
+    E731,
+    # allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation
+    E741
+per-file-ignores =
+    # F811 Redefinition of unused name from line, does not play nice with pytest fixtures
+    tests/test*.py: F811
+exclude =
+    .git,
+    __pycache__,
+    build,
+    docs/_build,
+    dist,
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 93ae9e8fd7..95a96ae781 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,3 +3,12 @@ repos:
   rev: 20.8b1
   hooks:
   - id: black
+- repo: https://gitlab.com/pycqa/flake8
+  rev: 3.8.4
+  hooks:
+  - id: flake8
+- repo: https://github.com/pre-commit/mirrors-autopep8
+  rev: v1.5.5
+  hooks:
+  - id: autopep8
+    args: ["-i"]
diff --git a/docs/dev/code.rst b/docs/dev/code.rst
index 218e73b4ae..4fc0a474f2 100644
--- a/docs/dev/code.rst
+++ b/docs/dev/code.rst
@@ -17,6 +17,11 @@ Code style

 New code should follow
 `Black `__
-and Scanpy’s
+and
+`flake8 `__.
+We ignore a couple of flake8 checks, which are documented in the .flake8 file in the root of this repository.
+To learn how to ignore checks per line, please read
+`flake8 violations `__.
+Additionally, we use Scanpy’s
 `EditorConfig `__,
 so using an editor/IDE with support for both is helpful.
diff --git a/scanpy/_utils.py b/scanpy/_utils.py
index 207f40f0b6..1aac2e925b 100644
--- a/scanpy/_utils.py
+++ b/scanpy/_utils.py
@@ -209,7 +209,7 @@ def get_igraph_from_adjacency(adjacency, directed=None):
     g.add_edges(list(zip(sources, targets)))
     try:
         g.es['weight'] = weights
-    except:
+    except KeyError:
         pass
     if g.vcount() != adjacency.shape[0]:
         logg.warning(
@@ -551,7 +551,9 @@ def warn_with_traceback(message, category, filename, lineno, file=None, line=Non
     import traceback

     traceback.print_stack()
-    log = file if hasattr(file, 'write') else sys.stderr
+    log = (  # noqa: F841  # TODO Does this need fixing?
+        file if hasattr(file, 'write') else sys.stderr
+    )
     settings.write(warnings.formatwarning(message, category, filename, lineno, line))
diff --git a/scanpy/external/pl.py b/scanpy/external/pl.py
index b35310591e..3a59e99b61 100644
--- a/scanpy/external/pl.py
+++ b/scanpy/external/pl.py
@@ -332,15 +332,15 @@ def scrublet_score_distribution(
     figsize: Optional[Tuple[float, float]] = (8, 3),
 ):
     """\
-    Plot histogram of doublet scores for observed transcriptomes and simulated doublets. 
+    Plot histogram of doublet scores for observed transcriptomes and simulated doublets.
+
+    The histogram for simulated doublets is useful for determining the correct doublet
+    score threshold.

-    The histogram for simulated doublets is useful for determining the correct doublet 
-    score threshold. 
-
     Parameters
     ----------
     adata
-        An annData object resulting from func:`~scanpy.external.scrublet`. 
+        An AnnData object resulting from :func:`~scanpy.external.scrublet`.
     scale_hist_obs
         Set y axis scale transformation in matplotlib for the plot of observed
         transcriptomes (e.g. "linear", "log", "symlog", "logit")
     scale_hist_sim
         Set y axis scale transformation in matplotlib for the plot of simulated
         doublets (e.g. "linear", "log", "symlog", "logit")
     figsize
         width, height

     See also
     --------
     :func:`~scanpy.external.pp.scrublet`: Main way of running Scrublet, runs
-        preprocessing, doublet simulation (this function) and calling. 
+        preprocessing, doublet simulation (this function) and calling.
     :func:`~scanpy.external.pp.scrublet_simulate_doublets`: Run Scrublet's doublet
-        simulation separately for advanced usage. 
+        simulation separately for advanced usage.
     """

     threshold = adata.uns['scrublet']['threshold']
diff --git a/scanpy/external/pp/_scrublet.py b/scanpy/external/pp/_scrublet.py
index 374f5b4fe9..6611fccac5 100644
--- a/scanpy/external/pp/_scrublet.py
+++ b/scanpy/external/pp/_scrublet.py
@@ -1,5 +1,5 @@
 from anndata import AnnData
-from typing import Collection, Tuple, Optional, Union
+from typing import Optional

 import numpy as np
 from scipy import sparse
@@ -40,7 +40,7 @@ def scrublet(
     and directly call functions of Scrublet(). You may also undertake your own
     preprocessing, simulate doublets with
     scanpy.external.pp.scrublet_simulate_doublets(), and run the core scrublet
-    function scanpy.external.pp.scrublet.scrublet(). 
+    function scanpy.external.pp.scrublet.scrublet().

     .. note::
         More information and bug reports `here `__.

     Parameters
     ----------
     adata
         The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows
         correspond to cells and columns to genes.
     adata_sim
         (Advanced use case) Optional AnnData object generated by
         sc.external.pp.scrublet_simulate_doublets(), with same number of vars
         as adata.
         This should have been built from adata_obs after filtering genes and
         cells and selecting highly-variable genes.
     sim_doublet_ratio
-        Number of doublets to simulate relative to the number of observed 
+        Number of doublets to simulate relative to the number of observed
         transcriptomes.
     expected_doublet_rate
         Where adata_sim is not supplied, the estimated doublet rate for the
         experiment.
     synthetic_doublet_umi_subsampling
         Rate for sampling UMIs when creating
         synthetic doublets. If 1.0, each doublet is created by simply adding
         the UMI counts from two randomly sampled observed transcriptomes. For
         values less than 1, the UMI counts are added and then randomly sampled
-        at the specified rate. 
-    knn_dist_metric 
+        at the specified rate.
+    knn_dist_metric
         Distance metric used when finding nearest neighbors. For list of
         valid values, see the documentation for annoy (if `use_approx_neighbors`
         is True) or sklearn.neighbors.NearestNeighbors (if `use_approx_neighbors`
         is False).
     log_transform
         Whether to use :func:`~scanpy.pp.log1p` to log-transform the data
         prior to PCA.
     mean_center
         If True, center the data such that each gene has a mean of 0.
         `sklearn.decomposition.PCA` will be used for dimensionality
         reduction.
-    n_prin_comps 
+    n_prin_comps
         Number of principal components used to embed the transcriptomes prior
-        to k-nearest-neighbor graph construction. 
+        to k-nearest-neighbor graph construction.
     use_approx_neighbors
-        Use approximate nearest neighbor method (annoy) for the KNN 
+        Use approximate nearest neighbor method (annoy) for the KNN
         classifier.
     get_doublet_neighbor_parents
         If True, return (in .uns) the parent transcriptomes that generated the
         doublet neighbors of each observed transcriptome. This information can
-        be used to infer the cell states that generated a given doublet state. 
+        be used to infer the cell states that generated a given doublet state.
     n_neighbors
         Number of neighbors used to construct the KNN graph of observed
         transcriptomes and simulated doublets. If ``None``, this is
@@ -133,7 +133,7 @@ def scrublet(
     ``adata.uns['scrublet']['doublet_scores_sim']``
         Doublet scores for each simulated doublet transcriptome

-    ``adata.uns['scrublet']['doublet_parents']`` 
+    ``adata.uns['scrublet']['doublet_parents']``
         Pairs of ``.obs_names`` used to generate each simulated doublet
         transcriptome

     ``uns['scrublet']['parameters']``
         Dictionary of Scrublet parameters

     See also
     --------
     :func:`~scanpy.external.pp.scrublet_simulate_doublets`: Run Scrublet's doublet
-        simulation separately for advanced usage. 
+        simulation separately for advanced usage.
     :func:`~scanpy.external.pl.scrublet_score_distribution`: Plot histogram of doublet
-        scores for observed transcriptomes and simulated doublets. 
+        scores for observed transcriptomes and simulated doublets.
     """
     try:
         import scrublet as sl
@@ -185,7 +185,7 @@ def scrublet(
             pp.highly_variable_genes(adata_obs, subset=True)
         else:
             logged = pp.log1p(adata_obs, copy=True)
-            hvg = pp.highly_variable_genes(logged)
+            _ = pp.highly_variable_genes(logged)
             adata_obs = adata_obs[:, logged.var['highly_variable']]

     # Simulate the doublets based on the raw expressions from the normalised
     # and filtered object.
@@ -257,7 +257,7 @@ def _scrublet_call_doublets(
     transcriptomes and simulated doublets. This is a wrapper around the core
     functions of `Scrublet `__ to allow for flexibility
     in applying Scanpy filtering operations upstream. Unless
-    you know what you're doing you should use the main scrublet() function. 
+    you know what you're doing, you should use the main scrublet() function.

     .. note::
         More information and bug reports `here `__.

     Parameters
     ----------
     mean_center
         If True, center the data such that each gene has a mean of 0.
         `sklearn.decomposition.PCA` will be used for dimensionality
         reduction, unless `mean_center` is True.
     n_prin_comps
         Number of principal components used to embed the transcriptomes prior
-        to k-nearest-neighbor graph construction. 
+        to k-nearest-neighbor graph construction.
     use_approx_neighbors
-        Use approximate nearest neighbor method (annoy) for the KNN 
+        Use approximate nearest neighbor method (annoy) for the KNN
         classifier.
     knn_dist_metric
         Distance metric used when finding nearest neighbors. For list of
         valid values, see the documentation for annoy (if `use_approx_neighbors`
         is True) or sklearn.neighbors.NearestNeighbors (if `use_approx_neighbors`
         is False).
     get_doublet_neighbor_parents
-        If True, return the parent transcriptomes that generated the 
-        doublet neighbors of each observed transcriptome. This information can 
-        be used to infer the cell states that generated a given 
-        doublet state. 
+        If True, return the parent transcriptomes that generated the
+        doublet neighbors of each observed transcriptome. This information can
+        be used to infer the cell states that generated a given
+        doublet state.
     threshold
         Doublet score threshold for calling a transcriptome a doublet. If
         `None`, this is set automatically by looking for the minimum between
         the two modes of the `doublet_scores_sim_` histogram. It is best
         practice to check the threshold visually using the
         `doublet_scores_sim_` histogram and/or based on co-localization of
         predicted doublets in a 2-D embedding.
     random_state
         Initial state for doublet simulation and nearest neighbors.
-    verbose 
+    verbose
         If True, print progress updates.

     Returns
     -------
     adata : anndata.AnnData
         if ``copy=True`` it returns or else adds fields to ``adata``:

     ``adata.obs['doublet_score']``
         Doublet scores for each observed transcriptome

     ``adata.uns['scrublet']['doublet_scores_sim']``
         Doublet scores for each simulated doublet transcriptome

-    ``adata.uns['scrublet']['doublet_parents']`` 
+    ``adata.uns['scrublet']['doublet_parents']``
         Pairs of ``.obs_names`` used to generate each simulated doublet
         transcriptome

     ``uns['scrublet']['parameters']``
         Dictionary of Scrublet parameters
@@ -453,16 +453,16 @@ def scrublet_simulate_doublets(
         The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows
         correspond to cells and columns to genes. Genes should have been
         filtered for expression and variability, and the object should contain
-        raw expression of the same dimensions. 
+        raw expression of the same dimensions.
     layer
-        Layer of adata where raw values are stored, or 'X' if values are in .X. 
+        Layer of adata where raw values are stored, or 'X' if values are in .X.
     sim_doublet_ratio
-        Number of doublets to simulate relative to the number of observed 
+        Number of doublets to simulate relative to the number of observed
         transcriptomes. If `None`, self.sim_doublet_ratio is used.
     synthetic_doublet_umi_subsampling
-        Rate for sampling UMIs when creating synthetic doublets. If 1.0, 
-        each doublet is created by simply adding the UMIs from two randomly 
-        sampled observed transcriptomes. For values less than 1, the 
+        Rate for sampling UMIs when creating synthetic doublets. If 1.0,
+        each doublet is created by simply adding the UMIs from two randomly
+        sampled observed transcriptomes. For values less than 1, the
         UMI counts are added and then randomly sampled at the specified
         rate.

     Returns
     -------
     adata : anndata.AnnData with simulated doublets in .X
         if ``copy=True`` it returns or else adds fields to ``adata``:

-    ``adata.uns['scrublet']['doublet_parents']`` 
+    ``adata.uns['scrublet']['doublet_parents']``
         Pairs of ``.obs_names`` used to generate each simulated doublet
         transcriptome

     ``uns['scrublet']['parameters']``
         Dictionary of Scrublet parameters

     See also
     --------
     :func:`~scanpy.external.pp.scrublet`: Main way of running Scrublet, runs
-        preprocessing, doublet simulation (this function) and calling. 
+        preprocessing, doublet simulation (this function) and calling.
     :func:`~scanpy.external.pl.scrublet_score_distribution`: Plot histogram of doublet
-        scores for observed transcriptomes and simulated doublets. 
+        scores for observed transcriptomes and simulated doublets.
     """
     try:
         import scrublet as sl
diff --git a/scanpy/external/tl/_trimap.py b/scanpy/external/tl/_trimap.py
index 5935d81f5a..5334e29fda 100644
--- a/scanpy/external/tl/_trimap.py
+++ b/scanpy/external/tl/_trimap.py
@@ -76,7 +76,7 @@ def trimap(

     Example
     -------
-    
+
     >>> import scanpy as sc
     >>> import scanpy.external as sce
     >>> pbmc = sc.datasets.pbmc68k_reduced()
diff --git a/scanpy/get/get.py b/scanpy/get/get.py
index 45c081687e..a90b84b70c 100644
--- a/scanpy/get/get.py
+++ b/scanpy/get/get.py
@@ -6,7 +6,7 @@
 from scipy.sparse import spmatrix
 from anndata import AnnData

-import warnings
+from .._compat import Literal

 # --------------------------------------------------------------------------------
 # Plotting data helpers
 # --------------------------------------------------------------------------------
@@ -96,7 +96,7 @@ def rank_genes_groups_df(
 def _check_indices(
     dim_df: pd.DataFrame,
     alt_index: pd.Index,
-    dim: "Literal['obs', 'var']",
+    dim: Literal['obs', 'var'],
     keys: List[str],
     alias_index: Optional[pd.Index] = None,
     use_raw: bool = False,
@@ -176,7 +176,7 @@ def _get_array_values(
     X,
     dim_names: pd.Index,
     keys: List[str],
-    axis: "Literal[0, 1]",
+    axis: Literal[0, 1],
     backed: bool,
 ):
     # TODO: This should be made easier on the anndata side
diff --git a/scanpy/neighbors/__init__.py b/scanpy/neighbors/__init__.py
index 3e6d09c465..4caebbee8a 100644
--- a/scanpy/neighbors/__init__.py
+++ b/scanpy/neighbors/__init__.py
@@ -15,13 +15,11 @@
 from ..tools._utils import _choose_representation, doc_use_rep, doc_n_pcs
 from .. import settings

-
 N_DCS = 15  # default number of diffusion components
 N_PCS = (
     settings.N_PCS
 )  # Backwards compat, constants should be defined in only one place.

-
 _Method = Literal['umap', 'gauss', 'rapids']
 _MetricFn = Callable[[np.ndarray, np.ndarray], float]
 # from sklearn.metrics.pairwise_distances.__doc__:
@@ -126,6 +124,12 @@ def neighbors(
     **distances** : sparse matrix of dtype `float32`.
         Instead of decaying weights, this stores distances for each pair of
         neighbors.
+
+    Notes
+    -----
+    If `method='umap'`, it's highly recommended to install pynndescent (``pip install pynndescent``).
+    Installing `pynndescent` can significantly increase performance,
+    and in later versions it will become a hard dependency.
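+
+    [Editor's note] For orientation, a minimal usage sketch of the `neighbors` API
+    documented above; the dataset and parameter values are illustrative only and
+    are not part of this patch:
+
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> sc.pp.neighbors(adata, n_neighbors=15, method='umap')  # benefits from pynndescent
+    >>> adata.obsp['distances']       # sparse distances to each cell's neighbors
+    >>> adata.obsp['connectivities']  # weighted adjacency used by downstream tools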
""" start = logg.info('computing neighbors') adata = adata.copy() if copy else adata @@ -765,9 +769,7 @@ def compute_neighbors( self.knn = knn X = _choose_representation(self._adata, use_rep=use_rep, n_pcs=n_pcs) # neighbor search - use_dense_distances = ( - metric == 'euclidean' and X.shape[0] < 8192 - ) or knn == False + use_dense_distances = (metric == 'euclidean' and X.shape[0] < 8192) or not knn if use_dense_distances: _distances = pairwise_distances(X, metric=metric, **metric_kwds) knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( @@ -793,7 +795,7 @@ def compute_neighbors( try: if forest: self._rp_forest = _make_forest_dict(forest) - except: + except Exception: # TODO catch the correct exception pass # write indices as attributes if write_knn_indices: @@ -858,7 +860,7 @@ def _compute_connectivities_diffmap(self, density_normalize=True): # make the weight matrix sparse if not self.knn: mask = W > 1e-14 - W[mask == False] = 0 + W[~mask] = 0 else: # restrict number of neighbors to ~k # build a symmetric mask @@ -870,7 +872,7 @@ def _compute_connectivities_diffmap(self, density_normalize=True): W[j, i] = W[i, j] mask[j, i] = True # set all entries that are not nearest neighbors to zero - W[mask == False] = 0 + W[~mask] = 0 else: W = ( Dsq.copy() @@ -1021,14 +1023,14 @@ def _get_dpt_row(self, i): mask = self._connected_components[1] == label row = sum( ( - self.eigen_values[l] - / (1 - self.eigen_values[l]) - * (self.eigen_basis[i, l] - self.eigen_basis[:, l]) + self.eigen_values[j] + / (1 - self.eigen_values[j]) + * (self.eigen_basis[i, j] - self.eigen_basis[:, j]) ) ** 2 # account for float32 precision - for l in range(0, self.eigen_values.size) - if self.eigen_values[l] < 0.9994 + for j in range(0, self.eigen_values.size) + if self.eigen_values[j] < 0.9994 ) # thanks to Marius Lange for pointing Alex to this: # we will likely remove the contributions from the stationary state below when making @@ -1036,9 +1038,9 @@ def _get_dpt_row(self, i): # they never seem to have deteriorated results, but also other distance measures (see e.g. # PAGA paper) don't have it, which makes sense row += sum( - (self.eigen_basis[i, l] - self.eigen_basis[:, l]) ** 2 - for l in range(0, self.eigen_values.size) - if self.eigen_values[l] >= 0.9994 + (self.eigen_basis[i, k] - self.eigen_basis[:, k]) ** 2 + for k in range(0, self.eigen_values.size) + if self.eigen_values[k] >= 0.9994 ) if mask is not None: row[~mask] = np.inf diff --git a/scanpy/plotting/__init__.py b/scanpy/plotting/__init__.py index 4ce775e44f..ef3aaecede 100644 --- a/scanpy/plotting/__init__.py +++ b/scanpy/plotting/__init__.py @@ -77,7 +77,7 @@ Classes ------- -These classes allow fine tuning of visual parameters. +These classes allow fine tuning of visual parameters. .. autosummary:: :toctree: . 
diff --git a/scanpy/plotting/_anndata.py b/scanpy/plotting/_anndata.py
index 9e56069759..0ac8ba0e9b 100755
--- a/scanpy/plotting/_anndata.py
+++ b/scanpy/plotting/_anndata.py
@@ -552,7 +552,7 @@ def ranking(
         n_rows, n_cols = 1, n_panels
     else:
         n_rows, n_cols = 2, int(n_panels / 2 + 0.5)
-    fig = pl.figure(
+    _ = pl.figure(
         figsize=(
             n_cols * rcParams['figure.figsize'][0],
             n_rows * rcParams['figure.figsize'][1],
@@ -1474,7 +1474,7 @@ def tracksplot(
         ymin, ymax = ax.get_ylim()
         ymax = int(ymax)
         ax.set_yticks([ymax])
-        tt = ax.set_yticklabels([str(ymax)], ha='left', va='top')
+        ax.set_yticklabels([str(ymax)], ha='left', va='top')
         ax.spines['right'].set_position(('axes', 1.01))
         ax.tick_params(
             axis='y',
@@ -1972,7 +1972,7 @@ def _plot_gene_groups_brackets(
                     va='bottom',
                     rotation=rotation,
                 )
-        except:
+        except Exception:  # TODO catch the correct exception
             pass
     else:
         top = left
diff --git a/scanpy/plotting/_baseplot_class.py b/scanpy/plotting/_baseplot_class.py
index 38a1a76da1..f8edc819dd 100644
--- a/scanpy/plotting/_baseplot_class.py
+++ b/scanpy/plotting/_baseplot_class.py
@@ -125,9 +125,9 @@ def __init__(
                     "the `order` parameter match the categories that "
                     "want to be reordered.\n\n"
                     "Mismatch: "
-                    f"{set(obs_tidy.index.categories).difference(categories_order)}\n\n"
+                    f"{set(self.obs_tidy.index.categories).difference(categories_order)}\n\n"
                     f"Given order categories: {categories_order}\n\n"
-                    f"{groupby} categories: {list(obs_tidy.index.categories)}\n"
+                    f"{groupby} categories: {list(self.obs_tidy.index.categories)}\n"
                 )
                 return
diff --git a/scanpy/plotting/_matrixplot.py b/scanpy/plotting/_matrixplot.py
index 6a225a440c..d95678ba06 100644
--- a/scanpy/plotting/_matrixplot.py
+++ b/scanpy/plotting/_matrixplot.py
@@ -219,7 +219,7 @@ def _mainplot(self, ax):
             linewidth=self.edge_lw,
             norm=normalize,
         )
-        __ = ax.pcolor(_color_df, **kwds)
+        _ = ax.pcolor(_color_df, **kwds)

         y_labels = _color_df.index
         x_labels = _color_df.columns
diff --git a/scanpy/plotting/_tools/__init__.py b/scanpy/plotting/_tools/__init__.py
index 7ff590a33a..c0d0723df7 100644
--- a/scanpy/plotting/_tools/__init__.py
+++ b/scanpy/plotting/_tools/__init__.py
@@ -55,7 +55,7 @@ def pca_overview(adata: AnnData, **params):
     show = params['show'] if 'show' in params else None
     if 'show' in params:
         del params['show']
-    scatterplots.pca(adata, **params, show=False)
+    pca(adata, **params, show=False)
     pca_loadings(adata, show=False)
     pca_variance_ratio(adata, show=show)
@@ -358,7 +358,7 @@ def _fig_show_save_or_axes(plot_obj, return_fig, show, save):
         plot_obj.make_figure()
         savefig_or_show(plot_obj.DEFAULT_SAVE_PREFIX, show=show, save=save)
         show = settings.autoshow if show is None else show
-        if not show:
+        if show is False:
             return plot_obj.get_axes()
@@ -960,7 +960,7 @@ def rank_genes_groups_violin(
         )
         savefig_or_show(writekey, show=show, save=save)
         axs.append(_ax)
-    if show == False:
+    if show is False:
         return axs
diff --git a/scanpy/plotting/_tools/paga.py b/scanpy/plotting/_tools/paga.py
index f5a0cfffe8..7eda05fe8f 100644
--- a/scanpy/plotting/_tools/paga.py
+++ b/scanpy/plotting/_tools/paga.py
@@ -147,7 +147,7 @@ def paga_compare(
     if suptitle is not None:
         pl.suptitle(suptitle)
     _utils.savefig_or_show('paga_compare', show=show, save=save)
-    if show == False:
+    if show is False:
         return axs
@@ -168,7 +168,7 @@ def _compute_pos(
     if layout == 'fa':
         try:
             from fa2 import ForceAtlas2
-        except:
+        except ImportError:
             logg.warning(
                 "Package 'fa2' is not installed, falling back to layout 'fr'."
                 'To use the faster and better ForceAtlas2 layout, '
@@ -578,7 +578,7 @@ def is_flat(x):
             else:
                 ax_cb = cax[icolor]

-            cb = pl.colorbar(
+            _ = pl.colorbar(
                 sct,
                 format=ticker.FuncFormatter(_utils.ticks_formatter),
                 cax=ax_cb,
diff --git a/scanpy/plotting/_tools/scatterplots.py b/scanpy/plotting/_tools/scatterplots.py
index 8772a6fb82..52d290d150 100644
--- a/scanpy/plotting/_tools/scatterplots.py
+++ b/scanpy/plotting/_tools/scatterplots.py
@@ -220,7 +220,6 @@ def embedding(
     else:
         size = 120000 / adata.shape[0]

-    ###
     # make the plots
     axs = []
     import itertools
@@ -252,7 +251,7 @@ def embedding(
             na_color=na_color,
         )

-        ### Order points
+        # Order points
         order = slice(None)
         if sort_order is True and value_to_plot is not None and categorical is False:
             # Higher values plotted on top, null values on bottom
@@ -470,7 +469,6 @@ def _get_vmin_vmax(
     index: int,
     color_vector: Sequence[float],
 ) -> Tuple[Union[float, None], Union[float, None]]:
-
     """
     Evaluates the value of vmin and vmax, which could be a
     str in which case is interpreted as a percentile and should
@@ -730,7 +728,7 @@ def pca(
         )

     else:
-        if 'pca' not in adata.obsm.keys() and f"X_pca" not in adata.obsm.keys():
+        if 'pca' not in adata.obsm.keys() and 'X_pca' not in adata.obsm.keys():
             raise KeyError(
                 f"Could not find entry in `obsm` for 'pca'.\n"
                 f"Available keys are: {list(adata.obsm.keys())}."
@@ -977,7 +975,7 @@ def _get_data_points(
             data_points = []
             for comp in components_list:
                 data_points.append(adata.obsm[basis_key][:, comp])
-        except:
+        except Exception:  # TODO catch the correct exception
             raise ValueError(
                 "Given components: '{}' are not valid. Please check. "
                 "A valid example is `components='2,3'`"
diff --git a/scanpy/plotting/_utils.py b/scanpy/plotting/_utils.py
index f6cb8886ba..29f55d28d3 100644
--- a/scanpy/plotting/_utils.py
+++ b/scanpy/plotting/_utils.py
@@ -126,7 +126,7 @@ def timeseries_subplot(
     else:
         levels, _ = np.unique(color, return_inverse=True)
         colors = np.array(palette[: len(levels)].by_key()['color'])
-        subsets = [(x_range[color == l], X[color == l, :]) for l in levels]
+        subsets = [(x_range[color == level], X[color == level, :]) for level in levels]

     if ax is None:
         ax = pl.subplot()
@@ -619,7 +619,9 @@ def setup_axes(
     figure_width = width_without_offsets + left_offset + right_offset
     draw_region_width_frac = draw_region_width / figure_width
     left_offset_frac = left_offset / figure_width
-    right_offset_frac = 1 - (len(panels) - 1) * left_offset_frac
+    right_offset_frac = (  # noqa: F841  # TODO Does this need fixing?
+        1 - (len(panels) - 1) * left_offset_frac
+    )

     if ax is None:
         pl.figure(
@@ -707,9 +709,7 @@ def scatter_base(
     )
     for icolor, color in enumerate(colors):
         ax = axs[icolor]
-        left = panel_pos[2][2 * icolor]
         bottom = panel_pos[0][0]
-        width = draw_region_width / figure_width
         height = panel_pos[1][0] - bottom
         Y_sort = Y
         if not is_color_like(color) and sort_order:
@@ -742,7 +742,7 @@ def scatter_base(
             rectangle = [left, bottom, width, height]
             fig = pl.gcf()
             ax_cb = fig.add_axes(rectangle)
-            cb = pl.colorbar(
+            _ = pl.colorbar(
                 sct, format=ticker.FuncFormatter(ticks_formatter), cax=ax_cb
             )
         # set the title
@@ -939,8 +939,8 @@ def make_pos(pos, node=root, currentLevel=0, parent=None, vert_loc=0):
     if levels is None:
         levels = make_levels({})
     else:
-        levels = {l: {TOTAL: levels[l], CURRENT: 0} for l in levels}
-    vert_gap = height / (max([l for l in levels]) + 1)
+        levels = {k: {TOTAL: v, CURRENT: 0} for k, v in levels.items()}
+    vert_gap = height / (max(levels.keys()) + 1)
     return make_pos({})
diff --git a/scanpy/plotting/palettes.py b/scanpy/plotting/palettes.py
index 7086549a4c..3ea963de99 100644
--- a/scanpy/plotting/palettes.py
+++ b/scanpy/plotting/palettes.py
@@ -1,5 +1,6 @@
 """Color palettes in addition to matplotlib's palettes."""

+from typing import Mapping, Sequence
 from matplotlib import cm, colors

 # Colorblindness adjusted vega_10
@@ -180,9 +181,6 @@
 default_102 = godsnot_102

-from typing import Mapping, Sequence
-
-
 def _plot_color_cycle(clists: Mapping[str, Sequence[str]]):
     import numpy as np
     import matplotlib.pyplot as plt
diff --git a/scanpy/preprocessing/_deprecated/highly_variable_genes.py b/scanpy/preprocessing/_deprecated/highly_variable_genes.py
index 570a5f9f25..88454f08f8 100644
--- a/scanpy/preprocessing/_deprecated/highly_variable_genes.py
+++ b/scanpy/preprocessing/_deprecated/highly_variable_genes.py
@@ -170,7 +170,8 @@ def filter_genes_dispersion(
         disp_mean_bin[one_gene_per_bin] = 0
         # actually do the normalization
         df['dispersion_norm'] = (
-            df['dispersion'].values  # use values here as index differs
+            # use values here as index differs
+            df['dispersion'].values
             - disp_mean_bin[df['mean_bin'].values].values
         ) / disp_std_bin[df['mean_bin'].values].values
     elif flavor == 'cell_ranger':
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index dec4449534..17a36d6520 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -57,6 +57,7 @@ def _highly_variable_genes_seurat_v3(
     )

     X = adata.layers[layer] if layer is not None else adata.X
+
     if check_nonnegative_integers(X) is False:
         raise ValueError(
             "`pp.highly_variable_genes` with `flavor='seurat_v3'` expects "
@@ -194,7 +195,6 @@ def _highly_variable_genes_single_batch(
     A DataFrame that contains the columns
     `highly_variable`, `means`, `dispersions`, and `dispersions_norm`.
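[Editor's note] The reflowed `dispersion_norm` expression in the
`filter_genes_dispersion` hunk above is, in informal notation, a per-bin z-score;
this paraphrase is mine, not text from the patch:

    \hat{d}_g = \frac{d_g - \mu_{b(g)}}{\sigma_{b(g)}}

i.e. each gene's dispersion d_g is centered and scaled by the mean and standard
deviation of the dispersions of all genes falling in the same mean-expression bin b(g).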
""" - X = adata.layers[layer] if layer is not None else adata.X if flavor == 'seurat': if 'log1p' in adata.uns_keys() and adata.uns['log1p']['base'] is not None: @@ -265,7 +265,7 @@ def _highly_variable_genes_single_batch( ::-1 ].sort() # interestingly, np.argpartition is slightly slower if n_top_genes > adata.n_vars: - logg.info(f'`n_top_genes` > `adata.n_var`, returning all genes.') + logg.info('`n_top_genes` > `adata.n_var`, returning all genes.') n_top_genes = adata.n_vars disp_cut_off = dispersion_norm[n_top_genes - 1] gene_subset = np.nan_to_num(df['dispersions_norm'].values) >= disp_cut_off diff --git a/scanpy/preprocessing/_pca.py b/scanpy/preprocessing/_pca.py index 9ab94cfae2..fc4ec679cc 100644 --- a/scanpy/preprocessing/_pca.py +++ b/scanpy/preprocessing/_pca.py @@ -113,7 +113,7 @@ def pca( Explained variance, equivalent to the eigenvalues of the covariance matrix. """ - logg_start = logg.info(f'computing PCA') + logg_start = logg.info('computing PCA') # chunked calculation is not randomized, anyways if svd_solver in {'auto', 'randomized'} and not chunked: diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index b10f0bfc84..866e8e3183 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -551,7 +551,7 @@ def normalize_per_cell( # proceed with data matrix X = data.copy() if copy else data if counts_per_cell is None: - if copy == False: + if not copy: raise ValueError('Can only be run with copy=True') cell_subset, counts_per_cell = filter_cells(X, min_counts=min_counts) X = X[cell_subset] @@ -752,7 +752,11 @@ def scale( annotated with `'mean'` and `'std'` in `adata.var`. """ _check_array_function_arguments(layer=layer, obsm=obsm) - return scale_array(data, zero_center=zero_center, max_value=max_value, copy=copy) + if layer is not None: + raise ValueError(f"`layer` argument inappropriate for value of type {type(X)}") + if obsm is not None: + raise ValueError(f"`obsm` argument inappropriate for value of type {type(X)}") + return scale_array(X, zero_center=zero_center, max_value=max_value, copy=copy) @scale.register(np.ndarray) diff --git a/scanpy/readwrite.py b/scanpy/readwrite.py index 96c9730f67..87d2055fc4 100644 --- a/scanpy/readwrite.py +++ b/scanpy/readwrite.py @@ -815,9 +815,9 @@ def _read_softgz(filename: Union[str, bytes, Path, BinaryIO]) -> AnnData: # Next line is the column headers (sample id's) sample_names = file.readline().strip().split("\t") # The column indices that contain gene expression data - I = [i for i, x in enumerate(sample_names) if x.startswith("GSM")] + indices = [i for i, x in enumerate(sample_names) if x.startswith("GSM")] # Restrict the column headers to those that we keep - sample_names = [sample_names[i] for i in I] + sample_names = [sample_names[i] for i in indices] # Get a list of sample labels groups = [samples_info[k] for k in sample_names] # Read the gene expression data as a list of lists, also get the gene @@ -831,7 +831,7 @@ def _read_softgz(filename: Union[str, bytes, Path, BinaryIO]) -> AnnData: V = line.split("\t") # Extract the values that correspond to gene expression measures # and convert the strings to numbers - x = [float(V[i]) for i in I] + x = [float(V[i]) for i in indices] X.append(x) gene_names.append(V[1]) # Convert the Python list of lists to a Numpy array and transpose to match @@ -914,7 +914,7 @@ def get_used_files(): filenames.append(nt.path) # This catches a race condition where a process ends # before we can examine its files - except psutil.NoSuchProcess as err: + 
except psutil.NoSuchProcess: pass return set(filenames) diff --git a/scanpy/tests/conftest.py b/scanpy/tests/conftest.py index a948488373..667354c591 100644 --- a/scanpy/tests/conftest.py +++ b/scanpy/tests/conftest.py @@ -10,6 +10,7 @@ import scanpy + scanpy.settings.verbosity = "hint" # define this after importing scanpy but before running tests diff --git a/scanpy/tests/external/test_scrublet.py b/scanpy/tests/external/test_scrublet.py index 83e15d9ca4..e09a3702c0 100644 --- a/scanpy/tests/external/test_scrublet.py +++ b/scanpy/tests/external/test_scrublet.py @@ -16,8 +16,6 @@ def test_scrublet(): adata = sc.datasets.pbmc3k() sce.pp.scrublet(adata, use_approx_neighbors=False) - errors = [] - # replace assertions by conditions assert "predicted_doublet" in adata.obs.columns assert "doublet_score" in adata.obs.columns @@ -36,8 +34,6 @@ def test_scrublet_dense(): adata = sc.datasets.paul15()[:500].copy() sce.pp.scrublet(adata, use_approx_neighbors=False) - errors = [] - # replace assertions by conditions assert "predicted_doublet" in adata.obs.columns assert "doublet_score" in adata.obs.columns @@ -102,7 +98,7 @@ def test_scrublet_simulate_doublets(): sc.pp.normalize_total(adata_obs) logged = sc.pp.log1p(adata_obs, copy=True) - hvg = sc.pp.highly_variable_genes(logged) + _ = sc.pp.highly_variable_genes(logged) adata_obs = adata_obs[:, logged.var['highly_variable']] adata_sim = sce.pp.scrublet_simulate_doublets(adata_obs, layer='raw') diff --git a/scanpy/tests/test_embedding_plots.py b/scanpy/tests/test_embedding_plots.py index 841ff63a8c..49746e45c3 100644 --- a/scanpy/tests/test_embedding_plots.py +++ b/scanpy/tests/test_embedding_plots.py @@ -200,7 +200,7 @@ def test_enumerated_palettes(fixture_request, adata, tmpdir, plotfunc): check_images(dict_pth, list_pth, tol=15) -## Spatial specific +# Spatial specific def test_visium_circles(image_comparer): # standard visium data diff --git a/scanpy/tests/test_logging.py b/scanpy/tests/test_logging.py index a954d85421..0da37040bc 100644 --- a/scanpy/tests/test_logging.py +++ b/scanpy/tests/test_logging.py @@ -5,7 +5,7 @@ import pytest -from scanpy import Verbosity, settings as s, logging as l +from scanpy import Verbosity, settings as s, logging as log import scanpy as sc @@ -24,29 +24,29 @@ def test_defaults(): def test_formats(capsys, logging_state): s.logfile = sys.stderr s.verbosity = Verbosity.debug - l.error('0') + log.error('0') assert capsys.readouterr().err == 'ERROR: 0\n' - l.warning('1') + log.warning('1') assert capsys.readouterr().err == 'WARNING: 1\n' - l.info('2') + log.info('2') assert capsys.readouterr().err == '2\n' - l.hint('3') + log.hint('3') assert capsys.readouterr().err == '--> 3\n' - l.debug('4') + log.debug('4') assert capsys.readouterr().err == ' 4\n' def test_deep(capsys, logging_state): s.logfile = sys.stderr s.verbosity = Verbosity.hint - l.hint('0') + log.hint('0') assert capsys.readouterr().err == '--> 0\n' - l.hint('1', deep='1!') + log.hint('1', deep='1!') assert capsys.readouterr().err == '--> 1\n' s.verbosity = Verbosity.debug - l.hint('2') + log.hint('2') assert capsys.readouterr().err == '--> 2\n' - l.hint('3', deep='3!') + log.hint('3', deep='3!') assert capsys.readouterr().err == '--> 3: 3!\n' @@ -57,15 +57,15 @@ def test_logfile(tmp_path, logging_state): s.logfile = io assert s.logfile is io assert s.logpath is None - l.error('test!') + log.error('test!') assert io.getvalue() == 'ERROR: test!\n' p = tmp_path / 'test.log' s.logpath = p assert s.logpath == p assert s.logfile.name == str(p) - l.hint('test2') 
-    l.debug('invisible')
+    log.hint('test2')
+    log.debug('invisible')
     assert s.logpath.read_text() == '--> test2\n'
@@ -80,18 +80,18 @@ def test_timing(monkeypatch, capsys, logging_state):
         def now(tz):
             counter += 1
             return datetime(2000, 1, 1, second=counter, microsecond=counter, tzinfo=tz)

-    monkeypatch.setattr(l, 'datetime', IncTime)
+    monkeypatch.setattr(log, 'datetime', IncTime)
     s.verbosity = Verbosity.debug

-    l.hint('1')
+    log.hint('1')
     assert counter == 1 and capsys.readouterr().err == '--> 1\n'
-    start = l.info('2')
+    start = log.info('2')
     assert counter == 2 and capsys.readouterr().err == '2\n'
-    l.hint('3')
+    log.hint('3')
     assert counter == 3 and capsys.readouterr().err == '--> 3\n'
-    l.info('4', time=start)
+    log.info('4', time=start)
     assert counter == 4 and capsys.readouterr().err == '4 (0:00:02)\n'
-    l.info('5 {time_passed}', time=start)
+    log.info('5 {time_passed}', time=start)
     assert counter == 5 and capsys.readouterr().err == '5 0:00:03\n'
diff --git a/scanpy/tests/test_pca.py b/scanpy/tests/test_pca.py
index 36b6fab362..7d7837a9a0 100644
--- a/scanpy/tests/test_pca.py
+++ b/scanpy/tests/test_pca.py
@@ -1,8 +1,6 @@
 import pytest
 import numpy as np
 from anndata import AnnData
-from scipy.sparse import csr_matrix
-from scipy import sparse

 import scanpy as sc
 from scanpy.tests.fixtures import array_type, float_dtype
diff --git a/scanpy/tests/test_plotting.py b/scanpy/tests/test_plotting.py
index 366402e23c..4773817fe1 100644
--- a/scanpy/tests/test_plotting.py
+++ b/scanpy/tests/test_plotting.py
@@ -435,7 +435,7 @@ def test_multiple_plots(image_comparer):
     fig, (ax1, ax2, ax3) = plt.subplots(
         1, 3, figsize=(20, 5), gridspec_kw={'wspace': 0.7}
     )
-    __ = sc.pl.stacked_violin(
+    _ = sc.pl.stacked_violin(
         adata,
         markers,
         groupby='bulk_labels',
         ax=ax1,
         title='stacked_violin',
         dendrogram=True,
         show=False,
     )
-    __ = sc.pl.dotplot(
+    _ = sc.pl.dotplot(
         adata,
         markers,
         groupby='bulk_labels',
         ax=ax2,
         title='dotplot',
         dendrogram=True,
         show=False,
     )
-    __ = sc.pl.matrixplot(
+    _ = sc.pl.matrixplot(
         adata,
         markers,
         groupby='bulk_labels',
diff --git a/scanpy/tests/test_preprocessing_distributed.py b/scanpy/tests/test_preprocessing_distributed.py
index b7d91a063e..871656c1ce 100644
--- a/scanpy/tests/test_preprocessing_distributed.py
+++ b/scanpy/tests/test_preprocessing_distributed.py
@@ -5,9 +5,9 @@
 import numpy.testing as npt
 import pytest

-from scanpy.preprocessing import *
-from scanpy.preprocessing._simple import materialize_as_ndarray
-
+from scanpy.preprocessing import normalize_total, filter_genes
+from scanpy.preprocessing import log1p, normalize_per_cell, filter_cells
+from scanpy.preprocessing._distributed import materialize_as_ndarray

 HERE = Path(__file__).parent / Path('_data/')
 input_file = str(Path(HERE, "10x-10k-subset.zarr"))
diff --git a/scanpy/tests/test_scaling.py b/scanpy/tests/test_scaling.py
index 9a0bf835b2..4eae8bae94 100644
--- a/scanpy/tests/test_scaling.py
+++ b/scanpy/tests/test_scaling.py
@@ -26,7 +26,7 @@
 @pytest.mark.parametrize('typ', [np.array, csr_matrix], ids=lambda x: x.__name__)
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
 def test_scale(typ, dtype):
-    ## test AnnData arguments
+    # test AnnData arguments
     # test scaling with default zero_center == True
     adata0 = AnnData(typ(X), dtype=dtype)
     sc.pp.scale(adata0)
     adata2 = AnnData(typ(X), dtype=dtype)
     sc.pp.scale(adata2, zero_center=False)
     assert np.allclose(csr_matrix(adata2.X).toarray(), X_scaled)
-    ## test bare count arguments, for simplicity only with explicit copy=True
+    # test bare count arguments, for simplicity only with explicit copy=True
     # test scaling with default zero_center == True
     data0 = typ(X, dtype=dtype)
     cdata0 = sc.pp.scale(data0, copy=True)
diff --git a/scanpy/tools/_diffmap.py b/scanpy/tools/_diffmap.py
index 9fda35e2a7..e5b5b8b8f4 100644
--- a/scanpy/tools/_diffmap.py
+++ b/scanpy/tools/_diffmap.py
@@ -52,6 +52,13 @@ def diffmap(
     `diffmap_evals` : :class:`numpy.ndarray` (`adata.uns`)
         Array of size (number of eigen vectors).
         Eigenvalues of transition matrix.
+
+    Notes
+    -----
+    The 0-th column in `adata.obsm["X_diffmap"]` is the steady-state solution,
+    which is non-informative in diffusion maps.
+    Therefore, the first diffusion component is at index 1,
+    e.g. `adata.obsm["X_diffmap"][:,1]`
     """
     if neighbors_key is None:
         neighbors_key = 'neighbors'
diff --git a/scanpy/tools/_dpt.py b/scanpy/tools/_dpt.py
index f2060eeb9b..716987f726 100644
--- a/scanpy/tools/_dpt.py
+++ b/scanpy/tools/_dpt.py
@@ -342,7 +342,9 @@ def check_adjacency(self):
         for n_edges in range(1, np.max(n_edges_per_seg) + 1):
             for iseg in range(self.segs_adjacency.shape[0]):
                 if n_edges_per_seg[iseg] == n_edges:
-                    neighbor_segs = self.segs_adjacency[iseg].todense().A1
+                    neighbor_segs = (  # noqa: F841 TODO Evaluate whether to assign the variable or not
+                        self.segs_adjacency[iseg].todense().A1
+                    )
                     closest_points_other_segs = [
                         seg[np.argmin(self.distances_dpt[self.segs_tips[iseg][0], seg])]
                         for seg in self.segs
@@ -593,7 +595,9 @@ def detect_branching(
             for iseg, seg_connects in enumerate(ssegs_connects)
             if iseg != trunk
         ]
-        prev_connecting_points = segs_connects[iseg]
+        prev_connecting_points = segs_connects[  # noqa: F841 TODO Evaluate whether to assign the variable or not
+            iseg
+        ]
         for jseg_cnt, jseg in enumerate(prev_connecting_segments):
             iseg_cnt = 0
             for iseg_new, seg_new in enumerate(ssegs):
@@ -904,8 +908,8 @@ def _detect_branching_single_haghverdi16(self, Dseg, tips):
         ps = [
             [0, 1, 2],  # start by computing distances from the first tip
             [1, 2, 0],  # -"- second tip
-            [2, 0, 1],
-        ]  # -"- third tip
+            [2, 0, 1],  # -"- third tip
+        ]
         for i, p in enumerate(ps):
             ssegs.append(self.__detect_branching_haghverdi16(Dseg, tips[p]))
         return ssegs
@@ -984,7 +988,7 @@ def __detect_branching_haghverdi16(
             Dseg[tips[0]][idcs] + Dseg[tips[1]][idcs] + Dseg[tips[2]][idcs]
         )
         # init list to store new segments
-        ssegs = []
+        ssegs = []  # noqa: F841  # TODO Look into this
         # first new segment: all points until, but excluding the branching point
         # increasing the following slightly from imax is a more conservative choice
         # as the criterion based on normalized distances, which follows below,
diff --git a/scanpy/tools/_paga.py b/scanpy/tools/_paga.py
index dea2f6f552..135c0ec6f9 100644
--- a/scanpy/tools/_paga.py
+++ b/scanpy/tools/_paga.py
@@ -1,4 +1,3 @@
-from collections import namedtuple
 from typing import List, Optional, NamedTuple

 import numpy as np
diff --git a/scanpy/tools/_sim.py b/scanpy/tools/_sim.py
index 98e88f7d70..32f7099fa4 100644
--- a/scanpy/tools/_sim.py
+++ b/scanpy/tools/_sim.py
@@ -207,7 +207,6 @@ def sample_dynamic_data(**params):
     step = 5
     grnsim = GRNsim(dim=dim, initType=initType, model=model_key, params=params)

-    curr_nrSamples = 0
     Xsamples = []
     for sample in range(maxNrSamples):
         # choose initial conditions such that branchings result
@@ -410,9 +409,8 @@ def __init__(
         # checks
         if initType not in ['branch', 'random']:
             raise RuntimeError('initType must be either: branch, random')
-        read = False
         if model not in self.availModels.keys():
-            message = 'model not among predefined models \n'
+            message = 'model not among predefined models \n'  # noqa: F841  # TODO FIX
             # read from file
             from .. import sim_models
@@ -772,13 +770,12 @@ def branch_init_model1(self, tmax=100):
         if np.min(X0mean) < 0.025 or np.max(X0mean) > 0.975:
             settings.m(0, '... initial point is too close to bounds')
             return None

-        # if self.show and self.verbosity > 1:
-        pl.figure()
-        pl.plot(XbackUp[:, 0], '.b', XbackUp[:, 1], '.g')
-        pl.plot(XbackDo[:, 0], '.b', XbackDo[:, 1], '.g')
-        pl.plot(Xup[:, 0], 'b', Xup[:, 1], 'g')
-        pl.plot(Xdo[:, 0], 'b', Xdo[:, 1], 'g')
+        pl.figure()  # noqa: F821 TODO Fix me
+        pl.plot(XbackUp[:, 0], '.b', XbackUp[:, 1], '.g')  # noqa: F821 TODO Fix me
+        pl.plot(XbackDo[:, 0], '.b', XbackDo[:, 1], '.g')  # noqa: F821 TODO Fix me
+        pl.plot(Xup[:, 0], 'b', Xup[:, 1], 'g')  # noqa: F821 TODO Fix me
+        pl.plot(Xdo[:, 0], 'b', Xdo[:, 1], 'g')  # noqa: F821 TODO Fix me
         return X0mean

     def parents_from_boolRule(self, rule):
@@ -845,7 +842,7 @@ def build_boolCoeff(self):
             if self.verbosity > 1:
                 settings.m(0, '...' + key)
                 settings.m(0, rule)
-                settings.m(0, rule_pa)
+                settings.m(0, rule_pa)  # noqa: F821
             # now evaluate coefficients
             for tuple in list(
                 itertools.product([False, True], repeat=len(self.pas[key]))
@@ -1078,7 +1075,7 @@ def sim_givenAdj(self, Adj: np.ndarray, model='line'):
             Data array of shape (n_samples,dim).
         """
         # nice examples
-        examples = [
+        examples = [  # noqa: F841 TODO We are really unsure whether this is needed.
             dict(
                 func='sawtooth',
                 gdist='uniform',
@@ -1174,11 +1171,13 @@ def sim_combi(self):
         # AND type / horizontal
         X[:, 2] = func(X[:, 0]) * sp.stats.norm.cdf(X[:, 1], 1, 0.2)

-        pl.scatter(X[:, 0], X[:, 1], c=X[:, 2], edgecolor='face')
-        pl.show()
+        pl.scatter(  # noqa: F821 TODO Fix me
+            X[:, 0], X[:, 1], c=X[:, 2], edgecolor='face'
+        )
+        pl.show()  # noqa: F821 TODO Fix me

-        pl.plot(X[:, 1], X[:, 2], '.')
-        pl.show()
+        pl.plot(X[:, 1], X[:, 2], '.')  # noqa: F821 TODO Fix me
+        pl.show()  # noqa: F821 TODO Fix me
         return X
diff --git a/scanpy/tools/_top_genes.py b/scanpy/tools/_top_genes.py
index 439181b081..28a9390590 100644
--- a/scanpy/tools/_top_genes.py
+++ b/scanpy/tools/_top_genes.py
@@ -68,8 +68,8 @@ def correlation_matrix(

     # TODO: At the moment, only works for int identifiers

-    ### If no genes are passed, selects ranked genes from sample annotation.
-    ### At the moment, only calculate one table (Think about what comes next)
+    # If no genes are passed, selects ranked genes from sample annotation.
+    # At the moment, only calculate one table (Think about what comes next)
     if name_list is None:
         name_list = list()
         for j, k in enumerate(adata.uns['rank_genes_groups_gene_names']):
@@ -128,7 +128,7 @@ def ROC_AUC_analysis(
     Calculate correlation matrix.

     Calculate a correlation matrix for genes stored in sample annotation
-    
+
     Parameters
     ----------
     adata
@@ -159,7 +159,6 @@ def ROC_AUC_analysis(
     groups_order, groups_masks = select_groups(adata, groups, groupby)

     # Use usual convention, better for looping later.
-    imask = group
     mask = groups_masks[group]

     # TODO: Allow for sample weighting requires better mask access... later
@@ -192,11 +191,11 @@ def ROC_AUC_analysis(

 def subsampled_estimates(mask, mask_rest=None, precision=0.01, probability=0.99):
-    ## Simple method that can be called by rank_gene_group. It uses masks that have been passed to the function and
-    ## calculates how much has to be subsampled in order to reach a certain precision with a certain probability
-    ## Then it subsamples for mask, mask rest
-    ## Since convergence speed varies, we take the slower one, i.e. the variance. This might have future speed-up
-    ## potential
+    # Simple method that can be called by rank_gene_group. It uses masks that have been passed to the function and
+    # calculates how much has to be subsampled in order to reach a certain precision with a certain probability
+    # Then it subsamples for mask, mask rest
+    # Since convergence speed varies, we take the slower one, i.e. the variance. This might have future speed-up
+    # potential
     if mask_rest is None:
         mask_rest = ~mask
     # TODO: DO precision calculation for mean variance shared

 def dominated_ROC_elimination(adata, grouby):
-    ## This tool has the purpose to take a set of genes (possibly already pre-selected) and analyze AUC.
-    ## Those and only those are eliminated who are dominated completely
-    ## TODO: Potentially (But not till tomorrow), this can be adapted to only consider the AUC in the given
-    ## TODO: optimization frame
+    # This tool has the purpose to take a set of genes (possibly already pre-selected) and analyze AUC.
+    # Those and only those are eliminated that are dominated completely
+    # TODO: Potentially (But not till tomorrow), this can be adapted to only consider the AUC in the given
+    # TODO: optimization frame
     pass

 def _gene_preselection(adata, mask, thresholds):
-    ## This tool serves to
-    ## It is not thought to be addressed directly but rather using rank_genes_group or ROC analysis or comparable
-    ## TODO: Pass back a truncated adata object with only those genes that fullfill thresholding criterias
-    ## This function should be accessible by both rank_genes_groups and ROC_curve analysis
+    # This tool serves to
+    # It is not thought to be addressed directly but rather using rank_genes_group or ROC analysis or comparable
+    # TODO: Pass back a truncated adata object with only those genes that fulfill thresholding criteria
+    # This function should be accessible by both rank_genes_groups and ROC_curve analysis
     pass

From 2088ed3efb1ee16d5a45e2c10519cc1aedd125e2 Mon Sep 17 00:00:00 2001
From: Jan Lause <34481813+jlause@users.noreply.github.com>
Date: Fri, 19 Mar 2021 07:22:02 +0100
Subject: [PATCH 2/3] bugfix for #1725 (#1732)

* fixes #1725

* removed unneeded mean_fulldataset/var_fulldataset variables

* release note and tests

* making variance dtype explicit in test

* Minor cleanup -- ivirshup

Co-authored-by: Isaac Virshup
---
 docs/release-notes/1.7.2.rst                |  1 +
 .../preprocessing/_highly_variable_genes.py | 42 +++++++++----------
 scanpy/tests/test_highly_variable_genes.py  | 22 ++++++++++
 3 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/docs/release-notes/1.7.2.rst b/docs/release-notes/1.7.2.rst
index 4432dcf9d6..034da2fb61 100644
--- a/docs/release-notes/1.7.2.rst
+++ b/docs/release-notes/1.7.2.rst
@@ -7,6 +7,7 @@

 - :func:`scanpy.logging.print_versions` now works when `python<3.8` :pr:`1691` :smaller:`I Virshup`
 - :func:`scanpy.pp.regress_out` now uses `joblib` as the parallel backend, and should stop oversubscribing threads :pr:`1694` :smaller:`I Virshup`
+- :func:`scanpy.pp.highly_variable_genes` with `flavor="seurat_v3"` now returns correct gene means and variances when used with `batch_key` :pr:`1732` :smaller:`J Lause`
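
[Editor's note] A minimal sketch of the call this release note refers to, mirroring
the test added below in this patch; the dataset, batch split, and `n_top_genes` are
illustrative, and `flavor='seurat_v3'` additionally requires the scikit-misc package:

>>> import numpy as np
>>> import scanpy as sc
>>> pbmc = sc.datasets.pbmc3k()
>>> pbmc.obs['batch'] = np.where(np.arange(pbmc.n_obs) < 1500, '0', '1')
>>> df = sc.pp.highly_variable_genes(
...     pbmc, flavor='seurat_v3', n_top_genes=2000, batch_key='batch', inplace=False
... )
>>> df[['means', 'variances']]  # now computed on the full data, not the last batch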
 .. rubric:: Deprecations

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 17a36d6520..4618f9beca 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -55,7 +55,7 @@ def _highly_variable_genes_seurat_v3(
         raise ImportError(
             'Please install skmisc package via `pip install --user scikit-misc'
         )
-
+    df = pd.DataFrame(index=adata.var_names)
     X = adata.layers[layer] if layer is not None else adata.X

     if check_nonnegative_integers(X) is False:
         raise ValueError(
             "`pp.highly_variable_genes` with `flavor='seurat_v3'` expects "
             "raw count data."
         )

+    df['means'], df['variances'] = _get_mean_var(X)
+
     if batch_key is None:
         batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int))
     else:
@@ -71,13 +73,11 @@ def _highly_variable_genes_seurat_v3(
     norm_gene_vars = []
     for b in np.unique(batch_info):
+        X_batch = X[batch_info == b]

-        ad = adata[batch_info == b]
-        X = ad.layers[layer] if layer is not None else ad.X
-
-        mean, var = _get_mean_var(X)
+        mean, var = _get_mean_var(X_batch)
         not_const = var > 0
-        estimat_var = np.zeros(adata.shape[1], dtype=np.float64)
+        estimat_var = np.zeros(X.shape[1], dtype=np.float64)

         y = np.log10(var[not_const])
         x = np.log10(mean[not_const])
@@ -86,15 +86,18 @@ def _highly_variable_genes_seurat_v3(
         estimat_var[not_const] = model.outputs.fitted_values
         reg_std = np.sqrt(10 ** estimat_var)

-        batch_counts = X.astype(np.float64).copy()
+        batch_counts = X_batch.astype(np.float64).copy()
         # clip large values as in Seurat
-        N = np.sum(batch_info == b)
+        N = X_batch.shape[0]
         vmax = np.sqrt(N)
         clip_val = reg_std * vmax + mean
         if sp_sparse.issparse(batch_counts):
             batch_counts = sp_sparse.csr_matrix(batch_counts)
             mask = batch_counts.data > clip_val[batch_counts.indices]
             batch_counts.data[mask] = clip_val[batch_counts.indices[mask]]
+
+            squared_batch_counts_sum = np.array(batch_counts.power(2).sum(axis=0))
+            batch_counts_sum = np.array(batch_counts.sum(axis=0))
         else:
             clip_val_broad = np.broadcast_to(clip_val, batch_counts.shape)
             np.putmask(
@@ -103,10 +106,6 @@ def _highly_variable_genes_seurat_v3(
                 clip_val_broad,
             )

-        if sp_sparse.issparse(batch_counts):
-            squared_batch_counts_sum = np.array(batch_counts.power(2).sum(axis=0))
-            batch_counts_sum = np.array(batch_counts.sum(axis=0))
-        else:
             squared_batch_counts_sum = np.square(batch_counts).sum(axis=0)
             batch_counts_sum = batch_counts.sum(axis=0)
@@ -130,22 +129,21 @@ def _highly_variable_genes_seurat_v3(
     ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars)
     median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan)

-    df = pd.DataFrame(index=np.array(adata.var_names))
     df['highly_variable_nbatches'] = num_batches_high_var
     df['highly_variable_rank'] = median_ranked
     df['variances_norm'] = np.mean(norm_gene_vars, axis=0)
-    df['means'] = mean
-    df['variances'] = var

-    df.sort_values(
-        ['highly_variable_rank', 'highly_variable_nbatches'],
-        ascending=[True, False],
-        na_position='last',
-        inplace=True,
+    sorted_index = (
+        df[['highly_variable_rank', 'highly_variable_nbatches']]
+        .sort_values(
+            ['highly_variable_rank', 'highly_variable_nbatches'],
+            ascending=[True, False],
+            na_position='last',
+        )
+        .index
     )
     df['highly_variable'] = False
-    df.loc[: int(n_top_genes), 'highly_variable'] = True
-    df = df.loc[adata.var_names]
+    df.loc[sorted_index[: int(n_top_genes)], 'highly_variable'] = True

     if inplace or subset:
         adata.uns['hvg'] = {'flavor': 'seurat_v3'}
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index 8b47d5357e..dfabf267db 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -238,3 +238,25 @@ def test_highly_variable_genes_batches():
     ]

     assert np.all(np.isin(colnames, hvg1.columns))
+
+
+from scanpy.preprocessing._utils import _get_mean_var
+
+
+def test_seurat_v3_mean_var_output_with_batchkey():
+    pbmc = sc.datasets.pbmc3k()
+    pbmc.var_names_make_unique()
+    n_cells = pbmc.shape[0]
+    batch = np.zeros((n_cells), dtype=int)
+    batch[1500:] = 1
+    pbmc.obs["batch"] = batch
+
+    # true_mean, true_var = _get_mean_var(pbmc.X)
+    true_mean = np.mean(pbmc.X.toarray(), axis=0)
+    true_var = np.var(pbmc.X.toarray(), axis=0, dtype=np.float64, ddof=1)
+
+    result_df = sc.pp.highly_variable_genes(
+        pbmc, batch_key='batch', flavor='seurat_v3', n_top_genes=4000, inplace=False
+    )
+
+    np.testing.assert_allclose(true_mean, result_df['means'], rtol=2e-05, atol=2e-05)
+    np.testing.assert_allclose(true_var, result_df['variances'], rtol=2e-05, atol=2e-05)

From 9b0d551df3f45b4f0f724ba92e89fc0e08f7c6da Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 19 Mar 2021 18:29:53 +1100
Subject: [PATCH 3/3] Run pre-commit over everything

---
 .flake8                     |  1 +
 scanpy/external/pp/_scvi.py | 16 ++++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.flake8 b/.flake8
index 63a2344286..555b6aecc2 100644
--- a/.flake8
+++ b/.flake8
@@ -34,4 +34,5 @@ exclude =
     build,
     docs/_build,
     dist,
+    scanpy/api/*,
diff --git a/scanpy/external/pp/_scvi.py b/scanpy/external/pp/_scvi.py
index baf47e684d..f4f75b0607 100644
--- a/scanpy/external/pp/_scvi.py
+++ b/scanpy/external/pp/_scvi.py
@@ -33,12 +33,12 @@ def scvi(
     Fits scVI model onto raw count data given an AnnData object

-    scVI uses stochastic optimization and deep neural networks to aggregate information 
+    scVI uses stochastic optimization and deep neural networks to aggregate information
     across similar cells and genes and to approximate the distributions that underlie
     observed expression values, while accounting for batch effects and limited sensitivity.

     To use a linear-decoded Variational AutoEncoder model (an implementation of [Svensson20]_),
-    set linear_decoded = True. Compared to standard VAE, this model is less powerful, but can 
+    set `linear_decoder = True`. Compared to standard VAE, this model is less powerful, but can
     be used to inspect which genes contribute to variation in the dataset. It may also be used
     for all scVI tasks, like differential expression, batch correction, imputation, etc.
     However, batch correction may be less powerful as it assumes a linear model.
@@ -69,13 +69,13 @@ def scvi(
     train_size
         The train size, either a float between 0 and 1 or an integer for the number of training samples to use
     batch_key
-        Column name in anndata.obs for batches. 
+        Column name in anndata.obs for batches.
         If None, no batch correction is performed
         If not None, batch correction is performed per batch category
     use_highly_variable_genes
         If true, uses only the genes in anndata.var["highly_variable"]
     subset_genes
-        Optional list of indices or gene names to subset anndata. 
+        Optional list of indices or gene names to subset anndata.
         If not None, use_highly_variable_genes is ignored
     linear_decoder
         If true, uses LDVAE model, which is an implementation of [Svensson20]_.
@@ -89,18 +89,18 @@ def scvi(
     trainer_kwargs
         Extra arguments for UnsupervisedTrainer
     model_kwargs
        Extra arguments for VAE or LDVAE model
-    
+
     Returns
     -------
     If `copy` is true, anndata is returned.
     If `return_posterior` is true, the posterior object is returned
-    If both `copy` and `return_posterior` are true, 
-    a tuple of anndata and the posterior are returned in that order. 
+    If both `copy` and `return_posterior` are true,
+    a tuple of anndata and the posterior is returned in that order.

     `adata.obsm['X_scvi']` stores the latent representations
     `adata.obsm['X_scvi_denoised']` stores the normalized mean of the negative binomial
     `adata.obsm['X_scvi_sample_rate']` stores the mean of the negative binomial
-    
+
     If linear_decoder is true:
     `adata.uns['ldvae_loadings']` stores the per-gene weights in the linear decoder as a
     genes by n_latent matrix.
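
[Editor's note] The scvi docstring above reads more easily next to a usage sketch.
This is hypothetical: it assumes the external `scvi` package is installed, uses only
parameters and output fields documented in the hunk above, and leaves all other
arguments at their defaults:

>>> import scanpy as sc
>>> import scanpy.external as sce
>>> adata = sc.datasets.pbmc3k()  # raw counts, as the model expects
>>> sce.pp.scvi(adata, batch_key=None, use_highly_variable_genes=False)
>>> adata.obsm['X_scvi']           # latent representation
>>> adata.obsm['X_scvi_denoised']  # normalized mean of the negative binomial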