
Backport pre-commit and 1732 #1751


Merged
merged 3 commits on Mar 19, 2021
38 changes: 38 additions & 0 deletions .flake8
@@ -0,0 +1,38 @@
# Can't yet be moved to the pyproject.toml due to https://gitlab.com/pycqa/flake8/-/issues/428#note_251982786
[flake8]
max-line-length = 88
ignore = # module imported but unused -> required for Scanpy's API
F401,
# line break before a binary operator -> black does not adhere to PEP8
W503,
# line break occurred after a binary operator -> black does not adhere to PEP8
W504,
# line too long -> we accept long comment lines; black gets rid of long code lines
E501,
# whitespace before : -> black does not adhere to PEP8
E203,
# missing whitespace after ',', ';', or ':' -> black does not adhere to PEP8
E231,
# module level import not at top of file -> required to circumvent circular imports for Scanpy's API
E402,
# continuation line over-indented for hanging indent -> black does not adhere to PEP8
E126,
# inline comment should start with '# ' -> Scanpy allows them for specific explanations
E262,
# too many leading '#' for block comment -> Scanpy allows them for section comments
E266,
# Do not assign a lambda expression, use a def -> Scanpy allows lambda expression assignments
E731,
# allow I, O, l as variable names -> I is the identity matrix; i, j, k, l are reasonable indexing notation
E741
per-file-ignores =
# F811 Redefinition of unused name from line, does not play nice with pytest fixtures
tests/test*.py: F811
exclude =
.git,
__pycache__,
build,
docs/_build,
dist,
scanpy/api/*,
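To make the ignore list above concrete, here is a minimal, self-contained sketch of code these exceptions permit (the names are illustrative, not from Scanpy): E741 allows `I` for an identity matrix and E731 allows assigning a lambda. The trailing `# noqa` tags show the per-line alternative; with the project-wide ignores above they would be redundant.

```python
# E741: "I" as a variable name is allowed (here, a 3x3 identity matrix).
I = [[1 if r == c else 0 for c in range(3)] for r in range(3)]  # noqa: E741

# E731: assigning a lambda instead of using "def" is allowed.
trace = lambda m: sum(m[i][i] for i in range(len(m)))  # noqa: E731

assert trace(I) == 3
```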

9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -3,3 +3,12 @@ repos:
rev: 20.8b1
hooks:
- id: black
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.4
hooks:
- id: flake8
- repo: https://github.com/pre-commit/mirrors-autopep8
rev: v1.5.5
hooks:
- id: autopep8
args: ["-i"]
7 changes: 6 additions & 1 deletion docs/dev/code.rst
@@ -17,6 +17,11 @@ Code style

New code should follow
`Black <https://black.readthedocs.io/en/stable/the_black_code_style.html>`__
and Scanpy’s
and
`flake8 <https://flake8.pycqa.org>`__.
We ignore several flake8 checks; they are documented in the ``.flake8`` file at the root of this repository.
To learn how to ignore checks per line, please read
`flake8 violations <https://flake8.pycqa.org/en/latest/user/violations.html>`__.
Additionally, we use Scanpy’s
`EditorConfig <https://github.com/theislab/scanpy/blob/master/.editorconfig>`__,
so using an editor/IDE with support for both is helpful.
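The per-line mechanism the docs above point to is flake8's trailing ``# noqa: <code>`` comment. A minimal sketch (names are illustrative, not from Scanpy's codebase):

```python
# F401 normally flags an import that is never used; a trailing "# noqa"
# with the specific code silences only that check on only this line,
# which is the usual pattern for deliberate re-exports.
from os import path  # noqa: F401

# E731 normally forbids assigning a lambda; silenced here per line.
add_one = lambda x: x + 1  # noqa: E731

assert add_one(41) == 42
```

Preferring `# noqa: <code>` over a bare `# noqa` keeps all other checks active on that line.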
1 change: 1 addition & 0 deletions docs/release-notes/1.7.2.rst
@@ -7,6 +7,7 @@

- :func:`scanpy.logging.print_versions` now works when `python<3.8` :pr:`1691` :smaller:`I Virshup`
- :func:`scanpy.pp.regress_out` now uses `joblib` as the parallel backend, and should stop oversubscribing threads :pr:`1694` :smaller:`I Virshup`
- :func:`scanpy.pp.highly_variable_genes` with `flavor="seurat_v3"` now returns correct gene means and variances when used with `batch_key` :pr:`1732` :smaller:`J Lause`

.. rubric:: Deprecations

6 changes: 4 additions & 2 deletions scanpy/_utils.py
@@ -209,7 +209,7 @@ def get_igraph_from_adjacency(adjacency, directed=None):
g.add_edges(list(zip(sources, targets)))
try:
g.es['weight'] = weights
except:
except KeyError:
pass
if g.vcount() != adjacency.shape[0]:
logg.warning(
@@ -551,7 +551,9 @@ def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
import traceback

traceback.print_stack()
log = file if hasattr(file, 'write') else sys.stderr
log = ( # noqa: F841 # TODO Does this need fixing?
file if hasattr(file, 'write') else sys.stderr
)
settings.write(warnings.formatwarning(message, category, filename, lineno, line))


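The `_utils.py` change above narrows a bare `except:` to `except KeyError`. A minimal sketch of why that matters, using a hypothetical attribute mapping rather than the actual igraph object:

```python
# Narrowing the exception clause: only the expected failure is swallowed.
# A bare "except:" would also hide NameError, KeyboardInterrupt, etc.,
# masking real bugs instead of just the missing-key case.
attrs = {}  # hypothetical edge-attribute mapping with no 'weight' entry
try:
    weight = attrs['weight']
except KeyError:
    weight = None  # fall back gracefully only when the key is absent

assert weight is None
```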
14 changes: 7 additions & 7 deletions scanpy/external/pl.py
@@ -332,15 +332,15 @@ def scrublet_score_distribution(
figsize: Optional[Tuple[float, float]] = (8, 3),
):
"""\
Plot histogram of doublet scores for observed transcriptomes and simulated doublets.
Plot histogram of doublet scores for observed transcriptomes and simulated doublets.

The histogram for simulated doublets is useful for determining the correct doublet
score threshold.

The histogram for simulated doublets is useful for determining the correct doublet
score threshold.

Parameters
----------
adata
An annData object resulting from func:`~scanpy.external.scrublet`.
An annData object resulting from func:`~scanpy.external.scrublet`.
scale_hist_obs
Set y axis scale transformation in matplotlib for the plot of observed
transcriptomes (e.g. "linear", "log", "symlog", "logit")
@@ -353,9 +353,9 @@ def scrublet_score_distribution(
See also
--------
:func:`~scanpy.external.pp.scrublet`: Main way of running Scrublet, runs
preprocessing, doublet simulation (this function) and calling.
preprocessing, doublet simulation (this function) and calling.
:func:`~scanpy.external.pp.scrublet_simulate_doublets`: Run Scrublet's doublet
simulation separately for advanced usage.
simulation separately for advanced usage.
"""

threshold = adata.uns['scrublet']['threshold']
62 changes: 31 additions & 31 deletions scanpy/external/pp/_scrublet.py
@@ -1,5 +1,5 @@
from anndata import AnnData
from typing import Collection, Tuple, Optional, Union
from typing import Optional
import numpy as np
from scipy import sparse

@@ -40,7 +40,7 @@ def scrublet(
and directly call functions of Scrublet(). You may also undertake your own
preprocessing, simulate doublets with
scanpy.external.pp.scrublet_simulate_doublets(), and run the core scrublet
function scanpy.external.pp.scrublet.scrublet().
function scanpy.external.pp.scrublet.scrublet().

.. note::
More information and bug reports `here
@@ -61,7 +61,7 @@
as adata. This should have been built from adata_obs after
filtering genes and cells and selcting highly-variable genes.
sim_doublet_ratio
Number of doublets to simulate relative to the number of observed
Number of doublets to simulate relative to the number of observed
transcriptomes.
expected_doublet_rate
Where adata_sim not suplied, the estimated doublet rate for the
@@ -73,8 +73,8 @@
synthetic doublets. If 1.0, each doublet is created by simply adding
the UMI counts from two randomly sampled observed transcriptomes. For
values less than 1, the UMI counts are added and then randomly sampled
at the specified rate.
knn_dist_metric
at the specified rate.
knn_dist_metric
Distance metric used when finding nearest neighbors. For list of
valid values, see the documentation for annoy (if `use_approx_neighbors`
is True) or sklearn.neighbors.NearestNeighbors (if `use_approx_neighbors`
@@ -90,16 +90,16 @@
If True, center the data such that each gene has a mean of 0.
`sklearn.decomposition.PCA` will be used for dimensionality
reduction.
n_prin_comps
n_prin_comps
Number of principal components used to embed the transcriptomes prior
to k-nearest-neighbor graph construction.
to k-nearest-neighbor graph construction.
use_approx_neighbors
Use approximate nearest neighbor method (annoy) for the KNN
Use approximate nearest neighbor method (annoy) for the KNN
classifier.
get_doublet_neighbor_parents
If True, return (in .uns) the parent transcriptomes that generated the
doublet neighbors of each observed transcriptome. This information can
be used to infer the cell states that generated a given doublet state.
be used to infer the cell states that generated a given doublet state.
n_neighbors
Number of neighbors used to construct the KNN graph of observed
transcriptomes and simulated doublets. If ``None``, this is
@@ -133,7 +133,7 @@ def scrublet(
``adata.uns['scrublet']['doublet_scores_sim']``
Doublet scores for each simulated doublet transcriptome

``adata.uns['scrublet']['doublet_parents']``
``adata.uns['scrublet']['doublet_parents']``
Pairs of ``.obs_names`` used to generate each simulated doublet
transcriptome

@@ -143,9 +143,9 @@
See also
--------
:func:`~scanpy.external.pp.scrublet_simulate_doublets`: Run Scrublet's doublet
simulation separately for advanced usage.
simulation separately for advanced usage.
:func:`~scanpy.external.pl.scrublet_score_distribution`: Plot histogram of doublet
scores for observed transcriptomes and simulated doublets.
scores for observed transcriptomes and simulated doublets.
"""
try:
import scrublet as sl
@@ -185,7 +185,7 @@ def scrublet(
pp.highly_variable_genes(adata_obs, subset=True)
else:
logged = pp.log1p(adata_obs, copy=True)
hvg = pp.highly_variable_genes(logged)
_ = pp.highly_variable_genes(logged)
adata_obs = adata_obs[:, logged.var['highly_variable']]

# Simulate the doublets based on the raw expressions from the normalised
@@ -257,7 +257,7 @@ def _scrublet_call_doublets(
transcriptomes and simulated doublets. This is a wrapper around the core
functions of `Scrublet <https://github.com/swolock/scrublet>`__ to allow
for flexibility in applying Scanpy filtering operations upstream. Unless
you know what you're doing you should use the main scrublet() function.
you know what you're doing you should use the main scrublet() function.

.. note::
More information and bug reports `here
@@ -293,20 +293,20 @@
reduction, unless `mean_center` is True.
n_prin_comps
Number of principal components used to embed the transcriptomes prior
to k-nearest-neighbor graph construction.
to k-nearest-neighbor graph construction.
use_approx_neighbors
Use approximate nearest neighbor method (annoy) for the KNN
Use approximate nearest neighbor method (annoy) for the KNN
classifier.
knn_dist_metric
Distance metric used when finding nearest neighbors. For list of
valid values, see the documentation for annoy (if `use_approx_neighbors`
is True) or sklearn.neighbors.NearestNeighbors (if `use_approx_neighbors`
is False).
get_doublet_neighbor_parents
If True, return the parent transcriptomes that generated the
doublet neighbors of each observed transcriptome. This information can
be used to infer the cell states that generated a given
doublet state.
If True, return the parent transcriptomes that generated the
doublet neighbors of each observed transcriptome. This information can
be used to infer the cell states that generated a given
doublet state.
threshold
Doublet score threshold for calling a transcriptome a doublet. If
`None`, this is set automatically by looking for the minimum between
@@ -316,7 +316,7 @@
predicted doublets in a 2-D embedding.
random_state
Initial state for doublet simulation and nearest neighbors.
verbose
verbose
If True, print progress updates.

Returns
@@ -333,7 +333,7 @@
``adata.uns['scrublet']['doublet_scores_sim']``
Doublet scores for each simulated doublet transcriptome

``adata.uns['scrublet']['doublet_parents']``
``adata.uns['scrublet']['doublet_parents']``
Pairs of ``.obs_names`` used to generate each simulated doublet transcriptome

``uns['scrublet']['parameters']``
@@ -453,16 +453,16 @@ def scrublet_simulate_doublets(
The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows
correspond to cells and columns to genes. Genes should have been
filtered for expression and variability, and the object should contain
raw expression of the same dimensions.
raw expression of the same dimensions.
layer
Layer of adata where raw values are stored, or 'X' if values are in .X.
Layer of adata where raw values are stored, or 'X' if values are in .X.
sim_doublet_ratio
Number of doublets to simulate relative to the number of observed
Number of doublets to simulate relative to the number of observed
transcriptomes. If `None`, self.sim_doublet_ratio is used.
synthetic_doublet_umi_subsampling
Rate for sampling UMIs when creating synthetic doublets. If 1.0,
each doublet is created by simply adding the UMIs from two randomly
sampled observed transcriptomes. For values less than 1, the
Rate for sampling UMIs when creating synthetic doublets. If 1.0,
each doublet is created by simply adding the UMIs from two randomly
sampled observed transcriptomes. For values less than 1, the
UMI counts are added and then randomly sampled at the specified
rate.

@@ -471,7 +471,7 @@
adata : anndata.AnnData with simulated doublets in .X
if ``copy=True`` it returns or else adds fields to ``adata``:

``adata.uns['scrublet']['doublet_parents']``
``adata.uns['scrublet']['doublet_parents']``
Pairs of ``.obs_names`` used to generate each simulated doublet transcriptome

``uns['scrublet']['parameters']``
@@ -480,9 +480,9 @@
See also
--------
:func:`~scanpy.external.pp.scrublet`: Main way of running Scrublet, runs
preprocessing, doublet simulation (this function) and calling.
preprocessing, doublet simulation (this function) and calling.
:func:`~scanpy.external.pl.scrublet_score_distribution`: Plot histogram of doublet
scores for observed transcriptomes and simulated doublets.
scores for observed transcriptomes and simulated doublets.
"""
try:
import scrublet as sl
16 changes: 8 additions & 8 deletions scanpy/external/pp/_scvi.py
@@ -33,12 +33,12 @@ def scvi(

Fits scVI model onto raw count data given an anndata object

scVI uses stochastic optimization and deep neural networks to aggregate information
scVI uses stochastic optimization and deep neural networks to aggregate information
across similar cells and genes and to approximate the distributions that underlie
observed expression values, while accounting for batch effects and limited sensitivity.

To use a linear-decoded Variational AutoEncoder model (implementation of [Svensson20]_.),
set linear_decoded = True. Compared to standard VAE, this model is less powerful, but can
set linear_decoded = True. Compared to standard VAE, this model is less powerful, but can
be used to inspect which genes contribute to variation in the dataset. It may also be used
for all scVI tasks, like differential expression, batch correction, imputation, etc.
However, batch correction may be less powerful as it assumes a linear model.
@@ -69,13 +69,13 @@ def scvi(
train_size
The train size, either a float between 0 and 1 or an integer for the number of training samples to use
batch_key
Column name in anndata.obs for batches.
Column name in anndata.obs for batches.
If None, no batch correction is performed
If not None, batch correction is performed per batch category
use_highly_variable_genes
If true, uses only the genes in anndata.var["highly_variable"]
subset_genes
Optional list of indices or gene names to subset anndata.
Optional list of indices or gene names to subset anndata.
If not None, use_highly_variable_genes is ignored
linear_decoder
If true, uses LDVAE model, which is an implementation of [Svensson20]_.
@@ -89,18 +89,18 @@
Extra arguments for UnsupervisedTrainer
model_kwargs
Extra arguments for VAE or LDVAE model

Returns
-------
If `copy` is true, anndata is returned.
If `return_posterior` is true, the posterior object is returned
If both `copy` and `return_posterior` are true,
a tuple of anndata and the posterior are returned in that order.
If both `copy` and `return_posterior` are true,
a tuple of anndata and the posterior are returned in that order.

`adata.obsm['X_scvi']` stores the latent representations
`adata.obsm['X_scvi_denoised']` stores the normalized mean of the negative binomial
`adata.obsm['X_scvi_sample_rate']` stores the mean of the negative binomial

If linear_decoder is true:
`adata.uns['ldvae_loadings']` stores the per-gene weights in the linear decoder as a
genes by n_latent matrix.
2 changes: 1 addition & 1 deletion scanpy/external/tl/_trimap.py
@@ -76,7 +76,7 @@ def trimap(

Example
-------

>>> import scanpy as sc
>>> import scanpy.external as sce
>>> pbmc = sc.datasets.pbmc68k_reduced()