From 3c7e0e2fd106f2343ffdfa096c81a4b33da61860 Mon Sep 17 00:00:00 2001 From: bio-la Date: Tue, 28 Nov 2023 10:23:11 +0100 Subject: [PATCH 1/9] added workflow preprocess info --- docs/workflows/preprocess.md | 17 ++++++++++------- .../panpipes/pipeline_preprocess/pipeline.yml | 2 +- panpipes/python_scripts/run_scanpyQC_prot.py | 6 +++--- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/workflows/preprocess.md b/docs/workflows/preprocess.md index b8ed42c8..817bcfe1 100644 --- a/docs/workflows/preprocess.md +++ b/docs/workflows/preprocess.md @@ -4,7 +4,7 @@ Preprocessing ## Pipeline steps -The preprocess pipeline filters the data as defined in the [filtering dictionary](../usage/filter_dict_instructions.md) section of the `pipeline.yml`. The data can also been downsampled. +The preprocess pipeline filters the data as defined in the [filtering dictionary](../usage/filter_dict_instructions.md) section of the `pipeline.yml`. The data can also been downsampled to a defined number of cells. Then each modality is normalised and scaled. For the RNA this is normalising counts per cell with [scanpy.pp.normalize_total](https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.normalize_total.html) and optionally, regressing and scaling the data using scanpy functions. Highly variable genes (HVGs) are also calculated, and a PCA performed on those highly variable genes. There is an option to exclude specific genes from the HVGs e.g. HLA genes or BCR/TCR genes. These are specified in the same way as all [gene lists](../usage/gene_list_format). In the example below, the "group" in the gene list file is "exclude". ``` hvg: @@ -12,10 +12,12 @@ hvg: exclude: "exclude" ``` -For Protein assay, the data are normalised either by centralised-log-ratio or by dsb as described in the muon documentation [here](https://muon.readthedocs.io/en/latest/omics/citeseq.html). 
There is additional panpipes functionality to trim dsb outliers as discussed on the dsb [github page](https://github.com/niaid/dsb/issues/9) +For Protein assay, the data are normalised either by centralised-log-ratio or by dsb as described in the muon documentation [here](https://muon.readthedocs.io/en/latest/omics/citeseq.html). There is additional panpipes functionality to trim dsb outliers as discussed on the dsb [github page](https://github.com/niaid/dsb/issues/9) dsb can only be run if the input data contains raw counts (the cellranger outs folder). +PCA is performed on the protein data, the number of components can be specified and is automatically adjusted to be `n_vars-1` when `n_pcs > n_vars` -For the ATAC assay .... +For the ATAC assay, the data are normalized either by standard normalization or with one of the TFIDF flavours included (see [normalization](https://panpipes-pipelines.readthedocs.io/en/latest/usage/normalization_methods.html)). +Then, dimensionality reduction is computed, either LSI or PCA with custom defined number of components. ## Steps to run: @@ -25,18 +27,19 @@ For the ATAC assay .... ``panpipes preprocess config`` 2. edit the pipeline.yml file - - The filtering options are dynamic depending on your qc_mm inputs. This is described [here](../usage/filter_dict_instructions.md) + - The filtering options are dynamic depending on your `ingest` inputs. This is described [here](../usage/filter_dict_instructions.md) - There are lots of options for normalisation explained in the - pipeline.yml + pipeline.yml and in [normalization](https://panpipes-pipelines.readthedocs.io/en/latest/usage/normalization_methods.html), + check the one that works for your data 3. Run complete preprocess pipeline with ``panpipes preprocess make full`` The h5mu outputted from ``preprocess`` is filtered and normalised, and -for rna highly variable genes are computed. +for rna and atac highly variable genes are computed. 
## Expected structure of MuData object -The ideal way to run `panpipes preprocess` is to use the output mudata file from `panpipes qc_mm`, as this will make sure the MuData object has correctly names layers and slots. +The ideal way to run `panpipes preprocess` is to use the output mudata file from `panpipes ingest`, as this will make sure the MuData object has correctly names layers and slots. The bare minimum MuData object required is raw data in the X slot of each modality and a sample_id column the .obs slot of each of each modality, and the common (outer) obs. diff --git a/panpipes/panpipes/pipeline_preprocess/pipeline.yml b/panpipes/panpipes/pipeline_preprocess/pipeline.yml index 6d2cbbb1..e150978a 100644 --- a/panpipes/panpipes/pipeline_preprocess/pipeline.yml +++ b/panpipes/panpipes/pipeline_preprocess/pipeline.yml @@ -264,7 +264,7 @@ prot: # note that this feature is in the default muon mu.pp.dsb code, but manually implemented in this code. quantile_clipping: True - # which normalisation method to be store in the X slot. If you choose to run more than one normalisation method, + # which normalisation method to be stored in the X slot. If you choose to run more than one normalisation method, # which one to you want to store in the X slot, if not specified 'dsb' is the default when run. store_as_X: diff --git a/panpipes/python_scripts/run_scanpyQC_prot.py b/panpipes/python_scripts/run_scanpyQC_prot.py index 64a61879..1a9f4c50 100644 --- a/panpipes/python_scripts/run_scanpyQC_prot.py +++ b/panpipes/python_scripts/run_scanpyQC_prot.py @@ -95,10 +95,10 @@ per_cell_metrics = args.per_cell_metrics.split(",") per_cell_metrics = [a.strip() for a in per_cell_metrics] -# TODO: What happens if it is None? -# work out if we already have istype column, if not try to infer from index. + +# work out if we already have isotype column, if not try to infer from index. 
if 'isotype' not in prot.var.columns: # this means that isotype column was not included in the protein conversion table # so we are going to have a wwhack at identifying them @@ -123,7 +123,7 @@ percent_top=None,log1p=True, inplace=True) ## let's assess the isotype outlier cells. -#(Cells with an excessive amount of isotype indictaing stickiness) +#(Cells with an excessive amount of isotype indicating stickiness) if (len(isotypes) > 0) & check_for_bool(args.identify_isotype_outliers): L.info("identifying isotype outliers") # this measn we found some isotypes earlier From 4ac23aedf8d1a11e88c94aba7f4ef8e0b431e01e Mon Sep 17 00:00:00 2001 From: bio-la Date: Wed, 29 Nov 2023 13:34:47 +0100 Subject: [PATCH 2/9] added mouse tutorial --- docs/tutorials/index.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index d4f4258a..2902819f 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -1,7 +1,7 @@ Tutorials ========== -Check out the following tutorials which take you through some common analysis steps with Panpipes: +Check out the following tutorials which take you through common single cell multimodal analysis steps with Panpipes: - [Ingest workflow](https://panpipes-tutorials.readthedocs.io/en/latest/ingesting_data/Ingesting_data_with_panpipes.html) @@ -21,4 +21,5 @@ Spatial analysis: Additional tutorials: - [Ingesting multiome from cellranger outputs](https://panpipes-tutorials.readthedocs.io/en/latest/ingesting_multiome/ingesting_mome.html) +- [Ingesting mouse data](https://panpipes-tutorials.readthedocs.io/en/latest/ingesting_mouse/Ingesting_mouse_data_with_panpipes.html) From 1a29cec0e33c425fdfbdcd7401ea26f9687b79b0 Mon Sep 17 00:00:00 2001 From: bio-la Date: Wed, 29 Nov 2023 13:57:48 +0100 Subject: [PATCH 3/9] changes install file --- docs/install.md | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/docs/install.md b/docs/install.md 
index d0bbf780..fb9b7235 100644 --- a/docs/install.md +++ b/docs/install.md @@ -12,6 +12,7 @@ We create a conda environment with R and python Panpipes has a lot of dependencies, so you may want to consider [`mamba`](https://mamba.readthedocs.io/en/latest/index.html) instead of `conda for installation. ``` +#This follows the suggestions made here: [https://www.biostars.org/p/498049/](https://www.biostars.org/p/498049/) conda config --add channels conda-forge conda config --set channel_priority strict # you should remove the strict priority afterwards! @@ -23,38 +24,32 @@ now we activate the environment ``` conda activate pipeline_env ``` - -This follows the suggestions made here: [https://www.biostars.org/p/498049/](https://www.biostars.org/p/498049/) - -Install specific dependencies +Panpipes requires the unix package `time`, in conda you can install it with: +You can check if it installed with `dpkg-query -W time`. If time not already installed, you can ``` -conda install -c conda-forge pynndescent +conda install time ``` +or -Install R packages ``` -conda install -c conda-forge r-tidyverse r-optparse r-ggforce r-ggraph r-xtable r-hdf5r r-clustree +apt-get install time ``` -Panpipes requires the unix package `time`, in conda you can install it with: -You can check if it installed with -``` -dpkg-query -W time -``` -if this is not already installed on your conda env with: +Install specific python dependencies ``` -conda install time +conda install -c conda-forge pynndescent ``` -or +Install R packages ``` -apt-get install time +conda install -c conda-forge r-tidyverse r-optparse r-ggforce r-ggraph r-xtable r-hdf5r r-clustree ``` + You can install `panpipes` directly from `PyPi` with: ``` From 9444f3e6400d5c9c1b279c2237bb39594ec32e88 Mon Sep 17 00:00:00 2001 From: bio-la Date: Wed, 29 Nov 2023 13:59:34 +0100 Subject: [PATCH 4/9] test pynndescent --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 
99f9543a..2a38609d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ dependencies = [ "paramiko", "pep8", "pysam", + "pynndescent" "pytest", "pyyaml", "ruffus", From 11aaa5595193e0246b153bc0a6043c5dad614a3f Mon Sep 17 00:00:00 2001 From: bio-la Date: Thu, 30 Nov 2023 14:59:12 +0100 Subject: [PATCH 5/9] pynndescent test --- panpipes/.DS_Store | Bin 6148 -> 6148 bytes pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/panpipes/.DS_Store b/panpipes/.DS_Store index 1d92038220f8b8453a857c10e6663b21a0d435db..761faf99b038057a6308488476f323d2c15f864d 100644 GIT binary patch delta 334 zcmZoMXfc=|#>B)qu~2NHo+2ar#DLw44=^$^vQOq=Tu{%+P{feRP|T3ePzq!vgV;&s z#RW+@`AG~63@4Hbax#lc3=FO_GBLBTvaxe;aB*?*a>WK`cn3cQVcdI z7$U>L!O6)PFCboBZJ?uIY+z8Uqfl*VVglrtnw!?va&m~P8rpg$3 z7#Sfn13#38Q8R%IEM6_+4lc^e$B`mu~2NHo+2ab#DLw5tdn_|7HmGk?98(H0<#3uW_AvK4xp0F2bsS! VPv#e~g1?m;qz+5P1Lq diff --git a/pyproject.toml b/pyproject.toml index 2a38609d..e9cfed59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ dependencies = [ "paramiko", "pep8", "pysam", - "pynndescent" + "pynndescent", "pytest", "pyyaml", "ruffus", From 149ac21aff5416adfcb98d2c6fca0a76413ba4dc Mon Sep 17 00:00:00 2001 From: bio-la Date: Fri, 1 Dec 2023 10:03:32 +0100 Subject: [PATCH 6/9] cleaned install --- docs/install.md | 81 ++++++++++++++++++++++----------------------- panpipes/.DS_Store | Bin 6148 -> 6148 bytes 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/docs/install.md b/docs/install.md index fb9b7235..55f69790 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,15 +1,15 @@ # Installation of panpipes -## Step 1: create virtual environment +### Create virtual environment We recommend running panpipes within a virtual environment to maintain reproducibility ### Option 1: create conda environment (Recommended) -We create a conda environment with R and python -Panpipes has a lot of dependencies, so you may want to consider 
[`mamba`](https://mamba.readthedocs.io/en/latest/index.html) instead of `conda for installation. +To Run panpipes, we install it in a conda environment with R and python. +Panpipes has a lot of dependencies, so you may want to consider the faster [`mamba`](https://mamba.readthedocs.io/en/latest/index.html) instead of `conda` for installation. ``` #This follows the suggestions made here: [https://www.biostars.org/p/498049/](https://www.biostars.org/p/498049/) @@ -24,31 +24,15 @@ now we activate the environment ``` conda activate pipeline_env ``` -Panpipes requires the unix package `time`, in conda you can install it with: -You can check if it installed with `dpkg-query -W time`. If time not already installed, you can - -``` -conda install time -``` -or -``` -apt-get install time -``` - - - -Install specific python dependencies - -``` -conda install -c conda-forge pynndescent -``` - -Install R packages +Let's first install the R packages ``` conda install -c conda-forge r-tidyverse r-optparse r-ggforce r-ggraph r-xtable r-hdf5r r-clustree ``` +Then we can install panpipes: + +#### 1. Installing panpipes from PyPi You can install `panpipes` directly from `PyPi` with: @@ -56,15 +40,14 @@ You can install `panpipes` directly from `PyPi` with: pip install panpipes ``` -If you intend to use panpies for spatial analysis, instead install: +If you intend to use panpipes for spatial analysis, instead install: ``` pip install 'panpipes[spatial]' ``` The extra `[spatial]` includes squidpy and cell2location packages. - -#### Nightly versions of panpipes. +#### 2. Nightly versions of panpipes. If you would prefer to use the most recent dev version, install from github @@ -74,9 +57,25 @@ cd panpipes pip install -e . ``` +------------ + +Panpipes requires the unix package `time`. +You can check if it installed with `dpkg-query -W time`. 
If time is not already installed, you can + +``` +conda install time +``` +or + +``` +apt-get install time +``` + + + ### Option 2: python venv environment: -Navigate to where you want to create your virtual environment and follow the steps below to create a pip virtual environment +Navigate to where you want to create your virtual environment and follow the steps below to create a pip virtual environment ``` python3 -m venv --prompt=panpipes python3-venv-panpipes/ @@ -93,19 +92,21 @@ As explained in the conda installation, you can install `panpipes` with: ``` pip install panpipes ``` +or install a nightly version of panpipes cloning the github repo. -If you would prefer to use the most recent dev version, install from github +#### R packages installation in python venv -``` -git clone https://github.com/DendrouLab/panpipes -cd panpipes -pip install -e . -``` +If you are using a venv virtual environment, the pipeline will call a local R installation, so make sure R is installed and install the required packages with the command we provide below. +(This executable requires that you specify a CRAN mirror in your `.Rprofile`). +For example, add this line to your `.Rprofile` to automatically fetch the preferred mirror: +*remember to customise with your preferred [R mirror](https://cran.r-project.org/mirrors.html).* +``` + options(repos = c(CRAN="https://cran.uni-muenster.de/")) +``` -If you are using a venv virtual environment, the pipeline will call a local R installation, so make sure R is installed and install the required packages with the command we provide below. -(This executable requires that you specify a CRAN mirror in your `.Rprofile`) +Now, to automatically install the R dependencies, run: ``` panpipes install_r_dependencies @@ -126,13 +127,11 @@ A list of available pipelines should appear! You're all set to run `panpipes` on your local machine. 
-If you want to configure it on a HPC server, jump to [step 2](#step-2-pipeline-configuration) - - -## Step 2 pipeline configuration +If you want to configure it on an HPC server, follow the next instructions. +## Pipeline configuration for HPC clusters (For SGE or SLURM clusters) -*Note: You won't need this for a local installation of panpipes.* +*Note: You only need this configuration step if you want to use an HPC to dispatch individual tasks as separate parallel jobs. You won't need this for a local installation of panpipes.* Create a yml file for the cgat core pipeline software to read @@ -184,7 +183,7 @@ echo "export DRMAA_LIBRARY_PATH=$PATH_TO/libdrmaa.so.1.0" >> ~/.bashrc ``` ### Specifying Conda environments to run panpipes -If using conda environments, you can use one single big environment (the instructions provided do that) or create one for each of the workflows in panpipes, (i.e. one workflow = one environment) +If using conda environments, you can use one single big environment (the instructions provided do just that) or create one for each of the workflows in panpipes, (i.e. one workflow = one environment) The environment (s) should be specified in the .cgat.yml global configuration file or in each of the single workflows pipeline.yml configuration files and it will be picked up by the pipeline as the default environment. Please note that if you specify the conda environment in the workflows configuration file this will be the first choice to run the pipeline. 
diff --git a/panpipes/.DS_Store b/panpipes/.DS_Store index 761faf99b038057a6308488476f323d2c15f864d..e671240616eedab3f02b89bd64684889443947ff 100644 GIT binary patch delta 98 zcmZoMXffDuiiz>mwRN)?z<`kvLNo9~X&5zo^JL~Cmd)%OfB69kcNO*k delta 45 zcmZoMXffDuiiz>W${>`~YDx B4ygbD From 612f9a6d8f68f232218ff2edc0e6ac0365a88885 Mon Sep 17 00:00:00 2001 From: bio-la Date: Fri, 1 Dec 2023 10:06:46 +0100 Subject: [PATCH 7/9] fixed message entry --- panpipes/entry.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/panpipes/entry.py b/panpipes/entry.py index 3094631f..5d9f1982 100644 --- a/panpipes/entry.py +++ b/panpipes/entry.py @@ -36,7 +36,10 @@ def main(argv=None): '3. "integration" : integrate and batch correction using single and multimodal methods', '4. "clustering" : cell clustering on single modalities', '5. "refmap" : transfer scvi-tools models from published data to your data', - '6. "vis" : visualise metrics from other pipelines in context of experiment metadata'] + '6. "vis" : visualise metrics from other pipelines in context of experiment metadata', + '7. "qc_spatial" : for the ingestion of spatial transcriptomics (ST) data', + '8. "preprocess_spatial" : for filtering and normalizing ST data', + '9. 
"deconvolution_spatial" : for the cell type deconvolution of ST slides'] print(*pipelines_list, sep="\n") return command = argv[1] From 2842477c38b68c2197312c2447ae960cb5188acb Mon Sep 17 00:00:00 2001 From: bio-la Date: Fri, 1 Dec 2023 10:58:48 +0100 Subject: [PATCH 8/9] changed paths --- docs/release_notes.md | 1 + docs/usage/general_principles.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/release_notes.md b/docs/release_notes.md index f530ad24..9e582f02 100644 --- a/docs/release_notes.md +++ b/docs/release_notes.md @@ -1,2 +1,3 @@ Release Notes ============== + diff --git a/docs/usage/general_principles.md b/docs/usage/general_principles.md index 2521fb1d..68de7853 100644 --- a/docs/usage/general_principles.md +++ b/docs/usage/general_principles.md @@ -92,4 +92,4 @@ When it's completed, you will find a message informing you it's done, like this ## Final notes -All panpipes workflow follow these general principles, with specific custom parameters and input files for each workflow. See the [Worflows](../workflows/) section for detailed info on each workflow and check out our [Tutorials](../tutorials/) for more examples. \ No newline at end of file +All panpipes workflows follow these general principles, with specific custom parameters and input files for each workflow. See the [Workflows](https://panpipes-pipelines.readthedocs.io/en/latest/workflows/index.html) section for detailed info on each workflow and check out our [Tutorials](https://panpipes-pipelines.readthedocs.io/en/latest/tutorials/index.html) for more examples. 
\ No newline at end of file From 672ca9ef9b5b85890da2b38525be0994c73e5730 Mon Sep 17 00:00:00 2001 From: bio-la Date: Fri, 1 Dec 2023 11:00:36 +0100 Subject: [PATCH 9/9] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28505b27..6350f19e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ - fixed lsi requirement for atac - fixed top features for atac - fixed filtering HVG for rna +- moved pynndescent to PyPi dependencies ### dependencies