diff --git a/panpipes/pipeline_integration.py b/panpipes/pipeline_integration.py index 571c7ca7..a989f821 100644 --- a/panpipes/pipeline_integration.py +++ b/panpipes/pipeline_integration.py @@ -160,6 +160,13 @@ def run_harmony(outfile): --modality rna """ # cannot use the normal method for importing params from yaml, because it only works up to depth 2 + harmony_params = PARAMS['rna']['harmony'] + if harmony_params['npcs'] is not None: + cmd += " --harmony_npcs %s" % harmony_params['npcs'] + if harmony_params['sigma'] is not None: + cmd += " --sigma_val %s" % harmony_params['sigma'] + if harmony_params['theta'] is not None: + cmd += " --theta_val %s" % harmony_params['theta'] neighbor_params = PARAMS['rna']['neighbors'] if neighbor_params['method'] is not None: cmd += " --neighbors_method %s" % neighbor_params['method'] @@ -321,6 +328,13 @@ def run_harmony_prot( outfile): --n_threads %(resources_threads_high)s """ cmd += " --modality prot" + harmony_params = PARAMS['prot']['harmony'] + if harmony_params['npcs'] is not None: + cmd += " --harmony_npcs %s" % harmony_params['npcs'] + if harmony_params['sigma'] is not None: + cmd += " --sigma_val %s" % harmony_params['sigma'] + if harmony_params['theta'] is not None: + cmd += " --theta_val %s" % harmony_params['theta'] neighbor_params = PARAMS['prot']['neighbors'] if neighbor_params['method'] is not None: cmd += " --neighbors_method %s" % neighbor_params['method'] @@ -416,9 +430,14 @@ def run_harmony_atac( outfile): --n_threads %(resources_threads_high)s --modality atac """ - - if PARAMS['atac_sigma'] is not None: - cmd += " --sigma_val %(atac_sigma)s" + + harmony_params = PARAMS['atac']['harmony'] + if harmony_params['npcs'] is not None: + cmd += " --harmony_npcs %s" % harmony_params['npcs'] + if harmony_params['sigma'] is not None: + cmd += " --sigma_val %s" % harmony_params['sigma'] + if harmony_params['theta'] is not None: + cmd += " --theta_val %s" % harmony_params['theta'] neighbor_params = PARAMS['atac']['neighbors'] if neighbor_params['method'] is not None: cmd += " --neighbors_method %s" % neighbor_params['method'] diff --git a/panpipes/pipeline_integration/pipeline.yml b/panpipes/pipeline_integration/pipeline.yml index 95f4b338..5f9fb2cb 100644 --- a/panpipes/pipeline_integration/pipeline.yml +++ b/panpipes/pipeline_integration/pipeline.yml @@ -54,9 +54,13 @@ rna: #----------------------------- # Harmony args #----------------------------- + harmony: # sigma value, used by Harmony - sigma: 0.1 - harmony_npcs: 30 + sigma: 0.1 + # theta value used by Harmony, default is 1 + theta: 1.0 + # number of pcs, used by Harmony + npcs: 30 #----------------------------- # SCVI args #----------------------------- @@ -64,8 +68,8 @@ rna: exclude_mt_genes: True mt_column: mt model_args: - nlayers: - nlatent: + n_layers: + n_latent: gene_likelihood: zinb training_args: max_epochs: 400 @@ -87,6 +91,7 @@ rna: # number of Principal Components to calculate for neighbours and umap: # -if no correction is applied, PCA will be calculated and used to run UMAP and clustering on # -if Harmony is the method of choice, it will use these components to create a corrected dim red.) + # the maximum number of dims for neighbors calculation can only only be lower or equal to the total number of dims for PCA or Harmony # note: scvelo default is 30 npcs: 30 # number of neighbours @@ -111,9 +116,13 @@ prot: #---------------------------- # Harmony args #----------------------------- + harmony: # sigma value, used by Harmony - sigma: 0.1 #prot_sigma - harmony_npcs: 30 + sigma: 0.1 + # theta value used by Harmony, default is 1 + theta: 1.0 + # number of pcs, used by Harmony + npcs: 30 #----------------------------› # find neighbour parameters #----------------------------- @@ -144,9 +153,13 @@ atac: #---------------------------- # Harmony args #----------------------------- + harmony: # sigma value, used by Harmony - sigma: 0.1 #prot_sigma - harmony_npcs: 30 + sigma: 0.1 + # theta value used by Harmony, default is 1 + theta: 1.0 + # number of pcs, used by Harmony + npcs: 30 #---------------------------- # find neighbour parameters #----------------------------- diff --git a/panpipes/pipeline_qc_mm.py b/panpipes/pipeline_qc_mm.py index ae463972..83aa7c9d 100644 --- a/panpipes/pipeline_qc_mm.py +++ b/panpipes/pipeline_qc_mm.py @@ -285,9 +285,9 @@ def concat_bg_mudatas(infiles, outfile): """ if PARAMS['metadatacols'] is not None and PARAMS['metadatacols'] != "": cmd += " --metadatacols %(metadatacols)s" - if PARAMS["barcode_mtd_include"] is True: - cmd += " --barcode_mtd_df %(barcode_mtd_path)s" - cmd += " --barcode_mtd_metadatacols %(barcode_mtd_metadatacols)s" + # if PARAMS["barcode_mtd_include"] is True: + # cmd += " --barcode_mtd_df %(barcode_mtd_path)s" + # cmd += " --barcode_mtd_metadatacols %(barcode_mtd_metadatacols)s" cmd += " > logs/concat_bg_mudatas.log" job_kwargs["job_threads"] = PARAMS['resources_threads_high'] P.run(cmd, **job_kwargs) diff --git a/python/batch_correct_harmony.py b/python/batch_correct_harmony.py index 12e79fb1..019b9ee8 100644 --- a/python/batch_correct_harmony.py +++ b/python/batch_correct_harmony.py @@ -35,9 +35,13 @@ help='') parser.add_argument('--n_threads', default=1, help="num threads to use for neighbor computations") +parser.add_argument('--harmony_npcs', default=30, + help="npcs for running harmony") parser.add_argument('--sigma_val', default=0.1, help="sigma") -parser.add_argument('--neighbors_n_pcs', +parser.add_argument('--theta_val', default=1.0, + help="theta") +parser.add_argument('--neighbors_n_pcs', default=30, help="n_pcs") parser.add_argument('--neighbors_method', help="neighbours method, scanpy or hnsw") @@ -73,18 +77,19 @@ adata.obs["comb_columns"] = adata.obs["comb_columns"].astype("category") # run harmony - ho = hm.run_harmony(adata.obsm['X_pca'][:,0:int(args.neighbors_n_pcs)], adata.obs, ["comb_columns"], - sigma = float(args.sigma_val), verbose=True,max_iter_kmeans=30, + ho = hm.run_harmony(adata.obsm['X_pca'][:,0:int(args.harmony_npcs)], adata.obs, ["comb_columns"], + sigma = float(args.sigma_val),theta = float(args.theta_val),verbose=True,max_iter_kmeans=30, max_iter_harmony=40) else: # make sure that batch is a categorical adata.obs[args.integration_col] = adata.obs[args.integration_col].astype("category") # run harmony - ho = hm.run_harmony(adata.obsm['X_pca'][:,0:int(args.neighbors_n_pcs)], + ho = hm.run_harmony(adata.obsm['X_pca'][:,0:int(args.harmony_npcs)], adata.obs, [args.integration_col], sigma=float(args.sigma_val), + theta = float(args.theta_val), verbose=True,max_iter_kmeans=30, max_iter_harmony=40) diff --git a/python/make_adata_from_csv.py b/python/make_adata_from_csv.py index d75c8d13..3d963547 100644 --- a/python/make_adata_from_csv.py +++ b/python/make_adata_from_csv.py @@ -144,7 +144,7 @@ # create new modality for hashing mdata.mod["hashing_ab"]=mdata["prot"][:, mdata["prot"].var["hashing_ab"]] # subset old modality to remove hashing - mdata["prot"][:, ~mdata["prot"].var["hashing_ab"]] + mdata.mod['prot'] = mdata["prot"][:, ~mdata["prot"].var["hashing_ab"]] except FileNotFoundError: warnings.warn("protein metadata table not found") mdata.update() diff --git a/python/run_scanpyQC_rep.py b/python/run_scanpyQC_rep.py new file mode 100644 index 00000000..873bc62d --- /dev/null +++ b/python/run_scanpyQC_rep.py @@ -0,0 +1,140 @@ +''' +scanpy QC script GEX +order of QC: +- GEX +- ADT +- Repertoire +- ATAC +''' +import sys +import logging +L = logging.getLogger() +L.setLevel(logging.INFO) +log_handler = logging.StreamHandler(sys.stdout) +formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s') +log_handler.setFormatter(formatter) +L.addHandler(log_handler) + +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +import argparse +import pandas as pd +import muon as mu +import scirpy as ir +import matplotlib.pyplot as plt +plt.ioff() +from panpipes.funcs.io import write_obs +import yaml + +parser = argparse.ArgumentParser() +# required option +parser.add_argument("--sampleprefix", + default="", + help="prefix to prepend when saving the metadata file") +parser.add_argument("--input_mudata", + default="mdata_unfilt.h5mu", + help="") +parser.add_argument("--output_mudata", + default="mdata_unfilt.h5mu", + help="") +parser.add_argument("--figdir", + default="./figures/", + help="path to save the figures to") + +parser.add_argument("--distance_metrics", + default=None, type=yaml.safe_load, + help="what metrics to calculate sequence distance metric? \ + any arguments from scirpy.pp.ir_dist as a dict in format'{arg: value}' ") +parser.add_argument("--clonotype_metrics", + default=None,type=yaml.safe_load, + help="what metrics to calculate sequence distance metric? \ + any arguments from scirpy.pp.define_clonotypes as a dict in format'{arg: value}' ") +args, opt = parser.parse_known_args() +L.info(args) +mdata = mu.read(args.input_mudata) +rep = mdata['rep'] +# chain qc +ir.tl.chain_qc(rep) + +# remove nones, so defaults are used +if args.distance_metrics is not None: + # remove nones, so defaults are used + dist_args = {k: v for k, v in args.distance_metrics.items() if v != "None"} + dist_args = {k: v for k, v in dist_args.items() if v} +else: + dist_args = {} + +L.info("distance args:") +L.info(dist_args) + +ir.pp.ir_dist(rep, **dist_args) + +# pull function arguments from args. +if args.clonotype_metrics is not None: + # remove nones, so defaults are used + clonotype_args = {k: v for k, v in args.clonotype_metrics.items() if v != "None"} + clonotype_args = {k: v for k, v in clonotype_args.items() if v} +else: + clonotype_args={} +# define clonotypes +L.info("clonotypes args:") +L.info(clonotype_args) + +ir.tl.define_clonotypes(rep, **clonotype_args) +ir.tl.clonal_expansion(rep) + + +category_cols = ['has_ir'] +for cc in category_cols: + if pd.api.types.infer_dtype(rep.obs[cc]) != "categorical": + rep.obs[cc] = rep.obs[cc].astype('string').astype('category') + + +if "BCR" in rep.obs['receptor_type'].values: + bcr = rep[rep.obs.receptor_type =="BCR", :].copy() +else: + bcr=None + +if "TCR" in rep.obs.receptor_type.values: + tcr = rep[rep.obs.receptor_type =="TCR", :].copy() +else: + tcr=None + +# plot group abundances +fig, ax = plt.subplots() +ax = ir.pl.group_abundance(rep, groupby="receptor_type", ax=ax) +fig.savefig(args.figdir +"/barplot_group_abundance_receptor_type.png", bbox_inches="tight") + +if bcr is not None: + fig, ax = plt.subplots() + ax = ir.pl.group_abundance(bcr, groupby="chain_pairing", ax=ax) + fig.tight_layout() + fig.savefig(args.figdir +"/barplot_group_abundance_bcr_receptor_subtype.png", bbox_inches="tight") + +if tcr is not None: + fig, ax = plt.subplots() + ax = ir.pl.group_abundance(tcr, groupby="chain_pairing", ax=ax) + fig.tight_layout() + fig.savefig(args.figdir +"/barplot_group_abundance_tcr_receptor_subtype.png", bbox_inches="tight") + +# clonal expansion + +clone_counts = rep.obs.groupby("receptor_type").clone_id.value_counts().to_frame("cell_counts").reset_index() + +if bcr is not None: + ax = ir.pl.clonal_expansion(bcr, groupby="sample_id") + ax.set_title("bcr clonal expansion") + plt.savefig(args.figdir +"/barplot_clonal_expansion_bcr.png", bbox_inches="tight" ) + +if tcr is not None: + ax = ir.pl.clonal_expansion(tcr, groupby="sample_id") + ax.set_title("tcr clonal expansion") + plt.savefig(args.figdir +"/barplot_clonal_expansion_tcr.png", bbox_inches="tight") + + +L.info("saving anndata and obs in a metadata tsv file") +mdata.update() +write_obs(mdata, output_prefix=args.sampleprefix, + output_suffix="_cell_metadata.tsv") +mdata.write(args.output_mudata) \ No newline at end of file