From c3c9332196211481b3a4d7650ae66d28f4a62673 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Thu, 9 Apr 2020 19:44:06 -0700 Subject: [PATCH 01/19] Renamed to parameters communication_score and cci_score --- cell2cell/analysis/pipelines.py | 111 ++++++++++------------ cell2cell/core/interaction_space.py | 12 +-- cell2cell/preprocessing/integrate_data.py | 44 +++------ 3 files changed, 73 insertions(+), 94 deletions(-) diff --git a/cell2cell/analysis/pipelines.py b/cell2cell/analysis/pipelines.py index 804984a..02f4be7 100644 --- a/cell2cell/analysis/pipelines.py +++ b/cell2cell/analysis/pipelines.py @@ -13,7 +13,7 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_setup, analysis_setup, excluded_cells=None, - colors=None, use_ppi_score=False, filename_suffix='',verbose=True): + colors=None, metric='bray_curtis', use_ppi_score=False, filename_suffix='',verbose=True): if excluded_cells is None: excluded_cells = [] @@ -25,8 +25,8 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_set interaction_space = ispace.InteractionSpace(rnaseq_data=rnaseq_data, ppi_data=bi_ppi_data, gene_cutoffs=cutoff_setup, - score_type=analysis_setup['score_type'], - score_metric=analysis_setup['score_metric'], + communication_score=analysis_setup['communication_score'], + cci_score=analysis_setup['cci_score'], verbose=verbose) interaction_space.compute_pairwise_interactions(use_ppi_score=use_ppi_score, @@ -62,10 +62,6 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_set use_ppi_score=use_ppi_score ) - if use_ppi_score: - metric = 'cosine' - else: - metric = 'jaccard' interaction_clustermap = plotting.clustermap_cell_pairs_vs_ppi(interaction_matrix, metric=metric, metadata=metadata, @@ -74,11 +70,8 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_set colors=colors, excluded_cells=excluded_cells, title='Active ligand-receptor pairs for interacting cells', - filename=files[ - 'output_folder'] + 'CCI-Active-LR-pairs{}.png'.format(filename_suffix), - **{'figsize': (20, 40), - 'vmin' : 0.0, - 'vmax': 1.0} + filename=files['output_folder'] + 'CCI-Active-LR-pairs{}.png'.format(filename_suffix), + **{'figsize': (20, 40)} ) interaction_clustermap.data2d.to_csv(files['output_folder'] + 'CCI-Active-LR-pairs{}.csv'.format(filename_suffix)) @@ -92,8 +85,53 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_set return outputs +def ligand_receptor_pipeline(files, rnaseq_setup, ppi_setup, meta_setup, cutoff_setup, analysis_setup, excluded_cells=None, + colors=None, use_ppi_score=False, filename_suffix='', verbose=True): + if excluded_cells is None: + excluded_cells = [] + + # Check for output directory + if not os.path.exists(files['output_folder']): + os.makedirs(files['output_folder']) + + # Load Data + rnaseq_data = read_data.load_rnaseq(rnaseq_file=files['rnaseq'], + gene_column=rnaseq_setup['gene_col'], + drop_nangenes=rnaseq_setup['drop_nangenes'], + log_transformation=rnaseq_setup['log_transform'], + format='auto', + verbose=verbose) + + ppi_data = read_data.load_ppi(ppi_file=files['ppi'], + interaction_columns=ppi_setup['protein_cols'], + rnaseq_genes=list(rnaseq_data.index), + format='auto', + verbose=verbose) + + meta = read_data.load_metadata(metadata_file=files['metadata'], + rnaseq_data=rnaseq_data, + sample_col=meta_setup['sample_col'], + format='auto', + verbose=verbose) + + # Run Analysis + outputs = core_pipeline(files=files, + rnaseq_data=rnaseq_data, + ppi_data=ppi_data, + metadata=meta, + meta_setup=meta_setup, + cutoff_setup=cutoff_setup, + analysis_setup=analysis_setup, + excluded_cells=excluded_cells, + colors=colors, + use_ppi_score=use_ppi_score, + filename_suffix=filename_suffix, + verbose=verbose) + return outputs + + def heuristic_pipeline(files, rnaseq_setup, ppi_setup, meta_setup, cutoff_setup, go_setup, analysis_setup, - contact_go_terms = None, mediator_go_terms = None, interaction_type='combined', + contact_go_terms = None, mediator_go_terms = None, interaction_type='mediated', excluded_cells=None, colors=None, use_ppi_score=False, filename_suffix='', verbose=None): ''' This function performs the analysis with the default list of GO terms to filter the proteins in the PPI network. @@ -134,7 +172,7 @@ def heuristic_pipeline(files, rnaseq_setup, ppi_setup, meta_setup, cutoff_setup, Returns ------- - subsampling_space : cell2cell.core.SubsamplingSpace + subsampling_space : cell2cell.core.SpatialCCI ppi_dict : dict @@ -221,49 +259,4 @@ def heuristic_pipeline(files, rnaseq_setup, ppi_setup, meta_setup, cutoff_setup, use_ppi_score=use_ppi_score, filename_suffix=filename_suffix, verbose=verbose) - return outputs - - -def ligand_receptor_pipeline(files, rnaseq_setup, ppi_setup, meta_setup, cutoff_setup, analysis_setup, excluded_cells=None, - colors=None, use_ppi_score=False, filename_suffix='', verbose=True): - if excluded_cells is None: - excluded_cells = [] - - # Check for output directory - if not os.path.exists(files['output_folder']): - os.makedirs(files['output_folder']) - - # Load Data - rnaseq_data = read_data.load_rnaseq(rnaseq_file=files['rnaseq'], - gene_column=rnaseq_setup['gene_col'], - drop_nangenes=rnaseq_setup['drop_nangenes'], - log_transformation=rnaseq_setup['log_transform'], - format='auto', - verbose=verbose) - - ppi_data = read_data.load_ppi(ppi_file=files['ppi'], - interaction_columns=ppi_setup['protein_cols'], - rnaseq_genes=list(rnaseq_data.index), - format='auto', - verbose=verbose) - - meta = read_data.load_metadata(metadata_file=files['metadata'], - rnaseq_data=rnaseq_data, - sample_col=meta_setup['sample_col'], - format='auto', - verbose=verbose) - - # Run Analysis - outputs = core_pipeline(files=files, - rnaseq_data=rnaseq_data, - ppi_data=ppi_data, - metadata=meta, - meta_setup=meta_setup, - cutoff_setup=cutoff_setup, - analysis_setup=analysis_setup, - excluded_cells=excluded_cells, - colors=colors, - use_ppi_score=use_ppi_score, - filename_suffix=filename_suffix, - verbose=verbose) return outputs \ No newline at end of file diff --git a/cell2cell/core/interaction_space.py b/cell2cell/core/interaction_space.py index 14a5dae..9dd28d7 100644 --- a/cell2cell/core/interaction_space.py +++ b/cell2cell/core/interaction_space.py @@ -84,12 +84,12 @@ class InteractionSpace(): ''' - def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, score_type='binary', score_metric='bray_curtis', - cci_matrix_template=None, verbose=True): + def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, communication_score='expression_thresholding', + cci_score='bray_curtis', cci_matrix_template=None, verbose=True): - self.score_type = score_type - self.score_metric = score_metric - if self.score_type == 'binary': + self.communication_score = communication_score + self.score_metric = cci_score + if self.communication_score == 'expression_thresholding': if 'type' in gene_cutoffs.keys(): cutoff_values = cutoffs.get_cutoffs(rnaseq_data=rnaseq_data, parameters=gene_cutoffs, @@ -102,7 +102,7 @@ def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, score_type='binary', sco self.ppi_data = ppi_data.copy() self.modified_rnaseq = integrate_data.get_modified_rnaseq(rnaseq_data=rnaseq_data, - score_type=self.score_type, + communication_score=self.communication_score, cutoffs=cutoff_values ) diff --git a/cell2cell/preprocessing/integrate_data.py b/cell2cell/preprocessing/integrate_data.py index 7f14e63..8953acf 100644 --- a/cell2cell/preprocessing/integrate_data.py +++ b/cell2cell/preprocessing/integrate_data.py @@ -5,19 +5,19 @@ from cell2cell.preprocessing import ppi, gene_ontology, rnaseq ## RNAseq datasets -def get_modified_rnaseq(rnaseq_data, score_type='binary', **kwargs): - if score_type == 'binary': - modified_rnaseq = get_binary_rnaseq(rnaseq_data, kwargs['cutoffs']) - elif score_type == 'absolute': +def get_modified_rnaseq(rnaseq_data, communication_score='expression_thresholding', **kwargs): + if communication_score == 'expression_thresholding': + modified_rnaseq = get_thresholded_rnaseq(rnaseq_data, kwargs['cutoffs']) + elif communication_score == 'absolute': #modified_rnaseq = rnaseq.divide_expression_by_max(rnaseq_data) modified_rnaseq = rnaseq_data.copy() else: # As other score types are implemented, other elif condition will be included here. - raise NotImplementedError("Score type {} to compute pairwise cell-interactions is not implemented".format(score_type)) + raise NotImplementedError("Score type {} to compute pairwise cell-interactions is not implemented".format(communication_score)) return modified_rnaseq -def get_binary_rnaseq(rnaseq_data, cutoffs): +def get_thresholded_rnaseq(rnaseq_data, cutoffs): binary_rnaseq_data = rnaseq_data.copy() columns = list(cutoffs.columns) if (len(columns) == 1) and ('value' in columns): @@ -51,29 +51,15 @@ def get_ppi_dict_from_proteins(ppi_data, contact_proteins, mediator_proteins=Non bidirectional=bidirectional, verbose=verbose) if mediator_proteins is not None: - ppi_dict['mediated'] = ppi.filter_ppi_network(ppi_data=ppi_data, - contact_proteins=contact_proteins, - mediator_proteins=mediator_proteins, - interaction_type='mediated', - interaction_columns=interaction_columns, - bidirectional=bidirectional, - verbose=verbose) - - ppi_dict['combined'] = ppi.filter_ppi_network(ppi_data=ppi_data, - contact_proteins=contact_proteins, - mediator_proteins=mediator_proteins, - interaction_type='combined', - interaction_columns=interaction_columns, - bidirectional=bidirectional, - verbose=verbose) - - ppi_dict['complete'] = ppi.filter_ppi_network(ppi_data=ppi_data, - contact_proteins=contact_proteins, - mediator_proteins=mediator_proteins, - interaction_type='complete', - interaction_columns=interaction_columns, - bidirectional=bidirectional, - verbose=verbose) + for interaction_type in ['mediated', 'combined', 'complete']: + ppi_dict[interaction_type] = ppi.filter_ppi_network(ppi_data=ppi_data, + contact_proteins=contact_proteins, + mediator_proteins=mediator_proteins, + interaction_type=interaction_type, + interaction_columns=interaction_columns, + bidirectional=bidirectional, + verbose=verbose) + return ppi_dict From 93e434d421e9a24d6c9220e6103d65964801ba8d Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Thu, 16 Apr 2020 13:06:30 -0700 Subject: [PATCH 02/19] Added CCI scores from matrix multiplication (all cell pairs simultaneously) --- cell2cell/core/cci_scores.py | 96 ++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 4 deletions(-) diff --git a/cell2cell/core/cci_scores.py b/cell2cell/core/cci_scores.py index a8db656..e733c88 100644 --- a/cell2cell/core/cci_scores.py +++ b/cell2cell/core/cci_scores.py @@ -3,7 +3,7 @@ from __future__ import absolute_import import numpy as np -import math + def compute_jaccard_like_cci_score(cell1, cell2, ppi_score=None): ''' @@ -87,7 +87,7 @@ def compute_braycurtis_like_cci_score(cell1, cell2, ppi_score=None): return cci_score -def compute_dot_product_score(cell1, cell2, ppi_score=None): +def compute_count_score(cell1, cell2, ppi_score=None): ''' Function that calculates an dot score for the interaction between two cells based on the interactions of their proteins with the proteins of the other cell. @@ -114,8 +114,96 @@ def compute_dot_product_score(cell1, cell2, ppi_score=None): if ppi_score is None: ppi_score = np.array([1.0] * len(c1)) - cci_score = np.nansum(c1 * c2 * ppi_score) + mult = c1 * c2 * ppi_score + cci_score = np.nansum(mult != 0) # Count all active pathways (different to zero) if cci_score is np.nan: return 0.0 - return cci_score \ No newline at end of file + return cci_score + + +def matmul_jaccard_like(A_scores, B_scores, ppi_score=None): + '''All cell-pair scores from two matrices. A_scores contains the communication scores for the partners on the left + column of the PPI network and B_score contains the same but for the partners in the right column. + + Parameters + __________ + A_scores : array + Matrix of size NxM, where N is the number of PPIs and M is the number of individual cells. + + B_scores : array + Matrix of size NxM, where N is the number of PPIs and M is the number of individual cells. + + Returns + ------- + jaccard : array + Matrix MxM, representing the CCI score for all cell pairs + ''' + if ppi_score is None: + ppi_score = np.array([1.0] * A_scores.shape[0]) + ppi_score = ppi_score.reshape((len(ppi_score), 1)) + + numerator = np.matmul(np.multiply(A_scores, ppi_score).transpose(), B_scores) + + A_module = np.sum(np.multiply(np.multiply(A_scores, A_scores), ppi_score), axis=0) + B_module = np.sum(np.multiply(np.multiply(B_scores, B_scores), ppi_score), axis=0) + denominator = A_module.reshape((A_module.shape[0], 1)) + B_module - numerator + + jaccard = np.divide(numerator, denominator) + return jaccard + + +def matmul_bray_curtis_like(A_scores, B_scores, ppi_score=None): + '''All cell-pair scores from two matrices. A_scores contains the communication scores for the partners on the left + column of the PPI network and B_score contains the same but for the partners in the right column. + + Parameters + __________ + A_scores : array + Matrix of size NxM, where N is the number of PPIs and M is the number of individual cells. + + B_scores : array + Matrix of size NxM, where N is the number of PPIs and M is the number of individual cells. + + Returns + ------- + bray_curtis : array + Matrix MxM, representing the CCI score for all cell pairs + ''' + if ppi_score is None: + ppi_score = np.array([1.0] * A_scores.shape[0]) + ppi_score = ppi_score.reshape((len(ppi_score), 1)) + + numerator = np.matmul(np.multiply(A_scores, ppi_score).transpose(), B_scores) + + A_module = np.sum(np.multiply(np.multiply(A_scores, A_scores), ppi_score), axis=0) + B_module = np.sum(np.multiply(np.multiply(B_scores, B_scores), ppi_score), axis=0) + denominator = A_module.reshape((A_module.shape[0], 1)) + B_module + + bray_curtis = np.divide(2.0*numerator, denominator) + return bray_curtis + + +def matmul_count_active(A_scores, B_scores, ppi_score=None): + '''All cell-pair scores from two matrices. A_scores contains the communication scores for the partners on the left + column of the PPI network and B_score contains the same but for the partners in the right column. + + Parameters + __________ + A_scores : array + Matrix of size NxM, where N is the number of PPIs and M is the number of individual cells. + + B_scores : array + Matrix of size NxM, where N is the number of PPIs and M is the number of individual cells. + + Returns + ------- + counts : array + Matrix MxM, representing the CCI score for all cell pairs + ''' + if ppi_score is None: + ppi_score = np.array([1.0] * A_scores.shape[0]) + ppi_score = ppi_score.reshape((len(ppi_score), 1)) + + counts = np.matmul(np.multiply(A_scores, ppi_score).transpose(), B_scores) + return counts \ No newline at end of file From e5ac1f48ddbb570ccd056fb838ace4d019ca77ff Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Thu, 16 Apr 2020 13:07:27 -0700 Subject: [PATCH 03/19] Added new methods --- cell2cell/core/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cell2cell/core/__init__.py b/cell2cell/core/__init__.py index acad71a..190f963 100644 --- a/cell2cell/core/__init__.py +++ b/cell2cell/core/__init__.py @@ -2,7 +2,8 @@ from __future__ import absolute_import -from cell2cell.core.cci_scores import (compute_braycurtis_like_cci_score, compute_dot_product_score, compute_jaccard_like_cci_score) +from cell2cell.core.cci_scores import (compute_braycurtis_like_cci_score, compute_count_score, compute_jaccard_like_cci_score, + matmul_bray_curtis_like, matmul_count_active, matmul_jaccard_like) from cell2cell.core.cell import (Cell, get_cells_from_rnaseq) from cell2cell.core.interaction_space import (generate_interaction_elements, InteractionSpace) -from cell2cell.core.subsampling import (subsampling_operation, SubsamplingSpace) \ No newline at end of file +from cell2cell.core.spatial_interactions import (SpatialCCI) \ No newline at end of file From 723750108607aac58c286837609538841a2c67c3 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Thu, 16 Apr 2020 13:07:48 -0700 Subject: [PATCH 04/19] Fixed PPI network template for cells --- cell2cell/core/cell.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cell2cell/core/cell.py b/cell2cell/core/cell.py index ad25191..60eb395 100644 --- a/cell2cell/core/cell.py +++ b/cell2cell/core/cell.py @@ -45,9 +45,7 @@ def __init__(self, sc_rnaseq_data, verbose=True): self.rnaseq_data.columns = ['value'] # Binary ppi datasets - self.weighted_ppi = pd.DataFrame(columns=['A','score']) - # 'B' removed after including bidirectional interactions to use only one column - # self.weighted_ppi = pd.DataFrame(columns=['A', 'B', 'score']) + self.weighted_ppi = pd.DataFrame(columns=['A', 'B', 'score']) # Object created if verbose: From fe6f89a2fe03d2f6e1c9caef5249fa20bbcf2382 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 17 Apr 2020 18:38:28 -0700 Subject: [PATCH 05/19] Added communication scores --- cell2cell/core/communication_scores.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 cell2cell/core/communication_scores.py diff --git a/cell2cell/core/communication_scores.py b/cell2cell/core/communication_scores.py new file mode 100644 index 0000000..ffaf4fb --- /dev/null +++ b/cell2cell/core/communication_scores.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +import numpy as np + + +def get_binary_scores(cell1, cell2, ppi_score=None): + c1 = cell1.weighted_ppi['A'].values + c2 = cell2.weighted_ppi['B'].values + + if (len(c1) == 0) or (len(c2) == 0): + return 0.0 + + if ppi_score is None: + ppi_score = np.array([1.0] * len(c1)) + + communication_scores = c1 * c2 * ppi_score + return communication_scores + + +def get_continuous_scores(cell1, cell2, ppi_score=None): + raise ValueError("Continuous communication scores not implemented yet") \ No newline at end of file From 4fe5a8b533d6f128e15661a59cfe4598e43090f6 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 17 Apr 2020 18:38:58 -0700 Subject: [PATCH 06/19] Future new features --- cell2cell/analysis/pipelines.py | 58 +++++++------ cell2cell/core/spatial_interactions.py | 20 +++++ cell2cell/core/subsampling.py | 116 ------------------------- 3 files changed, 50 insertions(+), 144 deletions(-) create mode 100644 cell2cell/core/spatial_interactions.py delete mode 100644 cell2cell/core/subsampling.py diff --git a/cell2cell/analysis/pipelines.py b/cell2cell/analysis/pipelines.py index 02f4be7..519c57a 100644 --- a/cell2cell/analysis/pipelines.py +++ b/cell2cell/analysis/pipelines.py @@ -4,7 +4,6 @@ import os -from cell2cell.analysis import ppi_enrichment from cell2cell.core import interaction_space as ispace from cell2cell.datasets import heuristic_data from cell2cell.io import read_data @@ -18,8 +17,12 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_set if excluded_cells is None: excluded_cells = [] - bi_ppi_data = ppi.bidirectional_ppi_for_cci(ppi_data=ppi_data, - verbose=verbose) + if analysis_setup['cci_type'] == 'undirected': + bi_ppi_data = ppi.bidirectional_ppi_for_cci(ppi_data=ppi_data, verbose=False) + ref_ppi = ppi_data + else: + bi_ppi_data = ppi_data.copy() + ref_ppi = None interaction_space = ispace.InteractionSpace(rnaseq_data=rnaseq_data, @@ -27,10 +30,15 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_set gene_cutoffs=cutoff_setup, communication_score=analysis_setup['communication_score'], cci_score=analysis_setup['cci_score'], + cci_type=analysis_setup['cci_type'], verbose=verbose) - interaction_space.compute_pairwise_interactions(use_ppi_score=use_ppi_score, - verbose=verbose) + interaction_space.compute_pairwise_cci_scores(use_ppi_score=use_ppi_score, + verbose=verbose) + + interaction_space.compute_pairwise_communication_scores(ref_ppi_data=ref_ppi, + use_ppi_score=use_ppi_score, + verbose=verbose) clustermap = plotting.clustermap_cci(interaction_space, method='ward', @@ -54,25 +62,17 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_set filename=files['output_folder'] + 'CCI-PCoA-CCI-scores{}.png'.format(filename_suffix), ) - interaction_matrix, std_interaction_matrix = ppi_enrichment.get_ppi_score_for_cell_pairs( - cells=list(interaction_space.distance_matrix.columns), - subsampled_interactions=[interaction_space.interaction_elements], - ppi_data=bi_ppi_data, - ref_ppi_data=ppi_data, - use_ppi_score=use_ppi_score - ) - - interaction_clustermap = plotting.clustermap_cell_pairs_vs_ppi(interaction_matrix, - metric=metric, - metadata=metadata, - sample_col=meta_setup['sample_col'], - group_col=meta_setup['group_col'], - colors=colors, - excluded_cells=excluded_cells, - title='Active ligand-receptor pairs for interacting cells', - filename=files['output_folder'] + 'CCI-Active-LR-pairs{}.png'.format(filename_suffix), - **{'figsize': (20, 40)} - ) + interaction_clustermap = plotting.clustermap_ccc(interaction_space, + metric=metric, + metadata=metadata, + sample_col=meta_setup['sample_col'], + group_col=meta_setup['group_col'], + colors=colors, + excluded_cells=excluded_cells, + title='Active ligand-receptor pairs for interacting cells', + filename=files['output_folder'] + 'CCI-Active-LR-pairs{}.png'.format(filename_suffix), + **{'figsize': (20, 40)} + ) interaction_clustermap.data2d.to_csv(files['output_folder'] + 'CCI-Active-LR-pairs{}.csv'.format(filename_suffix)) @@ -91,8 +91,9 @@ def ligand_receptor_pipeline(files, rnaseq_setup, ppi_setup, meta_setup, cutoff_ excluded_cells = [] # Check for output directory - if not os.path.exists(files['output_folder']): - os.makedirs(files['output_folder']) + if 'output_folder' in files.keys(): + if not os.path.exists(files['output_folder']): + os.makedirs(files['output_folder']) # Load Data rnaseq_data = read_data.load_rnaseq(rnaseq_file=files['rnaseq'], @@ -183,8 +184,9 @@ def heuristic_pipeline(files, rnaseq_setup, ppi_setup, meta_setup, cutoff_setup, excluded_cells = [] # Check for output directory - if not os.path.exists(files['output_folder']): - os.makedirs(files['output_folder']) + if 'output_folder' in files.keys(): + if not os.path.exists(files['output_folder']): + os.makedirs(files['output_folder']) # Load Data rnaseq_data = read_data.load_rnaseq(rnaseq_file=files['rnaseq'], diff --git a/cell2cell/core/spatial_interactions.py b/cell2cell/core/spatial_interactions.py new file mode 100644 index 0000000..3594ae7 --- /dev/null +++ b/cell2cell/core/spatial_interactions.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +import numpy as np +import pandas as pd +import random + +from cell2cell.core import InteractionSpace +from cell2cell.utils import parallel_computing +from contextlib import closing +from multiprocessing import Pool + + +class SpatialCCI: + + def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, spatial_dict, score_type='binary', score_metric='bray_curtis', + n_jobs=1, verbose=True): + pass + # TODO IMPLEMENT SPATIAL CCI \ No newline at end of file diff --git a/cell2cell/core/subsampling.py b/cell2cell/core/subsampling.py deleted file mode 100644 index f227bff..0000000 --- a/cell2cell/core/subsampling.py +++ /dev/null @@ -1,116 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import absolute_import - -import numpy as np -import pandas as pd -import random - -from cell2cell.clustering import compute_avg_cci_matrix -from cell2cell.core import InteractionSpace -from cell2cell.utils import parallel_computing -from contextlib import closing -from multiprocessing import Pool - - -class SubsamplingSpace: - - def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, score_type='binary', score_metric='bray_curtis', - subsampling_percentage=1.0, iterations=1, initial_seed=None, n_jobs=1, verbose=True): - - if verbose: - print("Computing interactions by sub-sampling a" - " {}% of cells in each of the {} iterations".format(int(100*subsampling_percentage), iterations)) - - self.cell_ids = list(rnaseq_data.columns) - - data = np.empty((len(self.cell_ids), len(self.cell_ids))) - data[:] = np.nan - self.cci_matrix_template = pd.DataFrame(data, - columns=self.cell_ids, - index=self.cell_ids) - - self.subsampled_interactions = self.subsampling_interactions(rnaseq_data=rnaseq_data, - ppi_data=ppi_data, - gene_cutoffs=gene_cutoffs, - score_type=score_type, - score_metric=score_metric, - subsampling_percentage=subsampling_percentage, - iterations=iterations, - initial_seed=initial_seed, - n_jobs=n_jobs, - verbose=verbose) - - self.average_cci_matrix = compute_avg_cci_matrix(self) - - - def subsampling_interactions(self, rnaseq_data, ppi_data, gene_cutoffs, score_type='binary', - score_metric='bray_curtis', subsampling_percentage=1.0, iterations=1, initial_seed=None, - n_jobs=1, verbose=True): - ''' - This function performs the sub-sampling method by generating a list of cells to consider in each iteration. - Then, for each list of cells a InteractionSpace is generated to perform the cell2cell analysis and return the - respective CCI Matrix and clusters. - ''' - # Last position for sub-sampling when shuffling - last_item = int(len(self.cell_ids) * subsampling_percentage) - if last_item == 0: - last_item = 1 - - inputs = {'cells' : self.cell_ids, - 'list_end' : last_item, - 'rnaseq' : rnaseq_data, - 'ppi' : ppi_data, - 'cutoffs' : gene_cutoffs, - 'score_type' : score_type, - 'score_metric' : score_metric, - 'cci_matrix' : self.cci_matrix_template, - 'verbose' :verbose - } - - inputs_list = [] - for i in range(iterations): - inputs_ = inputs.copy() - if initial_seed is not None: - inputs_['seed'] = initial_seed + i - else: - inputs_['seed'] = None - inputs_list.append(inputs_) - - # Parallel computing - if n_jobs == 1: - results = list(map(parallel_computing.parallel_subsampling_interactions, inputs_list)) - else: - agents = parallel_computing.agents_number(n_jobs) - chunksize = 1 - with closing(Pool(processes=agents)) as pool: - results = pool.map(parallel_computing.parallel_subsampling_interactions, inputs_list, chunksize) - return results - - -def subsampling_operation(cell_ids, last_item, rnaseq_data, ppi_data, gene_cutoffs, score_type='binary', - score_metric='bray_curtis', cci_matrix_template=None, seed=None, verbose=True): - ''' - Functional unit to perform parallel computing in Sub-sampling Space - ''' - cell_ids_ = cell_ids.copy() - if seed is not None: - random.seed(seed) - random.shuffle(cell_ids_) - included_cells = cell_ids_[:last_item] - - interaction_space = InteractionSpace(rnaseq_data=rnaseq_data[included_cells], - ppi_data=ppi_data, - gene_cutoffs=gene_cutoffs, - score_type=score_type, - cci_matrix_template=cci_matrix_template, - verbose=verbose) - - interaction_space.compute_pairwise_interactions(score_metric=score_metric, - verbose=verbose) - - interaction_elements = dict() - interaction_elements['cells'] = included_cells - interaction_elements['cci_matrix'] = interaction_space.interaction_elements['cci_matrix'] - interaction_elements['ppis'] = interaction_space.interaction_elements['ppis'] - return interaction_elements \ No newline at end of file From 35db56f785dbd648894cada17dfd4e87ec9354ea Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 17 Apr 2020 18:39:25 -0700 Subject: [PATCH 07/19] New format of scores for CCIs and CCC --- cell2cell/core/cci_scores.py | 33 ++++- cell2cell/core/interaction_space.py | 191 +++++++++++++++++++++------- 2 files changed, 179 insertions(+), 45 deletions(-) diff --git a/cell2cell/core/cci_scores.py b/cell2cell/core/cci_scores.py index e733c88..23227e0 100644 --- a/cell2cell/core/cci_scores.py +++ b/cell2cell/core/cci_scores.py @@ -206,4 +206,35 @@ def matmul_count_active(A_scores, B_scores, ppi_score=None): ppi_score = ppi_score.reshape((len(ppi_score), 1)) counts = np.matmul(np.multiply(A_scores, ppi_score).transpose(), B_scores) - return counts \ No newline at end of file + return counts + + +def matmul_cosine(A_scores, B_scores, ppi_score=None): + '''All cell-pair scores from two matrices. A_scores contains the communication scores for the partners on the left + column of the PPI network and B_score contains the same but for the partners in the right column. + + Parameters + __________ + A_scores : array + Matrix of size NxM, where N is the number of PPIs and M is the number of individual cells. + + B_scores : array + Matrix of size NxM, where N is the number of PPIs and M is the number of individual cells. + + Returns + ------- + counts : array + Matrix MxM, representing the CCI score for all cell pairs + ''' + if ppi_score is None: + ppi_score = np.array([1.0] * A_scores.shape[0]) + ppi_score = ppi_score.reshape((len(ppi_score), 1)) + + numerator = np.matmul(np.multiply(A_scores, ppi_score).transpose(), B_scores) + + A_module = np.sum(np.multiply(np.multiply(A_scores, A_scores), ppi_score), axis=0) ** 0.5 + B_module = np.sum(np.multiply(np.multiply(B_scores, B_scores), ppi_score), axis=0) ** 0.5 + denominator = A_module.reshape((A_module.shape[0], 1)) * B_module + + cosine = np.divide(numerator, denominator) + return cosine \ No newline at end of file diff --git a/cell2cell/core/interaction_space.py b/cell2cell/core/interaction_space.py index 9dd28d7..d87d01a 100644 --- a/cell2cell/core/interaction_space.py +++ b/cell2cell/core/interaction_space.py @@ -3,14 +3,25 @@ from __future__ import absolute_import from cell2cell.preprocessing import integrate_data, cutoffs -from cell2cell.core import cell, cci_scores +from cell2cell.core import cell, cci_scores, communication_scores import itertools import numpy as np import pandas as pd -def generate_interaction_elements(modified_rnaseq, ppi_data, cci_matrix_template=None, verbose=True): +def generate_pairs(cells, cci_type): + if cci_type == 'directed': + pairs = list(itertools.combinations(cells + cells, 2)) # Directed + elif cci_type == 'undirected': + pairs = list(itertools.combinations(cells, 2)) + [(c, c) for c in cells] # Undirected + else: + raise NotImplementedError("CCI type has to be directed or undirected") + pairs = list(set(pairs)) # Remove duplicates + return pairs + + +def generate_interaction_elements(modified_rnaseq, ppi_data, cci_type='undirected', cci_matrix_template=None, verbose=True): '''Create all elements of pairwise interactions needed to perform the analyses. All these variables are used by the class InteractionSpace. @@ -36,38 +47,32 @@ def generate_interaction_elements(modified_rnaseq, ppi_data, cci_matrix_template cell_number = len(cell_instances) # Generate pairwise interactions - # pairwise_interactions = list(itertools.combinations(cell_instances + cell_instances, 2)) - pairwise_interactions = list(itertools.combinations(cell_instances, 2)) + [(c, c) for c in cell_instances] - pairwise_interactions = list(set(pairwise_interactions)) # Remove duplicates + pairwise_interactions = generate_pairs(cell_instances, cci_type) # Interaction elements - interaction_space = {} - interaction_space['pairs'] = pairwise_interactions - interaction_space['cells'] = cell.get_cells_from_rnaseq(modified_rnaseq, verbose=verbose) + interaction_elements = {} + interaction_elements['cell_names'] = cell_instances + interaction_elements['pairs'] = pairwise_interactions + interaction_elements['cells'] = cell.get_cells_from_rnaseq(modified_rnaseq, verbose=verbose) # Cell-specific binary ppi - for cell_instance in interaction_space['cells'].values(): + interaction_elements['A_score'] = np.array([], dtype=np.int64)#.reshape(ppi_data.shape[0],0) + interaction_elements['B_score'] = np.array([], dtype=np.int64)#.reshape(ppi_data.shape[0],0) + for cell_instance in interaction_elements['cells'].values(): cell_instance.weighted_ppi = integrate_data.get_weighted_ppi(ppi_data=ppi_data, modified_rnaseq_data=cell_instance.rnaseq_data, column='value') + interaction_elements['A_score'] = np.hstack([interaction_elements['A_score'], cell_instance.weighted_ppi['A'].values]) + interaction_elements['B_score'] = np.hstack([interaction_elements['B_score'], cell_instance.weighted_ppi['B'].values]) # Cell-cell interaction matrix if cci_matrix_template is None: - interaction_space['cci_matrix'] = pd.DataFrame(np.zeros((cell_number, cell_number)), + interaction_elements['cci_matrix'] = pd.DataFrame(np.zeros((cell_number, cell_number)), columns=cell_instances, index=cell_instances) else: - interaction_space['cci_matrix'] = cci_matrix_template - - # PPIs for each cell - empty_ppi = np.empty(ppi_data.shape) - empty_ppi[:] = np.nan - interaction_space['ppis'] = dict.fromkeys(list(interaction_space['cells'].values()), - pd.DataFrame(empty_ppi, columns=['A', 'B', 'score'])) - - for name, cell_unit in interaction_space['cells'].items(): - interaction_space['ppis'][name] = cell_unit.weighted_ppi - return interaction_space + interaction_elements['cci_matrix'] = cci_matrix_template + return interaction_elements class InteractionSpace(): @@ -85,10 +90,12 @@ class InteractionSpace(): ''' def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, communication_score='expression_thresholding', - cci_score='bray_curtis', cci_matrix_template=None, verbose=True): + cci_score='bray_curtis', cci_type='undirected', cci_matrix_template=None, verbose=True): self.communication_score = communication_score - self.score_metric = cci_score + self.cci_score = cci_score + self.cci_type = cci_type + if self.communication_score == 'expression_thresholding': if 'type' in gene_cutoffs.keys(): cutoff_values = cutoffs.get_cutoffs(rnaseq_data=rnaseq_data, @@ -113,7 +120,7 @@ def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, communication_score='exp self.interaction_elements['ppi_score'] = self.ppi_data['score'].values - def pairwise_interaction(self, cell1, cell2, score_metric='bray_curtis', use_ppi_score=False, verbose=True): + def pair_cci_score(self, cell1, cell2, cci_score='bray_curtis', use_ppi_score=False, verbose=True): ''' Function that performs the interaction analysis of a pair of cells. @@ -139,22 +146,24 @@ def pairwise_interaction(self, cell1, cell2, score_metric='bray_curtis', use_ppi ''' if verbose: - print("Computing interaction between {} and {}".format(cell1.type, cell2.type)) + print("Computing interaction score between {} and {}".format(cell1.type, cell2.type)) if use_ppi_score: ppi_score = self.ppi_data['score'].values else: ppi_score = None # Calculate cell-cell interaction score - if score_metric == 'bray_curtis': - cci_score = cci_scores.compute_braycurtis_like_cci_score(cell1, cell2, ppi_score=ppi_score) - elif score_metric == 'jaccard': - cci_score = cci_scores.compute_jaccard_like_cci_score(cell1, cell2, ppi_score=ppi_score) + if cci_score == 'bray_curtis': + cci_value = cci_scores.compute_braycurtis_like_cci_score(cell1, cell2, ppi_score=ppi_score) + elif cci_score == 'jaccard': + cci_value = cci_scores.compute_jaccard_like_cci_score(cell1, cell2, ppi_score=ppi_score) + elif cci_score == 'count': + cci_value = cci_scores.compute_count_score(cell1, cell2, ppi_score=ppi_score) else: - raise NotImplementedError("Score metric {} to compute pairwise cell-interactions is not implemented".format(score_metric)) - return cci_score + raise NotImplementedError("CCI score {} to compute pairwise cell-interactions is not implemented".format(cci_score)) + return cci_value - def compute_pairwise_interactions(self, score_metric=None, use_ppi_score=False, verbose=True): + def compute_pairwise_cci_scores(self, cci_score=None, use_ppi_score=False, verbose=True): ''' Function that computes... @@ -167,26 +176,120 @@ def compute_pairwise_interactions(self, score_metric=None, use_ppi_score=False, ''' - if score_metric is None: - score_metric = self.score_metric + if cci_score is None: + cci_score = self.cci_score else: - assert isinstance(score_metric, str) + assert isinstance(cci_score, str) ### Compute pairwise physical interactions if verbose: print("Computing pairwise interactions") + # Compute pair by pair for pair in self.interaction_elements['pairs']: cell1 = self.interaction_elements['cells'][pair[0]] cell2 = self.interaction_elements['cells'][pair[1]] - cci_score = self.pairwise_interaction(cell1, - cell2, - score_metric=score_metric, - use_ppi_score=use_ppi_score, - verbose=verbose) - self.interaction_elements['cci_matrix'].loc[pair[0], pair[1]] = cci_score - self.interaction_elements['cci_matrix'].loc[pair[1], pair[0]] = cci_score + cci_value = self.pair_cci_score(cell1, + cell2, + cci_score=cci_score, + use_ppi_score=use_ppi_score, + verbose=verbose) + self.interaction_elements['cci_matrix'].loc[pair[0], pair[1]] = cci_value + if self.cci_type == 'undirected': + self.interaction_elements['cci_matrix'].loc[pair[1], pair[0]] = cci_value + + # Compute using matmul -> Too slow and uses a lot of memory TODO: Try to optimize this + # if cci_score == 'bray_curtis': + # cci_matrix = cci_scores.matmul_bray_curtis_like(self.interaction_elements['A_score'], + # self.interaction_elements['B_score']) + # self.interaction_elements['cci_matrix'] = pd.DataFrame(cci_matrix, + # index=self.interaction_elements['cell_names'], + # columns=self.interaction_elements['cell_names'] + # ) # Generate distance matrix - self.distance_matrix = self.interaction_elements['cci_matrix'].apply(lambda x: 1 - x) - np.fill_diagonal(self.distance_matrix.values, 0.0) # Make diagonal zero (delete autocrine-interactions) \ No newline at end of file + if cci_score != 'count': + self.distance_matrix = self.interaction_elements['cci_matrix'].apply(lambda x: 1 - x) + else: + self.distance_matrix = self.interaction_elements['cci_matrix'].div(self.interaction_elements['cci_matrix'].max().max()).apply(lambda x: 1 - x) + np.fill_diagonal(self.distance_matrix.values, 0.0) # Make diagonal zero (delete autocrine-interactions) + + def pair_communication_score(self, cell1, cell2, communication_score='expression_thresholding', + use_ppi_score=False, verbose=True): + # TODO: Implement communication scores + if verbose: + print("Computing communication score between {} and {}".format(cell1.type, cell2.type)) + + # Check that new score is the same type as score used to build interaction space (binary or continuous) + if (communication_score in ['expression_product', 'coexpression']) \ + & (self.communication_score in ['expression_thresholding', 'differential_combinations']): + raise ValueError('Cannot use {} for this interaction space'.format(communication_score)) + if (communication_score in ['expression_thresholding', 'differential_combinations']) \ + & (self.communication_score in ['expression_product', 'expression_correlation']): + raise ValueError('Cannot use {} for this interaction space'.format(communication_score)) + + if use_ppi_score: + ppi_score = self.ppi_data['score'].values + else: + ppi_score = None + + if (communication_score == 'expression_thresholding') | (communication_score == 'differential_combinations'): + communication_value = communication_scores.get_binary_scores(cell1, cell2, ppi_score=ppi_score) + elif (communication_score == 'expression_product') | (communication_score == 'expression_correlation'): + communication_value = communication_scores.get_continuous_scores(cell1, cell2, ppi_score=ppi_score) + else: + raise NotImplementedError( + "Communication score {} to compute pairwise cell-communication is not implemented".format(communication_score)) + return communication_value + + def compute_pairwise_communication_scores(self, communication_score=None, use_ppi_score=False, ref_ppi_data=None, + cells=None, verbose=True): + ''' + Function that computes... + + Parameters + ---------- + + + Returns + ------- + + + ''' + if communication_score is None: + communication_score = self.communication_score + else: + assert isinstance(communication_score, str) + + # Labels: + if cells is None: + cell_pairs = self.interaction_elements['pairs'] + else: + cell_pairs = generate_pairs(cells, self.cci_type) + col_labels = ['{};{}'.format(pair[0], pair[1]) for pair in cell_pairs] + if ref_ppi_data is None: + ref_index = self.ppi_data.apply(lambda row: (row[0], row[1]), axis=1) + keep_index = list(range(self.ppi_data.shape[0])) + else: + ref_index = list(ref_ppi_data.apply(lambda row: (row[0], row[1]), axis=1).values) + keep_index = list(pd.merge(self.ppi_data, ref_ppi_data, how='inner').index) + + # DataFrame to Store values + communication_matrix = pd.DataFrame(index=ref_index, columns=col_labels) + + ### Compute pairwise physical interactions + if verbose: + print("Computing pairwise communication") + + for i, pair in enumerate(cell_pairs): + cell1 = self.interaction_elements['cells'][pair[0]] + cell2 = self.interaction_elements['cells'][pair[1]] + + comm_score = self.pair_communication_score(cell1, + cell2, + communication_score=communication_score, + use_ppi_score=use_ppi_score, + verbose=verbose) + communication_matrix[col_labels[i]] = comm_score.flatten()[keep_index] + + self.communication_matrix = communication_matrix \ No newline at end of file From 77f5d5e31629086f8ce229b465f9f15ba45a6e3f Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 17 Apr 2020 18:45:06 -0700 Subject: [PATCH 08/19] Place for future features --- cell2cell/utils/parallel_computing.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/cell2cell/utils/parallel_computing.py b/cell2cell/utils/parallel_computing.py index 702fe0e..ca9497f 100644 --- a/cell2cell/utils/parallel_computing.py +++ b/cell2cell/utils/parallel_computing.py @@ -4,6 +4,7 @@ from multiprocessing import cpu_count + # GENERAL def agents_number(n_jobs): ''' @@ -23,21 +24,16 @@ def agents_number(n_jobs): agents = n_jobs return agents + # CORE FUNCTIONS -def parallel_subsampling_interactions(inputs): +def parallel_spatial_ccis(inputs): ''' - Parallel computing in cell2cell2.subsampling.SubsamplingSpace + Parallel computing in cell2cell2.spatial_interactions.SpatialCCIs ''' - from cell2cell.core import subsampling_operation - results = subsampling_operation(cell_ids=inputs['cells'], - last_item= inputs['list_end'], - rnaseq_data=inputs['rnaseq'], - ppi_data=inputs['ppi'], - gene_cutoffs=inputs['cutoffs'], - score_type=inputs['score_type'], - score_metric=inputs['score_metric'], - cci_matrix_template=inputs['cci_matrix'], - seed=inputs['seed'], - verbose=inputs['verbose']) - - return results \ No newline at end of file + # TODO: Implement this for enabling spatial analysis and compute interactions in parallel + + # from cell2cell.core import spatial_operation + #results = spatial_operation() + + # return results + pass \ No newline at end of file From 4adeb05df1706153e2a68ca7eacc75cdb80b0f5c Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 17 Apr 2020 18:45:25 -0700 Subject: [PATCH 09/19] Changes in way of subsampling --- cell2cell/stats/null_distribution.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cell2cell/stats/null_distribution.py b/cell2cell/stats/null_distribution.py index 5a9675d..ba7ae70 100644 --- a/cell2cell/stats/null_distribution.py +++ b/cell2cell/stats/null_distribution.py @@ -2,6 +2,7 @@ from __future__ import absolute_import +import random import numpy as np import pandas as pd @@ -151,7 +152,14 @@ def check_presence_in_dataframe(df, elements, columns=None): def subsample_dataframe(df, n_samples, random_state=None): - subsampled_df = df.sample(n_samples, random_state=random_state).reset_index(drop=True) + items = list(df.index) + if n_samples > len(items): + n_samples = len(items) + if isinstance(random_state, int): + random.seed(random_state) + random.shuffle(items) + + subsampled_df = df.loc[items[:n_samples],:] return subsampled_df From 7e1324a15ca088c8ec0c7725d1c75bace9dae882 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 17 Apr 2020 18:45:46 -0700 Subject: [PATCH 10/19] Adapted to new code structure --- cell2cell/utils/plotting.py | 200 +++++++++++++++++++----------------- 1 file changed, 108 insertions(+), 92 deletions(-) diff --git a/cell2cell/utils/plotting.py b/cell2cell/utils/plotting.py index 1d275ab..f37f601 100644 --- a/cell2cell/utils/plotting.py +++ b/cell2cell/utils/plotting.py @@ -29,7 +29,7 @@ def get_colors_from_labels(labels, cmap='gist_rainbow', factor=1): return colors -def clustermap_cci(interaction_space, method='ward', optimal_leaf =True, metadata=None, sample_col='#SampleID', +def clustermap_cci(interaction_space, method='ward', optimal_leaf=True, metadata=None, sample_col='#SampleID', group_col='Groups', meta_cmap='gist_rainbow', colors=None, excluded_cells=None, title='', bar_title='CCI score', cbar_fontsize='12', filename=None, **kwargs): if hasattr(interaction_space, 'distance_matrix'): @@ -50,50 +50,54 @@ def clustermap_cci(interaction_space, method='ward', optimal_leaf =True, metadat else: df = distance_matrix - # Compute linkage - D = sp.distance.squareform(df) - if 'col_cluster' in kwargs.keys(): - if kwargs['col_cluster']: - linkage = hc.linkage(D, method=method, optimal_ordering=optimal_leaf) + # Check symmetry to run linkage separately + symmetric = (df.values.transpose() == df.values).all() + if symmetric: + # TODO: implement heatmap when distance matrix is not symmetric + # Compute linkage + D = sp.distance.squareform(df) + if 'col_cluster' in kwargs.keys(): + if kwargs['col_cluster']: + linkage = hc.linkage(D, method=method, optimal_ordering=optimal_leaf) + else: + linkage = None else: - linkage = None - else: - linkage = hc.linkage(D, method=method, optimal_ordering=optimal_leaf) - - # Plot hierarchical clustering - hier = sns.clustermap(df, - col_linkage=linkage, - row_linkage=linkage, - **kwargs - ) - plt.close() - - # Triangular matrix - mask = np.zeros((df.shape[0], df.shape[1])) - order_map = dict() - if linkage is None: - ind_order = range(hier.data2d.shape[0]) - else: - ind_order = hier.dendrogram_col.reordered_ind - - for i, ind in enumerate(ind_order): - order_map[i] = ind - filter_list = [order_map[j] for j in range(i)] - mask[ind, filter_list] = 1 #hier.dendrogram_col.reordered_ind[:i] - - # This may not work if we have a perfect interaction between two cells - Intended to avoid diagonal in distance matrix - # exclude_diag = distance_matrix.values[distance_matrix.values != 0] + linkage = hc.linkage(D, method=method, optimal_ordering=optimal_leaf) - kwargs_ = kwargs.copy() + # Plot hierarchical clustering + hier = sns.clustermap(df, + col_linkage=linkage, + row_linkage=linkage, + **kwargs + ) + plt.close() - # This one does exclude only diagonal - #exclude_diag = df.values[~np.eye(df.values.shape[0], dtype=bool)].reshape(df.values.shape[0], -1) - #if 'vmin' not in list(kwargs_.keys()): - # vmin = np.nanmin(exclude_diag) - # kwargs_['vmin'] = vmin - #if 'vmax' not in list(kwargs_.keys()): - # vmax = np.nanmax(exclude_diag) - # kwargs_['vmax'] = vmax + # Triangular matrix + mask = np.zeros((df.shape[0], df.shape[1])) + order_map = dict() + if linkage is None: + ind_order = range(hier.data2d.shape[0]) + else: + ind_order = hier.dendrogram_col.reordered_ind + + for i, ind in enumerate(ind_order): + order_map[i] = ind + filter_list = [order_map[j] for j in range(i)] + mask[ind, filter_list] = 1 #hier.dendrogram_col.reordered_ind[:i] + + kwargs_ = kwargs.copy() + # This one does exclude only diagonal + #exclude_diag = df.values[~np.eye(df.values.shape[0], dtype=bool)].reshape(df.values.shape[0], -1) + #if 'vmin' not in list(kwargs_.keys()): + # vmin = np.nanmin(exclude_diag) + # kwargs_['vmin'] = vmin + #if 'vmax' not in list(kwargs_.keys()): + # vmax = np.nanmax(exclude_diag) + # kwargs_['vmax'] = vmax + else: + linkage = None + mask = None + kwargs_ = kwargs.copy() # PLOT CCI MATRIX if space_type == 'class': @@ -121,12 +125,18 @@ def clustermap_cci(interaction_space, method='ward', optimal_leaf =True, metadat col_colors.index = meta_.index col_colors.name = group_col.capitalize() + if not symmetric: + row_colors = col_colors + else: + row_colors = None + # Plot hierarchical clustering (triangular) hier = sns.clustermap(df, col_linkage=linkage, row_linkage=linkage, mask=mask, col_colors=col_colors, + row_colors=row_colors, **kwargs_ ) else: @@ -137,50 +147,57 @@ def clustermap_cci(interaction_space, method='ward', optimal_leaf =True, metadat **kwargs_ ) - hier.ax_row_dendrogram.set_visible(False) - hier.ax_heatmap.tick_params(bottom=False) # Hide xtick line - # Apply offset transform to all xticklabels. - labels_x = hier.ax_heatmap.xaxis.get_majorticklabels() - labels_y = hier.ax_heatmap.yaxis.get_majorticklabels() - - label_x1 = labels_x[0] - label_y1 = labels_y[0] - - pos_x0 = label_x1.get_transform().transform((0., 0.)) - xlims = hier.ax_heatmap.get_xlim() - xrange = abs(xlims[0] - xlims[1]) - x_cell_width = xrange / len(df.columns) - - pos_y1 = label_y1.get_transform().transform(label_y1.get_position()) - ylims = hier.ax_heatmap.get_ylim() - yrange = abs(ylims[0] - ylims[1]) - y_cell_width = yrange / len(df.index) - - dpi_x = hier.fig.dpi_scale_trans.to_values()[0] - dpi_y = hier.fig.dpi_scale_trans.to_values()[3] - - for i, label in enumerate(labels_x): - if label.get_position()[0] % x_cell_width == 0: - pos_x1 = label.get_transform().transform((aux_scaler_x, aux_scaler_x)) - scaler_x = abs(pos_x1 - pos_x0) - else: - pos_x1 = label.get_transform().transform((x_cell_width, x_cell_width)) - scaler_x = abs(pos_x1 - pos_x0) - - aux_scaler_x = label.get_position()[0] - (label.get_position()[0] // x_cell_width) * x_cell_width - aux_scaler_y = label.get_position()[1] - (label.get_position()[1] // y_cell_width) * y_cell_width - - new_pos_x = label.get_position()[0] - aux_scaler_x - new_pos_y = new_pos_x * yrange / xrange - yrange + y_cell_width - - new_pos_y = label_y1.get_transform().transform((0, new_pos_y + aux_scaler_y)) - pos_y1 - dx = -0.5 * scaler_x[0] / dpi_x - dy = new_pos_y[1] / dpi_y - offset = mlp.transforms.ScaledTranslation(dx, dy, hier.fig.dpi_scale_trans) - label.set_transform(label.get_transform() + offset) - - hier.ax_heatmap.set_xticklabels(hier.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, ha='right')#, fontsize=9.5) + if symmetric: + hier.ax_row_dendrogram.set_visible(False) + hier.ax_heatmap.tick_params(bottom=False) # Hide xtick line + + labels_x = hier.ax_heatmap.xaxis.get_majorticklabels() + labels_y = hier.ax_heatmap.yaxis.get_majorticklabels() + + label_x1 = labels_x[0] + label_y1 = labels_y[0] + + pos_x0 = label_x1.get_transform().transform((0., 0.)) + xlims = hier.ax_heatmap.get_xlim() + xrange = abs(xlims[0] - xlims[1]) + x_cell_width = xrange / len(df.columns) + + pos_y1 = label_y1.get_transform().transform(label_y1.get_position()) + ylims = hier.ax_heatmap.get_ylim() + yrange = abs(ylims[0] - ylims[1]) + y_cell_width = yrange / len(df.index) + + dpi_x = hier.fig.dpi_scale_trans.to_values()[0] + dpi_y = hier.fig.dpi_scale_trans.to_values()[3] + + for i, label in enumerate(labels_x): + if label.get_position()[0] % x_cell_width == 0: + pos_x1 = label.get_transform().transform((aux_scaler_x, aux_scaler_x)) + scaler_x = abs(pos_x1 - pos_x0) + else: + pos_x1 = label.get_transform().transform((x_cell_width, x_cell_width)) + scaler_x = abs(pos_x1 - pos_x0) + + aux_scaler_x = label.get_position()[0] - (label.get_position()[0] // x_cell_width) * x_cell_width + aux_scaler_y = label.get_position()[1] - (label.get_position()[1] // y_cell_width) * y_cell_width + + new_pos_x = label.get_position()[0] - aux_scaler_x + new_pos_y = new_pos_x * yrange / xrange - yrange + y_cell_width + + new_pos_y = label_y1.get_transform().transform((0, new_pos_y + aux_scaler_y)) - pos_y1 + dx = -0.5 * scaler_x[0] / dpi_x + dy = new_pos_y[1] / dpi_y + offset = mlp.transforms.ScaledTranslation(dx, dy, hier.fig.dpi_scale_trans) + label.set_transform(label.get_transform() + offset) + + if symmetric: + rot = 45 + ha = 'right' + else: + rot = 90 + ha = 'center' + hier.ax_heatmap.set_xticklabels(hier.ax_heatmap.xaxis.get_majorticklabels(), rotation=rot, ha=ha)#, fontsize=9.5) hier.ax_heatmap.set_yticklabels(hier.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, ha='left') # , fontsize=9.5) # Title @@ -263,7 +280,7 @@ def pcoa_biplot(interaction_space, metadata, sample_col='#SampleID', group_col=' ax.set_zticklabels([]) ax.view_init(view_angles[0], view_angles[1]) - ax.legend(loc='center left', bbox_to_anchor=(1.15, 0.5), + ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), ncol=2, fancybox=True, shadow=True, fontsize=legend_size) plt.title(title, fontsize=16) @@ -278,12 +295,11 @@ def pcoa_biplot(interaction_space, metadata, sample_col='#SampleID', group_col=' return results - -def clustermap_cell_pairs_vs_ppi(ppi_score_for_cell_pairs, metadata=None, sample_col='#SampleID', group_col='Groups', - meta_cmap='gist_rainbow', colors=None, cell_labels=('LEFT-CELL','RIGHT-CELL'), - metric='jaccard', method='ward', optimal_leaf=True, excluded_cells=None, title='', - cbar_fontsize=12, filename=None, **kwargs): - df_ = ppi_score_for_cell_pairs.copy() +def clustermap_ccc(interaction_space, metadata=None, sample_col='#SampleID', group_col='Groups', + meta_cmap='gist_rainbow', colors=None, cell_labels=('SENDER-CELL','RECEIVER-CELL'), + metric='jaccard', method='ward', optimal_leaf=True, excluded_cells=None, title='', + cbar_fontsize=12, filename=None, **kwargs): + df_ = interaction_space.communication_matrix.copy() if excluded_cells is not None: included_cells = [] From d15e8b7087426039247ba813f585f071de8d7860 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 17 Apr 2020 18:46:12 -0700 Subject: [PATCH 11/19] Adapted to new code structure --- cell2cell/utils/plotting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cell2cell/utils/plotting.py b/cell2cell/utils/plotting.py index f37f601..604064b 100644 --- a/cell2cell/utils/plotting.py +++ b/cell2cell/utils/plotting.py @@ -53,7 +53,6 @@ def clustermap_cci(interaction_space, method='ward', optimal_leaf=True, metadata # Check symmetry to run linkage separately symmetric = (df.values.transpose() == df.values).all() if symmetric: - # TODO: implement heatmap when distance matrix is not symmetric # Compute linkage D = sp.distance.squareform(df) if 'col_cluster' in kwargs.keys(): From ec4c7e6581a2dceb08e4d2d00875e7e63725dad2 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 17 Apr 2020 18:46:39 -0700 Subject: [PATCH 12/19] Minor changes --- cell2cell/preprocessing/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cell2cell/preprocessing/__init__.py b/cell2cell/preprocessing/__init__.py index 3d2445e..a04c12f 100644 --- a/cell2cell/preprocessing/__init__.py +++ b/cell2cell/preprocessing/__init__.py @@ -6,7 +6,7 @@ get_local_percentile_cutoffs) from cell2cell.preprocessing.gene_ontology import (find_all_children_of_go_term, find_go_terms_from_keyword, get_genes_from_go_hierarchy, get_genes_from_go_terms) -from cell2cell.preprocessing.integrate_data import (get_binary_rnaseq, get_modified_rnaseq, get_ppi_dict_from_go_terms, +from cell2cell.preprocessing.integrate_data import (get_thresholded_rnaseq, get_modified_rnaseq, get_ppi_dict_from_go_terms, get_ppi_dict_from_proteins, get_weighted_ppi) from cell2cell.preprocessing.ppi import (bidirectional_ppi_for_cci, filter_ppi_by_proteins, filter_ppi_network, get_all_to_all_ppi, get_filtered_ppi_network, get_one_group_to_other_ppi, From 41ee81c7bc8e9a5a340616d9fbcd678acccf4242 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 17 Apr 2020 18:47:03 -0700 Subject: [PATCH 13/19] New code structure --- cell2cell/analysis/ppi_enrichment.py | 105 --------------------------- cell2cell/core/__init__.py | 1 + cell2cell/utils/__init__.py | 4 +- cell2cell/utils/networks.py | 2 + 4 files changed, 5 insertions(+), 107 deletions(-) delete mode 100644 cell2cell/analysis/ppi_enrichment.py diff --git a/cell2cell/analysis/ppi_enrichment.py b/cell2cell/analysis/ppi_enrichment.py deleted file mode 100644 index e2a1f77..0000000 --- a/cell2cell/analysis/ppi_enrichment.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import absolute_import - -import itertools -import functools -import numpy as np -import pandas as pd - - -def ppi_count_operation(pair, subsampled_interactions, use_ppi_score=False, index=None): - # TODO make sure that index from merging ppi_data & ref_ppi_data matches after using .values[index] - cell1 = pair[0] - cell2 = pair[1] - - if use_ppi_score: - assert np.array_equal(subsampled_interactions[0]['ppis'][cell1][['score']].values, - subsampled_interactions[0]['ppis'][cell2][['score']].values) - - if index is None: - if use_ppi_score: - matrices = [np.multiply(element['ppis'][cell1][['A']].values, element['ppis'][cell2][['B']].values) \ - * element['ppis'][cell1][['score']].values for element in subsampled_interactions] - else: - matrices = [np.multiply(element['ppis'][cell1][['A']].values, element['ppis'][cell2][['B']].values) - for element in subsampled_interactions] - - else: - if use_ppi_score: - matrices = [ - np.multiply(element['ppis'][cell1][['A']].values[index], element['ppis'][cell2][['B']].values[index]) \ - * element['ppis'][cell1][['score']].values[index] for element in subsampled_interactions] - else: - matrices = [np.multiply(element['ppis'][cell1][['A']].values[index], element['ppis'][cell2][['B']].values[index]) - for element in subsampled_interactions] - matrices = np.concatenate(matrices, axis=1) - return matrices - - -def compute_avg_count_for_ppis(subsampled_interactions, clusters, ppi_data): - index = ppi_data.apply(lambda row: tuple(sorted([row[0], row[1]])), axis=1) - mean_matrix = pd.DataFrame(columns=list(clusters.keys()), - index=index) - - std_matrix = pd.DataFrame(columns=list(clusters.keys()), - index=index) - - for cluster, cells in clusters.items(): - cell_pairs = list(itertools.combinations(cells + cells, 2)) - cell_pairs = list(set(cell_pairs)) # Remove duplicates - - matrices = list(map(functools.partial(ppi_count_operation, subsampled_interactions=subsampled_interactions), - cell_pairs)) - - matrices = np.concatenate(matrices, axis=1) - mean_column = np.nanmean(matrices, axis=1) - std_column = np.nanstd(matrices, axis=1) - - mean_matrix[cluster] = mean_column - std_matrix[cluster] = std_column - - mean_matrix = mean_matrix.dropna(how='all') - mean_matrix = mean_matrix.fillna(0.0) - mean_matrix = mean_matrix.groupby(level=0).sum() / 2.0 - - std_matrix = std_matrix.dropna(how='all') - std_matrix = std_matrix.fillna(0.0) - std_matrix = std_matrix.groupby(level=0).sum() / 2.0 - return mean_matrix, std_matrix - - -def get_ppi_score_for_cell_pairs(cells, subsampled_interactions, ppi_data, use_ppi_score=False, ref_ppi_data=None): - cell_pairs = list(itertools.combinations(cells + cells, 2)) - cell_pairs = list(set(cell_pairs)) - col_labels = ['{};{}'.format(pair[0], pair[1]) for pair in cell_pairs] - if ref_ppi_data is None: - index = ppi_data.apply(lambda row: (row[0], row[1]), axis=1) - keep_index=None - mean_matrix = pd.DataFrame(index=index, columns=col_labels) - std_matrix = pd.DataFrame(index=index, columns=col_labels) - else: - ref_index = list(ref_ppi_data.apply(lambda row: (row[0], row[1]), axis=1).values) - keep_index = list(pd.merge(ppi_data, ref_ppi_data, how='inner').index) - mean_matrix = pd.DataFrame(index=ref_index, columns=col_labels) - std_matrix = pd.DataFrame(index=ref_index, columns=col_labels) - - - for i, pair in enumerate(cell_pairs): - matrices = ppi_count_operation(pair=pair, - subsampled_interactions=subsampled_interactions, - use_ppi_score=use_ppi_score, - index=keep_index) - - mean_column = np.nanmean(matrices, axis=1) - std_column = np.nanstd(matrices, axis=1) - - mean_matrix[col_labels[i]] = mean_column - std_matrix[col_labels[i]] = std_column - - mean_matrix = mean_matrix.dropna(how='all') - mean_matrix = mean_matrix.fillna(0.0) - - std_matrix = std_matrix.loc[mean_matrix.index, mean_matrix.columns] - std_matrix = std_matrix.fillna(0.0) - return mean_matrix, std_matrix \ No newline at end of file diff --git a/cell2cell/core/__init__.py b/cell2cell/core/__init__.py index 190f963..fdceb08 100644 --- a/cell2cell/core/__init__.py +++ b/cell2cell/core/__init__.py @@ -5,5 +5,6 @@ from cell2cell.core.cci_scores import (compute_braycurtis_like_cci_score, compute_count_score, compute_jaccard_like_cci_score, matmul_bray_curtis_like, matmul_count_active, matmul_jaccard_like) from cell2cell.core.cell import (Cell, get_cells_from_rnaseq) +from cell2cell.core.communication_scores import (get_binary_scores, get_continuous_scores) from cell2cell.core.interaction_space import (generate_interaction_elements, InteractionSpace) from cell2cell.core.spatial_interactions import (SpatialCCI) \ No newline at end of file diff --git a/cell2cell/utils/__init__.py b/cell2cell/utils/__init__.py index 4ba5ad4..a80c98d 100644 --- a/cell2cell/utils/__init__.py +++ b/cell2cell/utils/__init__.py @@ -3,5 +3,5 @@ from __future__ import absolute_import from cell2cell.utils.networks import (generate_network_from_adjacency, export_network_to_gephi) -from cell2cell.utils.parallel_computing import (agents_number, parallel_subsampling_interactions) -from cell2cell.utils.plotting import (clustermap_cci, clustermap_cell_pairs_vs_ppi, get_colors_from_labels, pcoa_biplot) +from cell2cell.utils.parallel_computing import (agents_number) +from cell2cell.utils.plotting import (clustermap_cci, clustermap_ccc, get_colors_from_labels, pcoa_biplot) diff --git a/cell2cell/utils/networks.py b/cell2cell/utils/networks.py index 0bdcfaa..068a0eb 100644 --- a/cell2cell/utils/networks.py +++ b/cell2cell/utils/networks.py @@ -4,6 +4,8 @@ import networkx as nx +# TODO: Add an export function for Cytoscape + def generate_network_from_adjacency(adjacency_matrix, package='networkx'): if package == 'networkx': network = nx.from_pandas_adjacency(adjacency_matrix) From 059fc352775ad8391c99c52dc7383a1fddf2208f Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Sun, 26 Apr 2020 00:57:29 -0700 Subject: [PATCH 14/19] Added functions to manipulate dataframes --- cell2cell/preprocessing/__init__.py | 2 + .../preprocessing/manipulate_dataframes.py | 51 +++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 cell2cell/preprocessing/manipulate_dataframes.py diff --git a/cell2cell/preprocessing/__init__.py b/cell2cell/preprocessing/__init__.py index a04c12f..41336c1 100644 --- a/cell2cell/preprocessing/__init__.py +++ b/cell2cell/preprocessing/__init__.py @@ -8,6 +8,8 @@ get_genes_from_go_hierarchy, get_genes_from_go_terms) from cell2cell.preprocessing.integrate_data import (get_thresholded_rnaseq, get_modified_rnaseq, get_ppi_dict_from_go_terms, get_ppi_dict_from_proteins, get_weighted_ppi) +from cell2cell.preprocessing.manipulate_dataframes import (check_presence_in_dataframe, shuffle_cols_in_df, shuffle_dataframe, + subsample_dataframe) from cell2cell.preprocessing.ppi import (bidirectional_ppi_for_cci, filter_ppi_by_proteins, filter_ppi_network, get_all_to_all_ppi, get_filtered_ppi_network, get_one_group_to_other_ppi, remove_ppi_bidirectionality, simplify_ppi) diff --git a/cell2cell/preprocessing/manipulate_dataframes.py b/cell2cell/preprocessing/manipulate_dataframes.py new file mode 100644 index 0000000..c41f0af --- /dev/null +++ b/cell2cell/preprocessing/manipulate_dataframes.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +import random +import numpy as np +import pandas as pd + + +def check_presence_in_dataframe(df, elements, columns=None): + if columns is None: + columns = list(df.columns) + df_elements = pd.Series(np.unique(df[columns].values.flatten())) + df_elements = df_elements.loc[df_elements.isin(elements)].values + return list(df_elements) + + +def shuffle_cols_in_df(df, columns, shuffling_number=1, random_state=None): + df_ = df.copy() + if isinstance(columns, str): + columns = [columns] + + for col in columns: + for i in range(shuffling_number): + if random_state is not None: + np.random.seed(random_state + i) + df_[col] = np.random.permutation(df_[col].values) + return df_ + + +def shuffle_dataframe(df, shuffling_number=1, axis=0, random_state=None): + df_ = df.copy() + axis = int(not axis) # pandas.DataFrame is always 2D + for _ in range(shuffling_number): + for i, view in enumerate(np.rollaxis(df_.values, axis)): + if random_state is not None: + np.random.seed(random_state + i) + np.random.shuffle(view) + return df_ + + +def subsample_dataframe(df, n_samples, random_state=None): + items = list(df.index) + if n_samples > len(items): + n_samples = len(items) + if isinstance(random_state, int): + random.seed(random_state) + random.shuffle(items) + + subsampled_df = df.loc[items[:n_samples],:] + return subsampled_df From 55c4d1cef647ec4aa81d123f922c3b3f4fbbd857 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Sun, 26 Apr 2020 00:57:58 -0700 Subject: [PATCH 15/19] Re-organized functions for permuting PPI lists --- cell2cell/stats/__init__.py | 4 +- cell2cell/stats/null_distribution.py | 189 --------------------------- cell2cell/stats/permutation.py | 108 +++++++++++++++ 3 files changed, 109 insertions(+), 192 deletions(-) delete mode 100644 cell2cell/stats/null_distribution.py create mode 100644 cell2cell/stats/permutation.py diff --git a/cell2cell/stats/__init__.py b/cell2cell/stats/__init__.py index 4183d6f..d840bca 100644 --- a/cell2cell/stats/__init__.py +++ b/cell2cell/stats/__init__.py @@ -3,6 +3,4 @@ from __future__ import absolute_import from cell2cell.stats.enrichment import (hypergeom_representation) -from cell2cell.stats.null_distribution import (filter_null_ppi_network, filter_random_subsample_ppi_network, - check_presence_in_dataframe, get_null_ppi_network, random_switching_ppi_labels, - shuffle_dataframe, subsample_dataframe) +from cell2cell.stats.permutation import (pvalue_from_dist, random_switching_ppi_labels) diff --git a/cell2cell/stats/null_distribution.py b/cell2cell/stats/null_distribution.py deleted file mode 100644 index ba7ae70..0000000 --- a/cell2cell/stats/null_distribution.py +++ /dev/null @@ -1,189 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import absolute_import - -import random -import numpy as np -import pandas as pd - -from sklearn.utils import resample, shuffle - -from cell2cell.preprocessing import ppi - - -### FUNCTIONS FOR GENERATING DIFFERENT KIND OF NULL DISTIBUTIONS ### -def filter_null_ppi_network(ppi_data, contact_proteins, mediator_proteins=None, interaction_type='contacts', - interaction_columns=['A', 'B'], randomized_col=0, random_state=None, verbose=True): - ''' - This function - ''' - - new_ppi_data = get_null_ppi_network(ppi_data=ppi_data, - contact_proteins=contact_proteins, - mediator_proteins=mediator_proteins, - interaction_type=interaction_type, - interaction_columns=interaction_columns, - randomized_col=randomized_col, - random_state=random_state, - verbose=verbose) - - new_ppi_data = ppi.bidirectional_ppi_for_cci(ppi_data=new_ppi_data, - interaction_columns=interaction_columns, - verbose=verbose) - return new_ppi_data - - -def filter_random_subsample_ppi_network(original_ppi_data, contact_proteins, mediator_proteins=None, - interaction_type='contacts', random_ppi_data=None, interaction_columns=['A', 'B'], - random_state=None, verbose=True): - ''' - This function randomly subsample a ppi network, if a random_ppi_data is provided, it is subsampled instead. - ''' - - new_ppi_data = ppi.get_filtered_ppi_network(ppi_data=original_ppi_data, - contact_proteins=contact_proteins, - mediator_proteins=mediator_proteins, - interaction_type=interaction_type, - interaction_columns=interaction_columns, - verbose=verbose) - - if random_ppi_data is not None: - new_random = ppi.get_filtered_ppi_network(ppi_data=random_ppi_data, - contact_proteins=contact_proteins, - mediator_proteins=mediator_proteins, - interaction_type=interaction_type, - interaction_columns=interaction_columns, - verbose=verbose) - - new_ppi_data = subsample_dataframe(df=new_random, - n_samples=new_ppi_data.shape[0], - random_state=random_state) - else: - new_ppi_data = shuffle_dataframe(df=new_ppi_data, - shuffling_number=1, - axis=0, random_state=random_state) - #new_ppi_data = subsample_dataframe(df=new_ppi_data, - # n_samples=new_ppi_data.shape[0], - # random_state=random_state) - - new_ppi_data = ppi.bidirectional_ppi_for_cci(ppi_data=new_ppi_data, - interaction_columns=interaction_columns, - verbose=verbose) - return new_ppi_data - - -def get_null_ppi_network(ppi_data, contact_proteins, mediator_proteins=None, interaction_type='contacts', - interaction_columns=['A', 'B'], randomized_col=0, random_state=None, verbose=True): - if (mediator_proteins is None) and (interaction_type != 'contacts'): - raise ValueError("mediator_proteins cannot be None when interaction_type is not contacts") - - if verbose: - print('Filtering PPI interactions by using a list of genes for {} interactions'.format(interaction_type)) - if interaction_type == 'contacts': - new_ppi_data = ppi.get_all_to_all_ppi(ppi_data=ppi_data, - proteins=contact_proteins, - interaction_columns=interaction_columns) - - in_contacts = check_presence_in_dataframe(df=new_ppi_data, - elements=contact_proteins, - columns=interaction_columns) - - new_ppi_data[interaction_columns[randomized_col]] = resample(in_contacts, - n_samples=new_ppi_data.shape[0], - random_state=random_state) - elif interaction_type == 'complete': - total_proteins = list(set(contact_proteins + mediator_proteins)) - - new_ppi_data = ppi.get_all_to_all_ppi(ppi_data=ppi_data, - proteins=total_proteins, - interaction_columns=interaction_columns) - - in_total = check_presence_in_dataframe(df=new_ppi_data, - elements=total_proteins, - columns=interaction_columns) - - new_ppi_data[interaction_columns[randomized_col]] = resample(in_total, - n_samples=new_ppi_data.shape[0], - random_state=random_state) - - else: - # All the following interactions incorporate contacts-mediator interactions - mediated = ppi.get_one_group_to_other_ppi(ppi_data=ppi_data, - proteins_a=contact_proteins, - proteins_b=mediator_proteins, - interaction_columns=interaction_columns) - - in_mediated = check_presence_in_dataframe(df=mediated, - elements=mediator_proteins, - columns=interaction_columns) - - mediated[interaction_columns[randomized_col]] = resample(in_mediated, - n_samples=mediated.shape[0], - random_state=random_state) - - if interaction_type == 'mediated': - new_ppi_data = mediated - elif interaction_type == 'combined': - contacts = ppi.get_all_to_all_ppi(ppi_data=ppi_data, - proteins=contact_proteins, - interaction_columns=interaction_columns) - - in_contacts = check_presence_in_dataframe(df=contacts, - elements=contact_proteins, - columns=interaction_columns) - - contacts[interaction_columns[randomized_col]] = resample(in_contacts, - n_samples=contacts.shape[0], - random_state=random_state) - - new_ppi_data = pd.concat([contacts, mediated], ignore_index=True).drop_duplicates() - else: - raise NameError('Not valid interaction type to filter the PPI network') - new_ppi_data.reset_index(inplace=True, drop=True) - return new_ppi_data - - -def check_presence_in_dataframe(df, elements, columns=None): - if columns is None: - columns = list(df.columns) - df_elements = pd.Series(np.unique(df[columns].values.flatten())) - df_elements = df_elements.loc[df_elements.isin(elements)].values - return list(df_elements) - - -def subsample_dataframe(df, n_samples, random_state=None): - items = list(df.index) - if n_samples > len(items): - n_samples = len(items) - if isinstance(random_state, int): - random.seed(random_state) - random.shuffle(items) - - subsampled_df = df.loc[items[:n_samples],:] - return subsampled_df - - -def random_switching_ppi_labels(interaction_type, ppi_dict, genes=None, random_state=None, interaction_columns=['A', 'B']): - ppi_dict_ = dict() - ppi_dict_[interaction_type] = ppi_dict[interaction_type].copy() - if genes is None: - genes = list(np.unique(ppi_dict_[interaction_type][interaction_columns].values.flatten())) - else: - genes = list(set(genes)) - mapper = dict(zip(genes, shuffle(genes, random_state=random_state))) - A = interaction_columns[0] - B = interaction_columns[1] - ppi_dict_[interaction_type][A] = ppi_dict_[interaction_type][A].apply(lambda x: mapper[x]) - ppi_dict_[interaction_type][B] = ppi_dict_[interaction_type][B].apply(lambda x: mapper[x]) - return mapper, ppi_dict_ - - -def shuffle_dataframe(df, shuffling_number=1, axis=0, random_state=None): - df_ = df.copy() - axis = int(not axis) # pandas.DataFrame is always 2D - for _ in range(shuffling_number): - for i, view in enumerate(np.rollaxis(df_.values, axis)): - if random_state is not None: - np.random.seed(random_state + i) - np.random.shuffle(view) - return df_ \ No newline at end of file diff --git a/cell2cell/stats/permutation.py b/cell2cell/stats/permutation.py new file mode 100644 index 0000000..e44aab6 --- /dev/null +++ b/cell2cell/stats/permutation.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +import scipy +import numpy as np + +import matplotlib.pyplot as plt +import seaborn as sns + +from sklearn.utils import shuffle + + +def pvalue_from_dist(obs_value, dist, label='', comparison='upper'): + ''' + This function computes a p-value for an observed value given a simulated or empirical distribution. + It plots the distribution and prints the p-value. + + Parameters + ---------- + obs_value : float + Observed value used to get a p-value from a distribution. + + dist : array-like + Simulated/Empirical distribution used to compare the observed value and get a p-value + + label : str, '' by default + Label used for the histogram plot. + + comparison : str, 'upper' by default + Type of comparison used to get the p-value. It could be 'lower', 'upper' or 'different'. + Lower and upper correspond to one-tailed analysis, while different is for a two-tailed analysis. + + Returns + ------- + fig : matplotlib.pyplot.figure + Figure that shows the histogram for dist. + + pval : float + P-value obtained from both observed value and pre-computed distribution. + ''' + if comparison == 'lower': + pval = scipy.stats.percentileofscore(dist, obs_value) / 100.0 + elif comparison == 'upper': + pval = 1.0 - scipy.stats.percentileofscore(dist, obs_value) / 100.0 + elif comparison == 'different': + percentile = scipy.stats.percentileofscore(dist, obs_value) / 100.0 + if percentile <= 0.5: + pval = 2.0 * percentile + else: + pval = 2.0 * (1.0 - percentile) + else: + raise NotImplementedError('Comparison {} is not implemented'.format(comparison)) + + print('P-value is: {}'.format(pval)) + + with sns.axes_style("darkgrid"): + if label == '': + if pval > 1. - 1. / len(dist): + label_ = 'p-val: >{:g}'.format(float('{:.1g}'.format(1. - 1. / len(dist)))) + elif pval < 1. / len(dist): + label_ = 'p-val: <{:g}'.format(float('{:.1g}'.format(1. / len(dist)))) + else: + label_ = 'p-val: {0:.2E}'.format(pval) + else: + if pval > 1. - 1. / len(dist): + label_ = label + ' - p-val: >{:g}'.format(float('{:.1g}'.format(1. - 1. / len(dist)))) + elif pval < 1. / len(dist): + label_ = label + ' - p-val: <{:g}'.format(float('{:.1g}'.format(1. / len(dist)))) + else: + label_ = label + ' - p-val: {0:.2E}'.format(pval) + fig = sns.distplot(dist, hist=True, kde=True, norm_hist=False, rug=False, label=label_) + fig.axvline(x=obs_value, color=fig.get_lines()[-1].get_c(), ls='--') + + fig.tick_params(axis='both', which='major', labelsize=16) + lgd = plt.legend(loc='center left', bbox_to_anchor=(1.01, 0.5), + ncol=1, fancybox=True, shadow=True, fontsize=14) + return fig, pval + + +def random_switching_ppi_labels(ppi_data, genes=None, random_state=None, interaction_columns=['A', 'B'], column='both'): + ppi_data_ = ppi_data.copy() + A = interaction_columns[0] + B = interaction_columns[1] + if column == 'both': + if genes is None: + genes = list(np.unique(ppi_data_[interaction_columns].values.flatten())) + else: + genes = list(set(genes)) + mapper = dict(zip(genes, shuffle(genes, random_state=random_state))) + ppi_data_[A] = ppi_data_[A].apply(lambda x: mapper[x]) + ppi_data_[B] = ppi_data_[B].apply(lambda x: mapper[x]) + elif column == 'first': + if genes is None: + genes = list(np.unique(ppi_data_[A].values.flatten())) + else: + genes = list(set(genes)) + mapper = dict(zip(genes, shuffle(genes, random_state=random_state))) + ppi_data_[A] = ppi_data_[A].apply(lambda x: mapper[x]) + elif column == 'second': + if genes is None: + genes = list(np.unique(ppi_data_[B].values.flatten())) + else: + genes = list(set(genes)) + mapper = dict(zip(genes, shuffle(genes, random_state=random_state))) + ppi_data_[B] = ppi_data_[B].apply(lambda x: mapper[x]) + else: raise ValueError('Not valid option') + return ppi_data_ \ No newline at end of file From 0a5a5f06912d10e73d7957e35a5f0788cf091a68 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Sun, 26 Apr 2020 00:58:22 -0700 Subject: [PATCH 16/19] Minor changes --- cell2cell/utils/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cell2cell/utils/plotting.py b/cell2cell/utils/plotting.py index 604064b..8e4bef6 100644 --- a/cell2cell/utils/plotting.py +++ b/cell2cell/utils/plotting.py @@ -298,7 +298,7 @@ def clustermap_ccc(interaction_space, metadata=None, sample_col='#SampleID', gro meta_cmap='gist_rainbow', colors=None, cell_labels=('SENDER-CELL','RECEIVER-CELL'), metric='jaccard', method='ward', optimal_leaf=True, excluded_cells=None, title='', cbar_fontsize=12, filename=None, **kwargs): - df_ = interaction_space.communication_matrix.copy() + df_ = interaction_space.interaction_elements['communication_matrix'].copy() if excluded_cells is not None: included_cells = [] From ddcf648f9e78b2121c35c744b8214083e7e8f718 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Sun, 26 Apr 2020 00:58:35 -0700 Subject: [PATCH 17/19] Minor changes --- cell2cell/core/interaction_space.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cell2cell/core/interaction_space.py b/cell2cell/core/interaction_space.py index d87d01a..51792c5 100644 --- a/cell2cell/core/interaction_space.py +++ b/cell2cell/core/interaction_space.py @@ -243,7 +243,7 @@ def pair_communication_score(self, cell1, cell2, communication_score='expression return communication_value def compute_pairwise_communication_scores(self, communication_score=None, use_ppi_score=False, ref_ppi_data=None, - cells=None, verbose=True): + cells=None, cci_type=None, verbose=True): ''' Function that computes... @@ -261,12 +261,20 @@ def compute_pairwise_communication_scores(self, communication_score=None, use_pp else: assert isinstance(communication_score, str) - # Labels: + # Cells to consider if cells is None: + cells = self.interaction_elements['cell_names'] + + # Labels: + if cci_type is None: cell_pairs = self.interaction_elements['pairs'] + elif cci_type != self.cci_type: + cell_pairs = generate_pairs(cells, cci_type) else: cell_pairs = generate_pairs(cells, self.cci_type) col_labels = ['{};{}'.format(pair[0], pair[1]) for pair in cell_pairs] + + # Ref PPI data if ref_ppi_data is None: ref_index = self.ppi_data.apply(lambda row: (row[0], row[1]), axis=1) keep_index = list(range(self.ppi_data.shape[0])) @@ -292,4 +300,4 @@ def compute_pairwise_communication_scores(self, communication_score=None, use_pp verbose=verbose) communication_matrix[col_labels[i]] = comm_score.flatten()[keep_index] - self.communication_matrix = communication_matrix \ No newline at end of file + self.interaction_elements['communication_matrix'] = communication_matrix \ No newline at end of file From 54b1c8946d2de6bc131aec6ac9747a6941ae8831 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Sun, 26 Apr 2020 00:59:28 -0700 Subject: [PATCH 18/19] Moved functions to study PPI enrichment into Communication Scores in InteractionSpace --- cell2cell/analysis/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cell2cell/analysis/__init__.py b/cell2cell/analysis/__init__.py index fd27f7b..b6c3f85 100644 --- a/cell2cell/analysis/__init__.py +++ b/cell2cell/analysis/__init__.py @@ -3,6 +3,5 @@ from __future__ import absolute_import from cell2cell.analysis.pipelines import (core_pipeline, heuristic_pipeline, ligand_receptor_pipeline) -from cell2cell.analysis.ppi_enrichment import (compute_avg_count_for_ppis, ppi_count_operation, get_ppi_score_for_cell_pairs) # from cell2cell.analysis.func_enrichment import * From 8a209640a1d261590055d9acfb4c7f799169c415 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Sun, 26 Apr 2020 01:00:16 -0700 Subject: [PATCH 19/19] Run PCoA only if CCI matrix is symetric ('undirected' interactions) --- cell2cell/analysis/pipelines.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/cell2cell/analysis/pipelines.py b/cell2cell/analysis/pipelines.py index 519c57a..d950571 100644 --- a/cell2cell/analysis/pipelines.py +++ b/cell2cell/analysis/pipelines.py @@ -52,15 +52,19 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_set **{'cmap': 'Blues'} ) - pcoa = plotting.pcoa_biplot(interaction_space, - excluded_cells=excluded_cells, - metadata=metadata, - sample_col=meta_setup['sample_col'], - group_col=meta_setup['group_col'], - colors=colors, - title='PCoA for cells given their CCI scores', - filename=files['output_folder'] + 'CCI-PCoA-CCI-scores{}.png'.format(filename_suffix), - ) + # Run PCoA only if CCI matrix is symmetric + pcoa_state = False + if (interaction_space.interaction_elements['cci_matrix'].values.transpose() == interaction_space.interaction_elements['cci_matrix'].values).all(): + pcoa = plotting.pcoa_biplot(interaction_space, + excluded_cells=excluded_cells, + metadata=metadata, + sample_col=meta_setup['sample_col'], + group_col=meta_setup['group_col'], + colors=colors, + title='PCoA for cells given their CCI scores', + filename=files['output_folder'] + 'CCI-PCoA-CCI-scores{}.png'.format(filename_suffix), + ) + pcoa_state = True interaction_clustermap = plotting.clustermap_ccc(interaction_space, metric=metric, @@ -79,7 +83,8 @@ def core_pipeline(files, rnaseq_data, ppi_data, metadata, meta_setup, cutoff_set outputs = dict() outputs['interaction_space'] = interaction_space outputs['clustermap'] = clustermap - outputs['pcoa'] = pcoa + if pcoa_state: + outputs['pcoa'] = pcoa outputs['interaction_clustermap'] = interaction_clustermap outputs['LR-pairs'] = interaction_clustermap.data2d return outputs