diff --git a/README.md b/README.md index bb2217e..f63d9f6 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ associated with Memory. This may happen when the tensor is big enough to make th *bioRxiv*, (2020). **DOI: 10.1101/2020.11.22.392217** -- **Tensor-cell2cell** should be cited using this pre-print in bioRxiv: +- **Tensor-cell2cell** should be cited using this research article: - Armingol E., Baghdassarian H., Martino C., Perez-Lopez A., Aamodt C., Knight R., Lewis N.E. - [Context-aware deconvolution of cell-cell communication with Tensor-cell2cell](https://doi.org/10.1101/2021.09.20.461129) - *bioRxiv*, (2021). **DOI: 10.1101/2021.09.20.461129** \ No newline at end of file + [Context-aware deconvolution of cell-cell communication with Tensor-cell2cell](https://doi.org/10.1038/s41467-022-31369-2) + *Nat. Commun.* **13**, 3665 (2022). **DOI: 10.1038/s41467-022-31369-2** \ No newline at end of file diff --git a/cell2cell/__init__.py b/cell2cell/__init__.py index 5d832b9..6315850 100644 --- a/cell2cell/__init__.py +++ b/cell2cell/__init__.py @@ -14,4 +14,4 @@ from cell2cell import tensor from cell2cell import utils -__version__ = "0.5.10" \ No newline at end of file +__version__ = "0.5.11" \ No newline at end of file diff --git a/cell2cell/analysis/pipelines.py b/cell2cell/analysis/pipelines.py index f73f1ae..ba3aa94 100644 --- a/cell2cell/analysis/pipelines.py +++ b/cell2cell/analysis/pipelines.py @@ -87,6 +87,13 @@ class BulkInteractions: For example, '&' is the complex_sep for a list of ligand-receptor pairs where a protein partner could be "CD74&CD44". + complex_agg_method : str, default='min' + Method to aggregate the expression value of multiple genes in a + complex. + + - 'min' : Minimum expression value among all genes. + - 'mean' : Average expression value among all genes. + verbose : boolean, default=False Whether printing or not steps of the analysis. @@ -115,6 +122,14 @@ class BulkInteractions: For example, '&' is the complex_sep for a list of ligand-receptor pairs where a protein partner could be "CD74&CD44". + + complex_agg_method : str + Method to aggregate the expression value of multiple genes in a + complex. + + - 'min' : Minimum expression value among all genes. + - 'mean' : Average expression value among all genes. + ref_ppi : pandas.DataFrame Reference list of protein-protein interactions (or ligand-receptor pairs) used for inferring the cell-cell interactions and communication. It could be the @@ -193,7 +208,8 @@ class BulkInteractions: ''' def __init__(self, rnaseq_data, ppi_data, metadata=None, interaction_columns=('A', 'B'), communication_score='expression_thresholding', cci_score='bray_curtis', cci_type='undirected', - sample_col='sampleID', group_col='tissue', expression_threshold=10, complex_sep=None, verbose=False): + sample_col='sampleID', group_col='tissue', expression_threshold=10, complex_sep=None, + complex_agg_method='min', verbose=False): # Placeholders self.rnaseq_data = rnaseq_data self.metadata = metadata @@ -202,6 +218,7 @@ def __init__(self, rnaseq_data, ppi_data, metadata=None, interaction_columns=('A self.analysis_setup = dict() self.cutoff_setup = dict() self.complex_sep = complex_sep + self.complex_agg_method = complex_agg_method self.interaction_columns = interaction_columns # Analysis setup @@ -238,6 +255,7 @@ def __init__(self, rnaseq_data, ppi_data, metadata=None, interaction_columns=('A cutoff_setup=self.cutoff_setup, analysis_setup=self.analysis_setup, complex_sep=complex_sep, + complex_agg_method=complex_agg_method, interaction_columns=self.interaction_columns, verbose=verbose) @@ -442,6 +460,13 @@ class SingleCellInteractions: For example, '&' is the complex_sep for a list of ligand-receptor pairs where a protein partner could be "CD74&CD44". + complex_agg_method : str, default='min' + Method to aggregate the expression value of multiple genes in a + complex. + + - 'min' : Minimum expression value among all genes. + - 'mean' : Average expression value among all genes. + verbose : boolean, default=False Whether printing or not steps of the analysis. @@ -472,6 +497,13 @@ class SingleCellInteractions: For example, '&' is the complex_sep for a list of ligand-receptor pairs where a protein partner could be "CD74&CD44". + complex_agg_method : str + Method to aggregate the expression value of multiple genes in a + complex. + + - 'min' : Minimum expression value among all genes. + - 'mean' : Average expression value among all genes. + ref_ppi : pandas.DataFrame Reference list of protein-protein interactions (or ligand-receptor pairs) used for inferring the cell-cell interactions and communication. It could be the @@ -578,7 +610,7 @@ class SingleCellInteractions: def __init__(self, rnaseq_data, ppi_data, metadata, interaction_columns=('A', 'B'), communication_score='expression_thresholding', cci_score='bray_curtis', cci_type='undirected', expression_threshold=0.20, aggregation_method='nn_cell_fraction', barcode_col='barcodes', - celltype_col='cell_types', complex_sep=None, verbose=False): + celltype_col='cell_types', complex_sep=None, complex_agg_method='min', verbose=False): # Placeholders self.rnaseq_data = rnaseq_data self.metadata = metadata @@ -588,6 +620,7 @@ def __init__(self, rnaseq_data, ppi_data, metadata, interaction_columns=('A', 'B self.analysis_setup = dict() self.cutoff_setup = dict() self.complex_sep = complex_sep + self.complex_agg_method = complex_agg_method self.interaction_columns = interaction_columns self.ccc_permutation_pvalues = None self.cci_permutation_pvalues = None @@ -641,6 +674,7 @@ def __init__(self, rnaseq_data, ppi_data, metadata, interaction_columns=('A', 'B cutoff_setup=self.cutoff_setup, analysis_setup=self.analysis_setup, complex_sep=self.complex_sep, + complex_agg_method=self.complex_agg_method, interaction_columns=self.interaction_columns, verbose=verbose) @@ -710,6 +744,7 @@ def permute_cell_labels(self, permutations=100, evaluation='communication', fdr_ cutoff_setup=self.cutoff_setup, analysis_setup=self.analysis_setup, complex_sep=self.complex_sep, + complex_agg_method=self.complex_agg_method, interaction_columns=self.interaction_columns, verbose=False) @@ -757,7 +792,8 @@ def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, spatial_dict, score_type def initialize_interaction_space(rnaseq_data, ppi_data, cutoff_setup, analysis_setup, excluded_cells=None, - complex_sep=None, interaction_columns=('A', 'B'), verbose=True): + complex_sep=None, complex_agg_method='min', interaction_columns=('A', 'B'), + verbose=True): '''Initializes a InteractionSpace object to perform the analyses Parameters @@ -838,6 +874,13 @@ def initialize_interaction_space(rnaseq_data, ppi_data, cutoff_setup, analysis_s For example, '&' is the complex_sep for a list of ligand-receptor pairs where a protein partner could be "CD74&CD44". + complex_agg_method : str, default='min' + Method to aggregate the expression value of multiple genes in a + complex. + + - 'min' : Minimum expression value among all genes. + - 'mean' : Average expression value among all genes. + interaction_columns : tuple, default=('A', 'B') Contains the names of the columns where to find the partners in a dataframe of protein-protein interactions. If the list is for @@ -866,6 +909,7 @@ def initialize_interaction_space(rnaseq_data, ppi_data, cutoff_setup, analysis_s cci_score=analysis_setup['cci_score'], cci_type=analysis_setup['cci_type'], complex_sep=complex_sep, + complex_agg_method=complex_agg_method, interaction_columns=interaction_columns, verbose=verbose) return interaction_space \ No newline at end of file diff --git a/cell2cell/core/interaction_space.py b/cell2cell/core/interaction_space.py index 8ccd047..b4cdf87 100644 --- a/cell2cell/core/interaction_space.py +++ b/cell2cell/core/interaction_space.py @@ -69,7 +69,8 @@ def generate_pairs(cells, cci_type, self_interaction=True, remove_duplicates=Tru def generate_interaction_elements(modified_rnaseq, ppi_data, cci_type='undirected', cci_matrix_template=None, - complex_sep=None, interaction_columns=('A', 'B'), verbose=True): + complex_sep=None, complex_agg_method='min', interaction_columns=('A', 'B'), + verbose=True): '''Create all elements needed to perform the analyses of pairwise cell-cell interactions/communication. Corresponds to the interaction elements used by the class InteractionSpace. @@ -103,6 +104,13 @@ def generate_interaction_elements(modified_rnaseq, ppi_data, cci_type='undirecte For example, '&' is the complex_sep for a list of ligand-receptor pairs where a protein partner could be "CD74&CD44". + complex_agg_method : str, default='min' + Method to aggregate the expression value of multiple genes in a + complex. + + - 'min' : Minimum expression value among all genes. + - 'mean' : Average expression value among all genes. + interaction_columns : tuple, default=('A', 'B') Contains the names of the columns where to find the partners in a dataframe of protein-protein interactions. If the list is for @@ -134,7 +142,10 @@ def generate_interaction_elements(modified_rnaseq, ppi_data, cci_type='undirecte complex_sep=complex_sep, interaction_columns=interaction_columns ) - modified_rnaseq = add_complexes_to_expression(modified_rnaseq, complexes) + modified_rnaseq = add_complexes_to_expression(rnaseq_data=modified_rnaseq, + complexes=complexes, + agg_method=complex_agg_method + ) # Cells cell_instances = list(modified_rnaseq.columns) # @Erick, check if position 0 of columns contain index header. @@ -255,6 +266,13 @@ class InteractionSpace(): For example, '&' is the complex_sep for a list of ligand-receptor pairs where a protein partner could be "CD74&CD44". + complex_agg_method : str, default='min' + Method to aggregate the expression value of multiple genes in a + complex. + + - 'min' : Minimum expression value among all genes. + - 'mean' : Average expression value among all genes. + interaction_columns : tuple, default=('A', 'B') Contains the names of the columns where to find the partners in a dataframe of protein-protein interactions. If the list is for @@ -324,7 +342,7 @@ class InteractionSpace(): def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, communication_score='expression_thresholding', cci_score='bray_curtis', cci_type='undirected', cci_matrix_template=None, complex_sep=None, - interaction_columns=('A', 'B'), verbose=True): + complex_agg_method='min', interaction_columns=('A', 'B'), verbose=True): self.communication_score = communication_score self.cci_score = cci_score @@ -361,6 +379,7 @@ def __init__(self, rnaseq_data, ppi_data, gene_cutoffs, communication_score='exp cci_matrix_template=cci_matrix_template, cci_type=self.cci_type, complex_sep=complex_sep, + complex_agg_method=complex_agg_method, verbose=verbose) self.interaction_elements['ppi_score'] = self.ppi_data['score'].values diff --git a/cell2cell/plotting/__init__.py b/cell2cell/plotting/__init__.py index f465b13..04eb920 100644 --- a/cell2cell/plotting/__init__.py +++ b/cell2cell/plotting/__init__.py @@ -3,7 +3,7 @@ from cell2cell.plotting.aesthetics import get_colors_from_labels, map_colors_to_metadata, generate_legend from cell2cell.plotting.ccc_plot import clustermap_ccc from cell2cell.plotting.cci_plot import clustermap_cci -from cell2cell.plotting.circos_plot import circos_plot +from cell2cell.plotting.circular_plot import circos_plot from cell2cell.plotting.pval_plot import dot_plot from cell2cell.plotting.factor_plot import context_boxplot, loading_clustermap, ccc_networks_plot from cell2cell.plotting.pcoa_plot import pcoa_3dplot diff --git a/cell2cell/plotting/circos_plot.py b/cell2cell/plotting/circular_plot.py similarity index 100% rename from cell2cell/plotting/circos_plot.py rename to cell2cell/plotting/circular_plot.py diff --git a/cell2cell/plotting/factor_plot.py b/cell2cell/plotting/factor_plot.py index f637cd6..24dce65 100644 --- a/cell2cell/plotting/factor_plot.py +++ b/cell2cell/plotting/factor_plot.py @@ -218,7 +218,7 @@ def context_boxplot(context_loadings, metadict, included_factors=None, group_ord def loading_clustermap(loadings, loading_threshold=0., use_zscore=True, metric='euclidean', method='ward', optimal_leaf=True, figsize=(15, 8), heatmap_lw=0.2, cbar_fontsize=12, tick_fontsize=10, cmap=None, - filename=None, **kwargs): + cbar_label=None, filename=None, **kwargs): ''' Plots a clustermap of the tensor-factorization loadings from one tensor dimension or the joint loadings from multiple tensor dimensions. @@ -285,6 +285,10 @@ def loading_clustermap(loadings, loading_threshold=0., use_zscore=True, metric=' Name of the color palette for coloring the heatmap. If None, cmap='Blues' would be used when use_zscore=False; and cmap='vlag' when use_zscore=True. + cbar_label : str, default=None + Label for the color bar. If None, default labels will be 'Z-scores \n across factors' + or 'Loadings', depending on `use_zcore` is True or False, respectively. + filename : str, default=None Path to save the figure of the elbow analysis. If None, the figure is not saved. @@ -305,13 +309,14 @@ def loading_clustermap(loadings, loading_threshold=0., use_zscore=True, metric=' cmap = 'vlag' val = np.ceil(max([abs(df.min().min()), abs(df.max().max())])) vmin, vmax = -1. * val, val - cbar_label = 'Z-scores \n across factors' + if cbar_label is None: + cbar_label = 'Z-scores \n across factors' else: if cmap is None: cmap='Blues' vmin, vmax = 0., df.max().max() - cbar_label = 'Loadings' - + if cbar_label is None: + cbar_label = 'Loadings' # Clustering dm_rows = compute_distance(df, axis=0, metric=metric) row_linkage = compute_linkage(dm_rows, method=method, optimal_ordering=optimal_leaf) diff --git a/cell2cell/preprocessing/rnaseq.py b/cell2cell/preprocessing/rnaseq.py index 069ab3c..63a08f4 100644 --- a/cell2cell/preprocessing/rnaseq.py +++ b/cell2cell/preprocessing/rnaseq.py @@ -141,7 +141,7 @@ def divide_expression_by_mean(rnaseq_data, axis=1): return new_data -def add_complexes_to_expression(rnaseq_data, complexes): +def add_complexes_to_expression(rnaseq_data, complexes, agg_method='min'): ''' Adds multimeric complexes into the gene expression matrix. Their gene expressions are the minimum expression value @@ -157,6 +157,13 @@ def add_complexes_to_expression(rnaseq_data, complexes): Dictionary where keys are the complex names in the list of PPIs, while values are list of subunits for the respective complex names. + agg_method : str, default='min' + Method to aggregate the expression value of multiple genes in a + complex. + + - 'min' : Minimum expression value among all genes. + - 'mean' : Average expression value among all genes. + Returns ------- tmp_rna : pandas.DataFrame @@ -169,7 +176,14 @@ def add_complexes_to_expression(rnaseq_data, complexes): for k, v in complexes.items(): if all(g in tmp_rna.index for g in v): df = tmp_rna.loc[v, :] - tmp_rna.loc[k] = df.min().values.tolist() + if agg_method == 'min': + tmp_rna.loc[k] = df.min().values.tolist() + elif agg_method == 'mean': + tmp_rna.loc[k] = df.mean().values.tolist() + # elif agg_method == 'gmean': + # tmp_rna.loc[k] = df.gmean().values.tolist() # Not implemented + else: + ValueError("{} is not a valid agg_method".format(agg_method)) else: tmp_rna.loc[k] = [0] * tmp_rna.shape[1] return tmp_rna diff --git a/cell2cell/tensor/__init__.py b/cell2cell/tensor/__init__.py index ee70607..df91324 100644 --- a/cell2cell/tensor/__init__.py +++ b/cell2cell/tensor/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from cell2cell.tensor.external_scores import (dataframes_to_tensor) from cell2cell.tensor.factor_manipulation import (normalize_factors) from cell2cell.tensor.metrics import (correlation_index) from cell2cell.tensor.tensor import (InteractionTensor, PreBuiltTensor, build_context_ccc_tensor, generate_tensor_metadata, diff --git a/cell2cell/tensor/external_scores.py b/cell2cell/tensor/external_scores.py new file mode 100644 index 0000000..c40e7b5 --- /dev/null +++ b/cell2cell/tensor/external_scores.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pandas as pd + +from collections import defaultdict +from cell2cell.tensor.tensor import PreBuiltTensor + + +def dataframes_to_tensor(context_df_dict, sender_col, receiver_col, ligand_col, receptor_col, score_col, how='inner', + lr_fill=np.nan, cell_fill=np.nan, lr_sep='^', context_order=None, order_labels=None, + sort_elements=True, device=None): + '''Generates an InteractionTensor from a dictionary + containing dataframes for all contexts. + + Parameters + ---------- + context_df_dict : dict + Dictionary containing a dataframe for each context. The dataframe + must contain columns containing sender cells, receiver cells, + ligands, receptors, and communication scores, separately. + Keys are context names and values are dataframes. + + sender_col : str + Name of the column containing the sender cells in all context + dataframes. + + receiver_col : str + Name of the column containing the receiver cells in all context + dataframes. + + ligand_col : str + Name of the column containing the ligands in all context + dataframes. + + receptor_col : str + Name of the column containing the receptors in all context + dataframes. + + score_col : str + Name of the column containing the communication scores in all context + dataframes. + + how : str, default='inner' + Approach to consider cell types and genes present across multiple contexts. + + - 'inner' : Considers only cell types and LR pairs that are present in all + contexts (intersection). + - 'outer' : Considers all cell types and LR pairs that are present + across contexts (union). + - 'outer_lrs' : Considers only cell types that are present in all + contexts (intersection), while all LR pairs that are + present across contexts (union). + - 'outer_cells' : Considers only LR pairs that are present in all + contexts (intersection), while all cell types that are + present across contexts (union). + + lr_fill : float, default=numpy.nan + Value to fill communication scores when a ligand-receptor pair is not + present across all contexts. + + cell_fill : float, default=numpy.nan + Value to fill communication scores when a cell is not + present across all ligand-receptor pairs or all contexts. + + lr_sep : str, default='^' + Separation character to join ligands and receptors into a LR pair name. + + context_order : list, default=None + List used to sort the contexts when building the tensor. Elements must + be all elements in context_df_dict.keys(). + + order_labels : list, default=None + List containing the labels for each order or dimension of the tensor. For + example: ['Contexts', 'Ligand-Receptor Pairs', 'Sender Cells', 'Receiver Cells'] + + sort_elements : boolean, default=True + Whether alphabetically sorting elements in the InteractionTensor. + The Context Dimension is not sorted if a 'context_order' list is provided. + + device : str, default=None + Device to use when backend is pytorch. Options are: + {'cpu', 'cuda:0', None} + + Returns + ------- + interaction_tensor : cell2cell.tensor.PreBuiltTensor + A communication tensor generated for the Tensor-cell2cell pipeline. + ''' + # Make sure that all contexts contain needed info + if context_order is None: + sort_context = sort_elements + context_order = list(context_df_dict.keys()) + else: + assert all([c in context_df_dict.keys() for c in context_order]), "The list 'context_order' must contain all context names contained in the keys of 'context_dict'" + assert len(context_order) == len(context_df_dict.keys()), "Each context name must be contained only once in the list 'context_order'" + sort_context = False + cols = [sender_col, receiver_col, ligand_col, receptor_col, score_col] + assert all([c in df.columns for c in cols for df in context_df_dict.values()]), "All input columns must be contained in all dataframes included in 'context_dict'" + + # Copy context dict to make modifications + cont_dict = {k : v.copy()[cols] for k, v in context_df_dict.items()} + + # Labels for each dimension + if order_labels is None: + order_labels = ['Contexts', 'Ligand-Receptor Pairs', 'Sender Cells', 'Receiver Cells'] + + # Find all existing LR pairs, sender and receiver cells across contexts + lr_dict = defaultdict(set) + sender_dict = defaultdict(set) + receiver_dict = defaultdict(set) + + for k, df in cont_dict.items(): + df['LRs'] = df.apply(lambda row: row[ligand_col] + lr_sep + row[receptor_col], axis=1) + # This is to consider only LR pairs in each context that are present in all cell pairs. Disabled for now. + # if how in ['inner', 'outer_cells']: + # ccc_df = df.pivot(index='LRs', columns=[sender_col, receiver_col], values=score_col) + # ccc_df = ccc_df.dropna(how='any') + # lr_dict[k].update(list(ccc_df.index)) + # else: + lr_dict[k].update(df['LRs'].unique().tolist()) + sender_dict[k].update(df[sender_col].unique().tolist()) + receiver_dict[k].update(df[receiver_col].unique().tolist()) + + # Subset LR pairs, sender and receiver cells given parameter 'how' + for i, k in enumerate(context_order): + if i == 0: + inter_lrs = set(lr_dict[k]) + inter_senders = set(sender_dict[k]) + inter_receivers = set(receiver_dict[k]) + + union_lrs = set(lr_dict[k]) + union_senders = set(sender_dict[k]) + union_receivers = set(receiver_dict[k]) + + else: + inter_lrs = inter_lrs.intersection(set(lr_dict[k])) + inter_senders = inter_senders.intersection(set(sender_dict[k])) + inter_receivers = inter_receivers.intersection(set(receiver_dict[k])) + + union_lrs = union_lrs.union(set(lr_dict[k])) + union_senders = union_senders.union(set(sender_dict[k])) + union_receivers = union_receivers.union(set(receiver_dict[k])) + + if how == 'inner': + lr_pairs = list(inter_lrs) + sender_cells = list(inter_senders) + receiver_cells = list(inter_receivers) + elif how == 'outer': + lr_pairs = list(union_lrs) + sender_cells = list(union_senders) + receiver_cells = list(union_receivers) + elif how == 'outer_lrs': + lr_pairs = list(union_lrs) + sender_cells = list(inter_senders) + receiver_cells = list(inter_receivers) + elif how == 'outer_cells': + lr_pairs = list(inter_lrs) + sender_cells = list(union_senders) + receiver_cells = list(union_receivers) + else: + raise ValueError("Not a valid input for parameter 'how'") + + if sort_elements: + if sort_context: + context_order = sorted(context_order) + lr_pairs = sorted(lr_pairs) + sender_cells = sorted(sender_cells) + receiver_cells = sorted(receiver_cells) + + # Build temporal tensor to pass to PreBuiltTensor + tmp_tensor = [] + for k in context_order: + v = cont_dict[k] + # 3D tensor for the context + tmp_3d_tensor = [] + for lr in lr_pairs: + df = v.loc[v['LRs'] == lr] + if df.shape[0] == 0: # TODO: Check behavior when df is empty + df = pd.DataFrame(lr_fill, index=sender_cells, columns=receiver_cells) + else: + df = df.pivot(index=sender_col, columns=receiver_col, values=score_col) + df = df.reindex(sender_cells, fill_value=cell_fill).reindex(receiver_cells, fill_value=cell_fill, axis='columns') + + tmp_3d_tensor.append(df.values) + tmp_tensor.append(tmp_3d_tensor) + + # Create InteractionTensor using PreBuiltTensor + tensor = np.asarray(tmp_tensor) + if how != 'inner': + mask = (~np.isnan(tensor)).astype(int) + loc_nans = np.ones(tensor.shape, dtype=int) - mask + else: + mask = None + loc_nans = np.zeros(tensor.shape, dtype=int) + + interaction_tensor = PreBuiltTensor(tensor=tensor, + order_names=[context_order, lr_pairs, sender_cells, receiver_cells], + order_labels=order_labels, + mask=mask, + loc_nans=loc_nans, + device=device) + return interaction_tensor \ No newline at end of file diff --git a/cell2cell/tensor/tensor.py b/cell2cell/tensor/tensor.py index fa80b96..b010d54 100644 --- a/cell2cell/tensor/tensor.py +++ b/cell2cell/tensor/tensor.py @@ -41,8 +41,14 @@ class BaseTensor(): - 'inner' : Considers only cell types and genes that are present in all contexts (intersection). - - 'outer' : Considers all cell types and genes present across contexts - (union). + - 'outer' : Considers all cell types and genes that are present + across contexts (union). + - 'outer_genes' : Considers only cell types that are present in all + contexts (intersection), while all genes that are + present across contexts (union). + - 'outer_cells' : Considers only genes that are present in all + contexts (intersection), while all cell types that are + present across contexts (union). tensor : tensorly.tensor Tensor object created with the library tensorly. @@ -90,9 +96,19 @@ class BaseTensor(): explained_variance : float Explained variance score for a tnesor factorization. - explained_variance_ratio_ : ndarray + explained_variance_ratio_ : ndarray list Percentage of variance explained by each of the factors. Only present when "normalize_loadings" is True. Otherwise, it is None. + + loc_nans : ndarray list + An array of shape equal to `tensor` with ones where NaN values were assigned + when building the tensor. Other values are zeros. It stores the + location of the NaN values. + + loc_zeros : ndarray list + An array of shape equal to `tensor` with ones where zeros that are not in + `loc_nans` are located. Other values are assigned a zero. It tracks the + real zero values rather than NaN values that were converted to zero. ''' def __init__(self): # Save variables for this class @@ -110,6 +126,8 @@ def __init__(self): self.mask = None self.explained_variance_ = None self.explained_variance_ratio_ = None + self.loc_nans = None + self.loc_zeros = None def compute_tensor_factorization(self, rank, tf_type='non_negative_cp', init='svd', random_state=None, verbose=False, runs=1, normalize_loadings=True, var_ordered_factors=True, **kwargs): @@ -231,7 +249,7 @@ def compute_tensor_factorization(self, rank, tf_type='non_negative_cp', init='sv self.rank = rank def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp', init='random', random_state=None, - automatic_elbow=True, mask=None, ci='std', figsize=(4, 2.25), fontsize=14, filename=None, + automatic_elbow=True, manual_elbow=None, mask=None, ci='std', figsize=(4, 2.25), fontsize=14, filename=None, verbose=False, **kwargs): '''Elbow analysis on the error achieved by the Tensor Factorization for selecting the number of factors to use. A plot is made with the results. @@ -260,6 +278,11 @@ def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp' Whether using an automatic strategy to find the elbow. If True, the method implemented by the package kneed is used. + manual_elbow : int, default=None + Rank or number of factors to highlight in the curve of error achieved by + the Tensor Factorization. This input is considered only when + `automatic_elbow=True` + mask : ndarray list, default=None Helps avoiding missing values during a tensor factorization. A mask should be a boolean array of the same shape as the original tensor and should be 0 @@ -314,9 +337,9 @@ def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp' **kwargs ) if automatic_elbow: - rank = _compute_elbow(loss) + rank = int(_compute_elbow(loss)) else: - rank = None + rank = manual_elbow fig = plot_elbow(loss=loss, elbow=rank, figsize=figsize, @@ -339,9 +362,9 @@ def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp' loss = [(i + 1, l) for i, l in enumerate(loss)] if automatic_elbow: - rank = _compute_elbow(loss) + rank = int(_compute_elbow(loss)) else: - rank = None + rank = manual_elbow fig = plot_multiple_run_elbow(all_loss=all_loss, ci=ci, @@ -353,8 +376,9 @@ def elbow_rank_selection(self, upper_rank=50, runs=20, tf_type='non_negative_cp' else: assert runs > 0, "Input runs must be an integer greater than 0" - if automatic_elbow: - self.rank = rank + self.rank = rank + if self.rank is not None: + assert(isinstance(rank, int)), 'rank must be an integer.' print('The rank at the elbow is: {}'.format(self.rank)) return fig, loss @@ -474,8 +498,14 @@ class InteractionTensor(BaseTensor): - 'inner' : Considers only cell types and genes that are present in all contexts (intersection). - - 'outer' : Considers all cell types and genes present across contexts - (union). + - 'outer' : Considers all cell types and genes that are present + across contexts (union). + - 'outer_genes' : Considers only cell types that are present in all + contexts (intersection), while all genes that are + present across contexts (union). + - 'outer_cells' : Considers only genes that are present in all + contexts (intersection), while all cell types that are + present across contexts (union). communication_score : str, default='expression_mean' Type of communication score to infer the potential use of a given ligand- @@ -497,6 +527,13 @@ class InteractionTensor(BaseTensor): For example, '&' is the complex_sep for a list of ligand-receptor pairs where a protein partner could be "CD74&CD44". + complex_agg_method : str, default='min' + Method to aggregate the expression value of multiple genes in a + complex. + + - 'min' : Minimum expression value among all genes. + - 'mean' : Average expression value among all genes. + upper_letter_comparison : boolean, default=True Whether making uppercase the gene names in the expression matrices and the protein names in the ppi_data to match their names and integrate their @@ -530,9 +567,9 @@ class InteractionTensor(BaseTensor): Whether printing or not steps of the analysis. ''' def __init__(self, rnaseq_matrices, ppi_data, order_labels=None, context_names=None, how='inner', - communication_score='expression_mean', complex_sep=None, upper_letter_comparison=True, - interaction_columns=('A', 'B'), group_ppi_by=None, group_ppi_method='gmean', device=None, - verbose=True): + communication_score='expression_mean', complex_sep=None, complex_agg_method='min', + upper_letter_comparison=True, interaction_columns=('A', 'B'), group_ppi_by=None, + group_ppi_method='gmean', device=None, verbose=True): # Asserts if group_ppi_by is not None: assert group_ppi_by in ppi_data.columns, "Using {} for grouping PPIs is not possible. Not present among columns in ppi_data".format(group_ppi_by) @@ -549,7 +586,7 @@ def __init__(self, rnaseq_matrices, ppi_data, order_labels=None, context_names=N complex_sep=complex_sep, interaction_columns=interaction_columns ) - mod_rnaseq_matrices = [add_complexes_to_expression(rnaseq, complexes) for rnaseq in rnaseq_matrices] + mod_rnaseq_matrices = [add_complexes_to_expression(rnaseq, complexes, agg_method=complex_agg_method) for rnaseq in rnaseq_matrices] else: mod_rnaseq_matrices = [df.copy() for df in rnaseq_matrices] @@ -573,6 +610,14 @@ def __init__(self, rnaseq_matrices, ppi_data, order_labels=None, context_names=N group_ppi_method=group_ppi_method, verbose=verbose) + # Distinguish NaNs from real zeros + if mask is None: + self.loc_nans = np.zeros(tensor.shape, dtype=int) + else: + self.loc_nans = np.ones(tensor.shape, dtype=int) - np.array(mask) + self.loc_zeros = (tensor == 0).astype(int) - self.loc_nans + self.loc_zeros = (self.loc_zeros > 0).astype(int) + # Generate names for the elements in each dimension (order) in the tensor if context_names is None: context_names = ['C-' + str(i) for i in range(1, len(mod_rnaseq_matrices)+1)] @@ -617,37 +662,57 @@ class PreBuiltTensor(BaseTensor): order_labels : list, default=None List containing the labels for each order or dimension of the tensor. For - example: ['Contexts', 'Ligand-Receptor Pairs', 'Sender Cells', 'Receiver Cells']' + example: ['Contexts', 'Ligand-Receptor Pairs', 'Sender Cells', 'Receiver Cells'] mask : ndarray list, default=None Helps avoiding missing values during a tensor factorization. A mask should be a boolean array of the same shape as the original tensor and should be 0 where the values are missing and 1 everywhere else. + loc_nans : ndarray list, default=None + An array of shape equal to `tensor` with ones where NaN values were assigned + when building the tensor. Other values are zeros. It stores the + location of the NaN values. + device : str, default=None Device to use when backend is pytorch. Options are: {'cpu', 'cuda:0', None} ''' - def __init__(self, tensor, order_names, order_labels=None, mask=None, device=None): + def __init__(self, tensor, order_names, order_labels=None, mask=None, loc_nans=None, loc_zeros=None, device=None): # Init BaseTensor BaseTensor.__init__(self) + # Location of NaNs and zeros + tmp_nans = (np.isnan(tensor)).astype(int) # Find extra NaNs that were not considered + if loc_nans is None: + self.loc_nans = np.zeros(tuple(tensor.shape), dtype=int) + else: + assert loc_nans.shape == tensor.shape, "`loc_nans` and `tensor` must be of the same shape" + self.loc_nans = np.array(loc_nans.copy()) + self.loc_nans = self.loc_nans + tmp_nans + self.loc_nans = (self.loc_nans > 0).astype(int) + + self.loc_zeros = (np.array(tensor) == 0.).astype(int) - self.loc_nans + self.loc_zeros = (self.loc_zeros > 0).astype(int) + + # Store tensor + tensor_ = np.nan_to_num(tensor) if device is None: - self.tensor = tl.tensor(tensor) + self.tensor = tl.tensor(tensor_) self.mask = mask else: if tl.get_backend() == 'pytorch': - self.tensor = tl.tensor(tensor, device=device) + self.tensor = tl.tensor(tensor_, device=device) if mask is not None: self.mask = tl.tensor(mask, device=device) else: self.mask = mask else: - self.tensor = tl.tensor(tensor) + self.tensor = tl.tensor(tensor_) self.mask = mask self.order_names = order_names if order_labels is None: - self.order_labels = ['Dimension-{}'.format(i+1) for i in range(self.tensor.shape)] + self.order_labels = ['Dimension-{}'.format(i + 1) for i in range(self.tensor.shape)] else: self.order_labels = order_labels assert len(self.tensor.shape) == len(self.order_labels), "The length of order_labels must match the number of orders/dimensions in the tensor" @@ -677,8 +742,14 @@ def build_context_ccc_tensor(rnaseq_matrices, ppi_data, how='inner', communicati - 'inner' : Considers only cell types and genes that are present in all contexts (intersection). - - 'outer' : Considers all cell types and genes present across contexts - (union). + - 'outer' : Considers all cell types and genes that are present + across contexts (union). + - 'outer_genes' : Considers only cell types that are present in all + contexts (intersection), while all genes that are + present across contexts (union). + - 'outer_cells' : Considers only genes that are present in all + contexts (intersection), while all cell types that are + present across contexts (union). communication_score : str, default='expression_mean' Type of communication score to infer the potential use of a given ligand- @@ -752,14 +823,25 @@ def build_context_ccc_tensor(rnaseq_matrices, ppi_data, how='inner', communicati ''' df_idxs = [list(rnaseq.index) for rnaseq in rnaseq_matrices] df_cols = [list(rnaseq.columns) for rnaseq in rnaseq_matrices] + inter_genes = set.intersection(*map(set, df_idxs)) + inter_cells = set.intersection(*map(set, df_cols)) + union_genes = set.union(*map(set, df_idxs)) + union_cells = set.union(*map(set, df_cols)) + if how == 'inner': - genes = set.intersection(*map(set, df_idxs)) - cells = set.intersection(*map(set, df_cols)) + genes = inter_genes + cells = inter_cells elif how == 'outer': - genes = set.union(*map(set, df_idxs)) - cells = set.union(*map(set, df_cols)) + genes = union_genes + cells = union_cells + elif how == 'outer_genes': + genes = union_genes + cells = inter_cells + elif how == 'outer_cells': + genes = inter_genes + cells = union_cells else: - raise ValueError('Provide a valid way to build the tensor; "how" must be "inner" or "outer" ') + raise ValueError('Provide a valid way to build the tensor; "how" must be "inner", "outer", "outer_genes" or "outer_cells"') # Preserve order or sort new set (either inner or outer) if set(df_idxs[0]) == genes: @@ -782,7 +864,6 @@ def build_context_ccc_tensor(rnaseq_matrices, ppi_data, how='inner', communicati if verbose: print('Building tensor for the provided context') - #tensors = [generate_ccc_tensor(rnaseq_data=rnaseq.reindex(genes, fill_value=0.).reindex(cells, axis='columns', fill_value=0.), tensors = [generate_ccc_tensor(rnaseq_data=rnaseq.reindex(genes).reindex(cells, axis='columns'), ppi_data=ppi_data_, communication_score=communication_score, @@ -798,15 +879,7 @@ def build_context_ccc_tensor(rnaseq_matrices, ppi_data, how='inner', communicati ppi_names = [row[interaction_columns[0]] + '^' + row[interaction_columns[1]] for idx, row in ppi_data_.iterrows()] # Generate mask: - if how == 'outer': - #mask_tensor = [] - #for rnaseq in rnaseq_matrices: - # rna_cells = list(rnaseq.columns) - # ppi_mask = pd.DataFrame(np.ones((len(rna_cells), len(rna_cells))), columns=rna_cells, index=rna_cells) - # ppi_mask = ppi_mask.reindex(cells, fill_value=0.).reindex(cells, axis='columns', fill_value=0.) # Here we mask those cells that are not in the rnaseq data - # rna_tensor = [ppi_mask.values for i in range(len(ppi_names))] # Replicate the mask across all PPIs in ppi_data_ - # mask_tensor.append(rna_tensor) - #mask_tensor = np.asarray(mask_tensor) + if how != 'inner': mask_tensor = (~np.isnan(np.asarray(tensors))).astype(int) else: mask_tensor = None @@ -1008,8 +1081,14 @@ def interactions_to_tensor(interactions, experiment='single_cell', context_names - 'inner' : Considers only cell types and genes that are present in all contexts (intersection). - - 'outer' : Considers all cell types and genes present across contexts - (union). + - 'outer' : Considers all cell types and genes that are present + across contexts (union). + - 'outer_genes' : Considers only cell types that are present in all + contexts (intersection), while all genes that are + present across contexts (union). + - 'outer_cells' : Considers only genes that are present in all + contexts (intersection), while all cell types that are + present across contexts (union). communication_score : str, default='expression_mean' Type of communication score to infer the potential use of a given ligand- diff --git a/release/0.5.11-notes.md b/release/0.5.11-notes.md new file mode 100644 index 0000000..f277b61 --- /dev/null +++ b/release/0.5.11-notes.md @@ -0,0 +1,31 @@ +# Release Notes - cell2cell v0.5.11 + +## New features +- Created a new function to use external communication scores, generated with other tools. This function can be found in +```cell2cell.tensor.external_scores.dataframes_to_tensor()```. +- Added ```cell2cell.tensor.tensor.BaseTensor.loc_nans```, ```cell2cell.tensor.tensor.BaseTensor.loc_zeros```, and same attributes in +heirs tensor classes to keep track of values assigned with NaNs and with real zeros, respectively. +- ```cell2cell.tensor.external_scores.dataframes_to_tensor()``` also incorporates the previous point to keep track +of NaNs and real zeros when using external communication scores. +- Added ```lr_fill``` and ```cell_fill``` parameters to ```cell2cell.tensor.external_scores.dataframes_to_tensor()``` + +## Feature updates +- Added two new options to the parameter ```how``` in ```cell2cell.tensor.build_context_ccc_tensor()```. +They are: ```how='outer_genes'``` and ```how='outer_cells'``` . These new options were also extended to all InteractionTensors +derived from ```cell2cell.tensor.tensor.BaseTensor```. +- These options of how were also extended to the new function ```cell2cell.tensor.external_scores.dataframes_to_tensor()```, +but here implemented as ```how='outer_lrs'``` and ```how='outer_cells'```. +- Implemented multiple to options to aggregate gene expression of protein complexes. Available options are using the +minimum expression or the average expression among the subunits. This can be controlled with the parameter +```complex_agg_method='min'``` or ```complex_agg_method='mean'``` when creating a ```cell2cell.tensor.InteractionTensor```, +```cell2cell.core.InteractionSpace```, ```cell2cell.analysis.BulkInteractions``` pipeline, or ```cell2cell.analysis.SingleCellInteractions``` pipeline. +- The previous point relies on the function ```cell2cell.preprocessing.rnaseq.add_complexes_to_expression()``` through +the parameter ```agg_method='min'``` or ```agg_method='mean'``` +- Added parameter ```cbar_label``` to the function ```cell2cell.plotting.factor_plot.loading_clustermap()``` +to personalize the title of the color bar. +- Added parameter ```manual_elbow``` to ```cell2cell.tensor.tensor.BaseTensor.elbow_rank_selection()``` to manually specify +the elbow to highlight. + +## Fixed Bugs +- Renamed ```cell2cell.plotting.circos_plot``` into ```cell2cell.plotting.circular_plot``` to avoid incompatibility with +function ```cell2cell.plotting.circos_plot.circos_plot()``` that is directly imported under ```cell2cell.plotting``` \ No newline at end of file