Merge pull request #17 from earmingol/new_version

Updates for v0.6.2

earmingol authored Nov 4, 2022
2 parents 97f498b + 2b3ec53 commit 3254faa
Showing 9 changed files with 263 additions and 37 deletions.
2 changes: 1 addition & 1 deletion cell2cell/__init__.py
@@ -14,4 +14,4 @@
from cell2cell import tensor
from cell2cell import utils

__version__ = "0.6.1"
__version__ = "0.6.2"
26 changes: 20 additions & 6 deletions cell2cell/plotting/tensor_plot.py
@@ -7,6 +7,7 @@
from matplotlib import pyplot as plt

from cell2cell.plotting.aesthetics import generate_legend, get_colors_from_labels, map_colors_to_metadata
+from cell2cell.preprocessing.signal import smooth_curve


def tensor_factors_plot(interaction_tensor, order_labels=None, reorder_elements=None, metadata=None,
@@ -341,7 +342,7 @@ def reorder_dimension_elements(factors, reorder_elements, metadata=None):
return reordered_factors, new_metadata


-def plot_elbow(loss, elbow=None, figsize=(4, 2.25), fontsize=14, filename=None):
+def plot_elbow(loss, elbow=None, figsize=(4, 2.25), ylabel='Normalized Error', fontsize=14, filename=None):
'''Plots the errors of an elbow analysis with just one run of a tensor factorization
for each rank.
@@ -356,7 +357,10 @@ def plot_elbow(loss, elbow=None, figsize=(4, 2.25), fontsize=14, filename=None):
elbow.
figsize : tuple, default=(4, 2.25)
-        Figure size, width by height
+        Figure size, width by height
+
+    ylabel : str, default='Normalized Error'
+        Label for the y-axis
fontsize : int, default=14
Fontsize for axis labels.
@@ -376,7 +380,7 @@ def plot_elbow(loss, elbow=None, figsize=(4, 2.25), fontsize=14, filename=None):
plt.plot(*zip(*loss))
plt.tick_params(axis='both', labelsize=fontsize)
plt.xlabel('Rank', fontsize=int(1.2*fontsize))
-    plt.ylabel('Error', fontsize=int(1.2 * fontsize))
+    plt.ylabel(ylabel, fontsize=int(1.2 * fontsize))

if elbow is not None:
_ = plt.plot(*loss[elbow - 1], 'ro')
@@ -387,7 +391,8 @@ def plot_elbow(loss, elbow=None, figsize=(4, 2.25), fontsize=14, filename=None):
return fig


-def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), fontsize=14, filename=None):
+def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), ylabel='Normalized Error', fontsize=14,
+                            smooth=False, filename=None):
'''Plots the errors of an elbow analysis with multiple runs of a tensor
factorization for each rank.
@@ -406,11 +411,17 @@ def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), fontsize=14, filename=None):
{'std', '95%'}
figsize : tuple, default=(4, 2.25)
-        Figure size, width by height
+        Figure size, width by height
+
+    ylabel : str, default='Normalized Error'
+        Label for the y-axis

    fontsize : int, default=14
        Fontsize for axis labels.

+    smooth : boolean, default=False
+        Whether to smooth the curve with a Savitzky-Golay filter.
filename : str, default=None
Path to save the figure of the elbow analysis. If None, the figure is not
saved.
@@ -426,6 +437,9 @@ def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), fontsize=14, filename=None):
mean = np.nanmean(all_loss, axis=0)
std = np.nanstd(all_loss, axis=0)

+    if smooth:
+        mean = smooth_curve(mean)

# Plot Mean
plt.plot(x, mean, 'ob')

Expand All @@ -443,7 +457,7 @@ def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), f

plt.tick_params(axis='both', labelsize=fontsize)
plt.xlabel('Rank', fontsize=int(1.2*fontsize))
-    plt.ylabel('Error', fontsize=int(1.2 * fontsize))
+    plt.ylabel(ylabel, fontsize=int(1.2 * fontsize))

if elbow is not None:
_ = plt.plot(x[elbow - 1], mean[elbow - 1], 'ro')
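As a quick orientation for these plotting changes, here is a minimal usage sketch of the updated functions. The loss values are synthetic, invented purely for illustration; only the function signatures come from this commit:

import numpy as np
from cell2cell.plotting.tensor_plot import plot_elbow, plot_multiple_run_elbow

# Synthetic elbow curves: 10 runs x 15 ranks of noisy, decreasing errors
rng = np.random.default_rng(0)
ranks = np.arange(1, 16)
all_loss = 1.0 / ranks + rng.normal(0.0, 0.01, size=(10, ranks.size))

# Single-run variant: a list of (rank, error) pairs, with the new ylabel argument
fig1 = plot_elbow(list(zip(ranks, all_loss[0])), elbow=4, ylabel='Normalized Error')

# Multiple-run variant: the new smooth flag applies smooth_curve() to the mean curve
fig2 = plot_multiple_run_elbow(all_loss, elbow=4, ci='std',
                               ylabel='Normalized Error', smooth=True)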
4 changes: 3 additions & 1 deletion cell2cell/preprocessing/__init__.py
@@ -17,4 +17,6 @@
get_genes_from_complexes, preprocess_ppi_data)
from cell2cell.preprocessing.rnaseq import (divide_expression_by_max, divide_expression_by_mean, drop_empty_genes,
log10_transformation, scale_expression_by_sum, add_complexes_to_expression,
-                                            aggregate_single_cells)
+                                            aggregate_single_cells)
+
+from cell2cell.preprocessing.signal import (smooth_curve)
35 changes: 35 additions & 0 deletions cell2cell/preprocessing/signal.py
@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-

from scipy.signal import savgol_filter


def smooth_curve(values, window_length=None, polyorder=3, **kwargs):
    '''Apply a Savitzky-Golay filter to an array to smooth the curve.

    Parameters
    ----------
    values : array-like
        An array or list of values.

    window_length : int, default=None
        Size of the window of values to use to smooth the curve.

    polyorder : int, default=3
        The order of the polynomial used to fit the samples.

    **kwargs : dict
        Extra arguments for the scipy.signal.savgol_filter function.

    Returns
    -------
    smooth_values : array-like
        An array or list of values representing the smoothed curve.
    '''
    size = len(values)
    if window_length is None:
        window_length = int(size / min([2, size]))
    if window_length % 2 == 0:
        window_length += 1
    assert(polyorder < window_length), "polyorder must be less than window_length."
    smooth_values = savgol_filter(values, window_length, polyorder, **kwargs)
    return smooth_values
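A short, self-contained sketch of how smooth_curve behaves (the noisy curve below is invented for illustration):

import numpy as np
from cell2cell.preprocessing.signal import smooth_curve

# A noisy, decreasing curve, similar to mean errors from an elbow analysis
rng = np.random.default_rng(42)
y = np.exp(-0.3 * np.arange(25)) + rng.normal(0.0, 0.02, size=25)

# With window_length=None the window spans about half of the points and is
# forced to be odd, as scipy.signal.savgol_filter requires
y_smooth = smooth_curve(y, polyorder=3)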
2 changes: 1 addition & 1 deletion cell2cell/tensor/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from cell2cell.tensor.external_scores import (dataframes_to_tensor)
from cell2cell.tensor.factor_manipulation import (normalize_factors)
-from cell2cell.tensor.metrics import (correlation_index)
+from cell2cell.tensor.metrics import (correlation_index, pairwise_correlation_index)
from cell2cell.tensor.tensor import (InteractionTensor, PreBuiltTensor, build_context_ccc_tensor, generate_tensor_metadata,
interactions_to_tensor)
from cell2cell.tensor.tensor_manipulation import (concatenate_interaction_tensors)
68 changes: 58 additions & 10 deletions cell2cell/tensor/factorization.py
@@ -1,12 +1,15 @@
# -*- coding: utf-8 -*-

import numpy as np
+import scipy.spatial as sp
from tqdm.auto import tqdm
from kneed import KneeLocator

from tensorly.decomposition import non_negative_parafac, non_negative_parafac_hals, parafac, constrained_parafac
import tensorly as tl

+from cell2cell.tensor.metrics import pairwise_correlation_index


def _compute_tensor_factorization(tensor, rank, tf_type='non_negative_cp', init='svd', svd='numpy_svd', random_state=None,
mask=None, n_iter_max=100, tol=10e-7, verbose=False, **kwargs):
@@ -140,20 +143,42 @@ def _compute_tensor_factorization(tensor, rank, tf_type='non_negative_cp', init='svd', svd='numpy_svd', random_state=None,
return cp_tf


-def _compute_elbow(loss):
+def _compute_elbow(loss, S=1.0, curve='convex', direction='decreasing', interp_method='interp1d'):
'''Computes the elbow of a curve
Parameters
----------
loss : list
List of lists or tuples with (x, y) coordinates for a curve.
+    S : float, default=1.0
+        The sensitivity parameter allows us to adjust how aggressive
+        we want Kneedle to be when detecting knees. Smaller values
+        for S detect knees quicker, while larger values are more conservative.
+        Put simply, S is a measure of how many “flat” points we expect to
+        see in the unmodified data curve before declaring a knee.
+
+    curve : str, default='convex'
+        If curve='concave', kneed will detect knees. If curve='convex',
+        it will detect elbows.
+
+    direction : str, default='decreasing'
+        The direction parameter describes the line from left to right on the
+        x-axis. If the knee/elbow you are trying to identify is on a positive
+        slope use direction='increasing', if the knee/elbow you are trying to
+        identify is on a negative slope, use direction='decreasing'.
+
+    interp_method : str, default='interp1d'
+        This parameter controls the interpolation method for fitting a
+        spline to the input x and y data points.
+        Valid arguments are 'interp1d' and 'polynomial'.
Returns
-------
rank : int
X coordinate where the elbow is located in the curve.
'''
-    kneedle = KneeLocator(*zip(*loss), S=1.0, curve="convex", direction="decreasing")
+    kneedle = KneeLocator(*zip(*loss), S=S, curve=curve, direction=direction, interp_method=interp_method)
rank = kneedle.elbow
return rank
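For reference, _compute_elbow is a thin wrapper around kneed's KneeLocator; a self-contained sketch with invented (rank, error) coordinates:

from kneed import KneeLocator

# Illustrative (rank, error) pairs with a visible bend in the curve
loss = [(1, 0.50), (2, 0.30), (3, 0.20), (4, 0.17), (5, 0.15), (6, 0.14)]

x, y = zip(*loss)
kneedle = KneeLocator(x, y, S=1.0, curve='convex', direction='decreasing',
                      interp_method='interp1d')
print(kneedle.elbow)  # x coordinate at which the elbow is detected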

Expand Down Expand Up @@ -245,7 +270,8 @@ def _run_elbow_analysis(tensor, upper_rank=50, tf_type='non_negative_cp', init='


def _multiple_runs_elbow_analysis(tensor, upper_rank=50, runs=10, tf_type='non_negative_cp', init='svd', svd='numpy_svd',
-                                  random_state=None, mask=None, n_iter_max=100, tol=10e-7, verbose=False, **kwargs):
+                                  metric='error', random_state=None, mask=None, n_iter_max=100, tol=10e-7,
+                                  verbose=False, **kwargs):
'''Performs an elbow analysis with multiple runs of a tensor factorization for each
rank
@@ -270,6 +296,15 @@ def _multiple_runs_elbow_analysis(tensor, upper_rank=50, runs=10, tf_type='non_negative_cp', init='svd', svd='numpy_svd',
Initialization method for computing the Tensor Factorization.
{‘svd’, ‘random’}
svd : str, default='numpy_svd'
Function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS
+    metric : str, default='error'
+        Metric to perform the elbow analysis (y-axis)
+        - 'error' : Normalized error to compute the elbow.
+        - 'similarity' : Similarity based on CorrIndex (1 - CorrIndex).
random_state : int, default=None
Seed for randomization.
@@ -315,11 +350,22 @@ def _multiple_runs_elbow_analysis(tensor, upper_rank=50, runs=10, tf_type='non_negative_cp', init='svd', svd='numpy_svd',
tol=tol,
verbose=verbose,
**kwargs)
-            # This helps to obtain proper error when the mask is not None.
-            if mask is None:
-                run_errors.append(tl.to_numpy(errors[-1]))
-            else:
-                run_errors.append(_compute_norm_error(tensor, tl_object, mask))

+            if metric == 'error':
+                # This helps to obtain proper error when the mask is not None.
+                if mask is None:
+                    run_errors.append(tl.to_numpy(errors[-1]))
+                else:
+                    run_errors.append(_compute_norm_error(tensor, tl_object, mask))
+            elif metric == 'similarity':
+                (weights, factors) = tl_object
+                run_errors.append(dict(zip(list(range(len(factors))), factors)))
+
+        if metric == 'similarity':
+            corridx_df = pairwise_correlation_index(run_errors)
+            run_errors = 1.0 - sp.distance.squareform(corridx_df.values)
+            #run_errors = 1.0 - corridx_df.max().values
+            run_errors = run_errors.tolist()
        all_loss.append(run_errors)

all_loss = np.array(all_loss).T
@@ -365,8 +411,10 @@ def _compute_norm_error(tensor, tl_object, mask=None):
mask_ = tl.tensor(mask, **context)
else:
mask_ = mask
-        diff = tl.tensor(np.ones(mask.shape), **context) - mask_
-        tensor_ = tensor * mask_ + rec_ * diff
+        #diff = tl.tensor(np.ones(mask.shape), **context) - mask_
+        #tensor_ = tensor * mask_ + rec_ * diff
+        tensor_ = tensor * mask_
+        rec_ = rec_ * mask_
else:
tensor_ = tensor

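The masked-error change above restricts the comparison to observed entries, rather than filling unobserved ones with the reconstruction. A hedged sketch of that idea with random stand-in tensors (not cell2cell's internal code):

import numpy as np
import tensorly as tl

# Stand-ins: an input tensor, a CP reconstruction, and a binary mask of observed entries
tensor = tl.tensor(np.random.rand(4, 3, 2))
rec = tl.tensor(np.random.rand(4, 3, 2))
mask = tl.tensor((np.random.rand(4, 3, 2) > 0.2).astype(float))

# Normalized error computed only where mask == 1
error = tl.norm(tensor * mask - rec * mask) / tl.norm(tensor * mask)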
56 changes: 55 additions & 1 deletion cell2cell/tensor/metrics.py
@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-

import numpy as np
+import pandas as pd

+from itertools import combinations

# Authors: Hratch Baghdassarian <[email protected]>, Erick Armingol <[email protected]>
# similarity metrics for tensor decompositions
@@ -122,4 +125,55 @@ def _compute_correlation_index(x1, x2, tol=5e-16):
score = (1 / (n_elements)) * (np.sum(np.abs(np.max(c_prod_mtx, 1) - 1)) + np.sum(np.abs(np.max(c_prod_mtx, 0) - 1)))
if score < tol:
score = 0
-    return score
+    return score


+def pairwise_correlation_index(factors, tol=5e-16, method='stacked'):
+    '''
+    Computes the CorrIndex between all pairs of factors
+
+    Parameters
+    ----------
+    factors : list
+        List with multiple Ordered dictionaries, each containing a dataframe with
+        the factor loadings for each dimension/order of the tensor. This is the
+        result from a tensor decomposition, it can be found as the attribute
+        `factors` in any tensor class derived from the class BaseTensor
+        (e.g. BaseTensor.factors).
+
+    tol : float, default=5e-16
+        Precision threshold below which to call the CorrIndex score 0.
+
+    method : str, default='stacked'
+        Method to obtain the CorrIndex by comparing the A matrices from two decompositions.
+        Possible options are:
+
+        - 'stacked' : The original method implemented in [1]. Here all A matrices from the same decomposition are
+                      vertically concatenated, building a big A matrix for each decomposition.
+        - 'max_score' : This computes the CorrIndex for each pair of A matrices (i.e. between A_1 in factors_1 and
+                        factors_2, between A_2 in factors_1 and factors_2, and so on). Then the max score is
+                        selected (the most conservative approach). In other words, it selects the max score among the
+                        CorrIndexes computed dimension-wise.
+        - 'min_score' : Similar to 'max_score', but the min score is selected (the least conservative approach).
+        - 'avg_score' : Similar to 'max_score', but the avg score is selected.
+
+    Returns
+    -------
+    scores : pd.DataFrame
+        Dataframe with the CorrIndex metric for each pair of decompositions.
+        This metric is bounded by [0, 1]; a lower score indicates higher similarity between matrices.
+    '''
+    N = len(factors)
+    idxs = list(range(N))
+    pairs = list(combinations(idxs, 2))
+    scores = pd.DataFrame(np.zeros((N, N)), index=idxs, columns=idxs)
+    for p1, p2 in pairs:
+        corrindex = correlation_index(factors_1=factors[p1],
+                                      factors_2=factors[p2],
+                                      tol=tol,
+                                      method=method
+                                      )
+        scores.at[p1, p2] = corrindex
+        scores.at[p2, p1] = corrindex
+    return scores
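A usage sketch for the new function; the loading matrices below are mock data, and their shapes and dimension keys are arbitrary assumptions rather than a required format:

import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from cell2cell.tensor.metrics import pairwise_correlation_index

# Three mock decompositions, each a dict of per-dimension loading DataFrames
rng = np.random.default_rng(0)
factors = [{dim: pd.DataFrame(rng.random((6, 4))) for dim in range(3)}
           for _ in range(3)]

scores = pairwise_correlation_index(factors, method='stacked')  # 3x3 symmetric DataFrame

# As in _multiple_runs_elbow_analysis, pairwise similarities follow as 1 - CorrIndex
similarities = 1.0 - squareform(scores.values)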