Merge pull request #17 from earmingol/new_version

Updates for v0.6.2

earmingol authored Nov 4, 2022
2 parents 97f498b + 2b3ec53 commit 3254faa
Showing 9 changed files with 263 additions and 37 deletions.
2 changes: 1 addition & 1 deletion cell2cell/__init__.py
@@ -14,4 +14,4 @@
from cell2cell import tensor
from cell2cell import utils

__version__ = "0.6.1"
__version__ = "0.6.2"
26 changes: 20 additions & 6 deletions cell2cell/plotting/tensor_plot.py
@@ -7,6 +7,7 @@
from matplotlib import pyplot as plt

from cell2cell.plotting.aesthetics import generate_legend, get_colors_from_labels, map_colors_to_metadata
+from cell2cell.preprocessing.signal import smooth_curve


def tensor_factors_plot(interaction_tensor, order_labels=None, reorder_elements=None, metadata=None,
@@ -341,7 +342,7 @@ def reorder_dimension_elements(factors, reorder_elements, metadata=None):
return reordered_factors, new_metadata


-def plot_elbow(loss, elbow=None, figsize=(4, 2.25), fontsize=14, filename=None):
+def plot_elbow(loss, elbow=None, figsize=(4, 2.25), ylabel='Normalized Error', fontsize=14, filename=None):
'''Plots the errors of an elbow analysis with just one run of a tensor factorization
for each rank.
@@ -356,7 +357,10 @@ def plot_elbow(loss, elbow=None, figsize=(4, 2.25), fontsize=14, filename=None):
elbow.
figsize : tuple, default=(4, 2.25)
-        Figure size, width by height
+        Figure size, width by height
+
+    ylabel : str, default='Normalized Error'
+        Label for the y-axis
fontsize : int, default=14
Fontsize for axis labels.
@@ -376,7 +380,7 @@ def plot_elbow(loss, elbow=None, figsize=(4, 2.25), fontsize=14, filename=None):
plt.plot(*zip(*loss))
plt.tick_params(axis='both', labelsize=fontsize)
plt.xlabel('Rank', fontsize=int(1.2*fontsize))
-    plt.ylabel('Error', fontsize=int(1.2 * fontsize))
+    plt.ylabel(ylabel, fontsize=int(1.2 * fontsize))

if elbow is not None:
_ = plt.plot(*loss[elbow - 1], 'ro')
@@ -387,7 +391,8 @@ def plot_elbow(loss, elbow=None, figsize=(4, 2.25), fontsize=14, filename=None):
return fig


-def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), fontsize=14, filename=None):
+def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), ylabel='Normalized Error', fontsize=14,
+                            smooth=False, filename=None):
'''Plots the errors of an elbow analysis with multiple runs of a tensor
factorization for each rank.
@@ -406,11 +411,17 @@ def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), fontsize=14, filename=None):
{'std', '95%'}
figsize : tuple, default=(4, 2.25)
-        Figure size, width by height
+        Figure size, width by height
+
+    ylabel : str, default='Normalized Error'
+        Label for the y-axis

    fontsize : int, default=14
        Fontsize for axis labels.

+    smooth : boolean, default=False
+        Whether to smooth the curve with a Savitzky-Golay filter.
filename : str, default=None
Path to save the figure of the elbow analysis. If None, the figure is not
saved.
@@ -426,6 +437,9 @@ def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), fontsize=14, filename=None):
mean = np.nanmean(all_loss, axis=0)
std = np.nanstd(all_loss, axis=0)

+    if smooth:
+        mean = smooth_curve(mean)

# Plot Mean
plt.plot(x, mean, 'ob')

Expand All @@ -443,7 +457,7 @@ def plot_multiple_run_elbow(all_loss, elbow=None, ci='95%', figsize=(4, 2.25), f

plt.tick_params(axis='both', labelsize=fontsize)
plt.xlabel('Rank', fontsize=int(1.2*fontsize))
-    plt.ylabel('Error', fontsize=int(1.2 * fontsize))
+    plt.ylabel(ylabel, fontsize=int(1.2 * fontsize))

if elbow is not None:
_ = plt.plot(x[elbow - 1], mean[elbow - 1], 'ro')
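As a quick orientation for these plotting changes, here is a minimal usage sketch of the updated functions. The loss values are synthetic, invented purely for illustration; only the function signatures come from this commit:

import numpy as np
from cell2cell.plotting.tensor_plot import plot_elbow, plot_multiple_run_elbow

# Synthetic elbow curves: 10 runs x 15 ranks of noisy, decreasing errors
rng = np.random.default_rng(0)
ranks = np.arange(1, 16)
all_loss = 1.0 / ranks + rng.normal(0.0, 0.01, size=(10, ranks.size))

# Single-run variant: a list of (rank, error) pairs, with the new ylabel argument
fig1 = plot_elbow(list(zip(ranks, all_loss[0])), elbow=4, ylabel='Normalized Error')

# Multiple-run variant: the new smooth flag applies smooth_curve() to the mean curve
fig2 = plot_multiple_run_elbow(all_loss, elbow=4, ci='std',
                               ylabel='Normalized Error', smooth=True)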
4 changes: 3 additions & 1 deletion cell2cell/preprocessing/__init__.py
@@ -17,4 +17,6 @@
get_genes_from_complexes, preprocess_ppi_data)
from cell2cell.preprocessing.rnaseq import (divide_expression_by_max, divide_expression_by_mean, drop_empty_genes,
log10_transformation, scale_expression_by_sum, add_complexes_to_expression,
-                                            aggregate_single_cells)
+                                            aggregate_single_cells)
+
+from cell2cell.preprocessing.signal import (smooth_curve)
35 changes: 35 additions & 0 deletions cell2cell/preprocessing/signal.py
@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-

from scipy.signal import savgol_filter


def smooth_curve(values, window_length=None, polyorder=3, **kwargs):
    '''Apply a Savitzky-Golay filter to an array to smooth the curve.

    Parameters
    ----------
    values : array-like
        An array or list of values.

    window_length : int, default=None
        Size of the window of values to use to smooth the curve.

    polyorder : int, default=3
        The order of the polynomial used to fit the samples.

    **kwargs : dict
        Extra arguments for the scipy.signal.savgol_filter function.

    Returns
    -------
    smooth_values : array-like
        An array or list of values representing the smoothed curve.
    '''
    size = len(values)
    if window_length is None:
        window_length = int(size / min([2, size]))
    if window_length % 2 == 0:
        window_length += 1
    assert(polyorder < window_length), "polyorder must be less than window_length."
    smooth_values = savgol_filter(values, window_length, polyorder, **kwargs)
    return smooth_values
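A short, self-contained sketch of how smooth_curve behaves (the noisy curve below is invented for illustration):

import numpy as np
from cell2cell.preprocessing.signal import smooth_curve

# A noisy, decreasing curve, similar to mean errors from an elbow analysis
rng = np.random.default_rng(42)
y = np.exp(-0.3 * np.arange(25)) + rng.normal(0.0, 0.02, size=25)

# With window_length=None the window spans about half of the points and is
# forced to be odd, as scipy.signal.savgol_filter requires
y_smooth = smooth_curve(y, polyorder=3)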
2 changes: 1 addition & 1 deletion cell2cell/tensor/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from cell2cell.tensor.external_scores import (dataframes_to_tensor)
from cell2cell.tensor.factor_manipulation import (normalize_factors)
-from cell2cell.tensor.metrics import (correlation_index)
+from cell2cell.tensor.metrics import (correlation_index, pairwise_correlation_index)
from cell2cell.tensor.tensor import (InteractionTensor, PreBuiltTensor, build_context_ccc_tensor, generate_tensor_metadata,
interactions_to_tensor)
from cell2cell.tensor.tensor_manipulation import (concatenate_interaction_tensors)
68 changes: 58 additions & 10 deletions cell2cell/tensor/factorization.py
@@ -1,12 +1,15 @@
# -*- coding: utf-8 -*-

import numpy as np
+import scipy.spatial as sp
from tqdm.auto import tqdm
from kneed import KneeLocator

from tensorly.decomposition import non_negative_parafac, non_negative_parafac_hals, parafac, constrained_parafac
import tensorly as tl

+from cell2cell.tensor.metrics import pairwise_correlation_index


def _compute_tensor_factorization(tensor, rank, tf_type='non_negative_cp', init='svd', svd='numpy_svd', random_state=None,
mask=None, n_iter_max=100, tol=10e-7, verbose=False, **kwargs):
@@ -140,20 +143,42 @@ def _compute_tensor_factorization(tensor, rank, tf_type='non_negative_cp', init='svd', svd='numpy_svd', random_state=None,
return cp_tf


-def _compute_elbow(loss):
+def _compute_elbow(loss, S=1.0, curve='convex', direction='decreasing', interp_method='interp1d'):
'''Computes the elbow of a curve
Parameters
----------
loss : list
List of lists or tuples with (x, y) coordinates for a curve.
+    S : float, default=1.0
+        The sensitivity parameter allows us to adjust how aggressive
+        we want Kneedle to be when detecting knees. Smaller values
+        for S detect knees quicker, while larger values are more conservative.
+        Put simply, S is a measure of how many “flat” points we expect to
+        see in the unmodified data curve before declaring a knee.
+
+    curve : str, default='convex'
+        If curve='concave', kneed will detect knees. If curve='convex',
+        it will detect elbows.
+
+    direction : str, default='decreasing'
+        The direction parameter describes the line from left to right on the
+        x-axis. If the knee/elbow you are trying to identify is on a positive
+        slope use direction='increasing', if the knee/elbow you are trying to
+        identify is on a negative slope, use direction='decreasing'.
+
+    interp_method : str, default='interp1d'
+        This parameter controls the interpolation method for fitting a
+        spline to the input x and y data points.
+        Valid arguments are 'interp1d' and 'polynomial'.
Returns
-------
rank : int
X coordinate where the elbow is located in the curve.
'''
-    kneedle = KneeLocator(*zip(*loss), S=1.0, curve="convex", direction="decreasing")
+    kneedle = KneeLocator(*zip(*loss), S=S, curve=curve, direction=direction, interp_method=interp_method)
rank = kneedle.elbow
return rank
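For reference, _compute_elbow is a thin wrapper around kneed's KneeLocator; a self-contained sketch with invented (rank, error) coordinates:

from kneed import KneeLocator

# Illustrative (rank, error) pairs with a visible bend in the curve
loss = [(1, 0.50), (2, 0.30), (3, 0.20), (4, 0.17), (5, 0.15), (6, 0.14)]

x, y = zip(*loss)
kneedle = KneeLocator(x, y, S=1.0, curve='convex', direction='decreasing',
                      interp_method='interp1d')
print(kneedle.elbow)  # x coordinate at which the elbow is detected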

Expand Down Expand Up @@ -245,7 +270,8 @@ def _run_elbow_analysis(tensor, upper_rank=50, tf_type='non_negative_cp', init='


def _multiple_runs_elbow_analysis(tensor, upper_rank=50, runs=10, tf_type='non_negative_cp', init='svd', svd='numpy_svd',
-                                  random_state=None, mask=None, n_iter_max=100, tol=10e-7, verbose=False, **kwargs):
+                                  metric='error', random_state=None, mask=None, n_iter_max=100, tol=10e-7,
+                                  verbose=False, **kwargs):
'''Performs an elbow analysis with multiple runs of a tensor factorization for each
rank
@@ -270,6 +296,15 @@ def _multiple_runs_elbow_analysis(tensor, upper_rank=50, runs=10, tf_type='non_negative_cp', init='svd', svd='numpy_svd',
Initialization method for computing the Tensor Factorization.
{‘svd’, ‘random’}
svd : str, default='numpy_svd'
Function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS
+    metric : str, default='error'
+        Metric to perform the elbow analysis (y-axis)
+        - 'error' : Normalized error to compute the elbow.
+        - 'similarity' : Similarity based on CorrIndex (1 - CorrIndex).
random_state : int, default=None
Seed for randomization.
@@ -315,11 +350,22 @@ def _multiple_runs_elbow_analysis(tensor, upper_rank=50, runs=10, tf_type='non_negative_cp', init='svd', svd='numpy_svd',
tol=tol,
verbose=verbose,
**kwargs)
-            # This helps to obtain proper error when the mask is not None.
-            if mask is None:
-                run_errors.append(tl.to_numpy(errors[-1]))
-            else:
-                run_errors.append(_compute_norm_error(tensor, tl_object, mask))

+            if metric == 'error':
+                # This helps to obtain proper error when the mask is not None.
+                if mask is None:
+                    run_errors.append(tl.to_numpy(errors[-1]))
+                else:
+                    run_errors.append(_compute_norm_error(tensor, tl_object, mask))
+            elif metric == 'similarity':
+                (weights, factors) = tl_object
+                run_errors.append(dict(zip(list(range(len(factors))), factors)))
+
+        if metric == 'similarity':
+            corridx_df = pairwise_correlation_index(run_errors)
+            run_errors = 1.0 - sp.distance.squareform(corridx_df.values)
+            #run_errors = 1.0 - corridx_df.max().values
+            run_errors = run_errors.tolist()
        all_loss.append(run_errors)

all_loss = np.array(all_loss).T
@@ -365,8 +411,10 @@ def _compute_norm_error(tensor, tl_object, mask=None):
mask_ = tl.tensor(mask, **context)
else:
mask_ = mask
-        diff = tl.tensor(np.ones(mask.shape), **context) - mask_
-        tensor_ = tensor * mask_ + rec_ * diff
+        #diff = tl.tensor(np.ones(mask.shape), **context) - mask_
+        #tensor_ = tensor * mask_ + rec_ * diff
+        tensor_ = tensor * mask_
+        rec_ = rec_ * mask_
else:
tensor_ = tensor

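The masked-error change above restricts the comparison to observed entries, rather than filling unobserved ones with the reconstruction. A hedged sketch of that idea with random stand-in tensors (not cell2cell's internal code):

import numpy as np
import tensorly as tl

# Stand-ins: an input tensor, a CP reconstruction, and a binary mask of observed entries
tensor = tl.tensor(np.random.rand(4, 3, 2))
rec = tl.tensor(np.random.rand(4, 3, 2))
mask = tl.tensor((np.random.rand(4, 3, 2) > 0.2).astype(float))

# Normalized error computed only where mask == 1
error = tl.norm(tensor * mask - rec * mask) / tl.norm(tensor * mask)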
56 changes: 55 additions & 1 deletion cell2cell/tensor/metrics.py
@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-

import numpy as np
+import pandas as pd

+from itertools import combinations

# Authors: Hratch Baghdassarian <[email protected]>, Erick Armingol <[email protected]>
# similarity metrics for tensor decompositions
@@ -122,4 +125,55 @@ def _compute_correlation_index(x1, x2, tol=5e-16):
score = (1 / (n_elements)) * (np.sum(np.abs(np.max(c_prod_mtx, 1) - 1)) + np.sum(np.abs(np.max(c_prod_mtx, 0) - 1)))
if score < tol:
score = 0
-    return score
+    return score


+def pairwise_correlation_index(factors, tol=5e-16, method='stacked'):
+    '''
+    Computes the CorrIndex between all pairs of factors
+
+    Parameters
+    ----------
+    factors : list
+        List with multiple Ordered dictionaries, each containing a dataframe with
+        the factor loadings for each dimension/order of the tensor. This is the
+        result from a tensor decomposition, it can be found as the attribute
+        `factors` in any tensor class derived from the class BaseTensor
+        (e.g. BaseTensor.factors).
+
+    tol : float, default=5e-16
+        Precision threshold below which to call the CorrIndex score 0.
+
+    method : str, default='stacked'
+        Method to obtain the CorrIndex by comparing the A matrices from two decompositions.
+        Possible options are:
+
+        - 'stacked' : The original method implemented in [1]. Here all A matrices from the same decomposition are
+                      vertically concatenated, building a big A matrix for each decomposition.
+        - 'max_score' : This computes the CorrIndex for each pair of A matrices (i.e. between A_1 in factors_1 and
+                        factors_2, between A_2 in factors_1 and factors_2, and so on). Then the max score is
+                        selected (the most conservative approach). In other words, it selects the max score among the
+                        CorrIndexes computed dimension-wise.
+        - 'min_score' : Similar to 'max_score', but the min score is selected (the least conservative approach).
+        - 'avg_score' : Similar to 'max_score', but the avg score is selected.
+
+    Returns
+    -------
+    scores : pd.DataFrame
+        Dataframe with the CorrIndex metric for each pair of decompositions.
+        This metric is bounded by [0, 1]; a lower score indicates higher similarity between matrices.
+    '''
+    N = len(factors)
+    idxs = list(range(N))
+    pairs = list(combinations(idxs, 2))
+    scores = pd.DataFrame(np.zeros((N, N)), index=idxs, columns=idxs)
+    for p1, p2 in pairs:
+        corrindex = correlation_index(factors_1=factors[p1],
+                                      factors_2=factors[p2],
+                                      tol=tol,
+                                      method=method
+                                      )
+        scores.at[p1, p2] = corrindex
+        scores.at[p2, p1] = corrindex
+    return scores
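A usage sketch for the new function; the loading matrices below are mock data, and their shapes and dimension keys are arbitrary assumptions rather than a required format:

import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from cell2cell.tensor.metrics import pairwise_correlation_index

# Three mock decompositions, each a dict of per-dimension loading DataFrames
rng = np.random.default_rng(0)
factors = [{dim: pd.DataFrame(rng.random((6, 4))) for dim in range(3)}
           for _ in range(3)]

scores = pairwise_correlation_index(factors, method='stacked')  # 3x3 symmetric DataFrame

# As in _multiple_runs_elbow_analysis, pairwise similarities follow as 1 - CorrIndex
similarities = 1.0 - squareform(scores.values)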