update return type

YQ-Wang · Apr 8, 2024 · bb6ae97 · bb6ae97
1 parent ef32e1d
commit bb6ae97
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# scBSP
+# scBSP - A Fast Tool for Single-Cell Spatially Variable Gene Identification on Large-Scale Spatially Resolved Transcriptomics Data
 
 scBSP is a dedicated software package crafted for the nuanced domain of biological data processing, emphasizing gene expression analysis and cell coordinate evaluation. It offers a streamlined method to calculate p-values for a set of genes by leveraging input matrices that encapsulate cell coordinates and gene expression data.
 
@@ -64,4 +64,4 @@ p_values = scbsp.granp(input_sp_mat, input_exp_mat_raw, d1, d2)
 
 ## Output
 
-The function returns a list of p-values, each corresponding to the genes in the provided gene expression matrix. These p-values help in identifying significant differences in gene expression across different cell coordinates, facilitating advanced biological data analysis.
+The function returns a pandas DataFrame with two columns: `gene_names` and `p_values`. Each row of this DataFrame represents one gene from the input gene expression matrix. The `gene_names` column holds the identifier for each gene (the column names when the expression matrix is passed as a pandas DataFrame; otherwise generated names of the form `Gene_<index>`), while the `p_values` column quantifies the statistical significance of the expression differences observed across cell coordinates. This structured format simplifies downstream biological analyses, making it straightforward to identify and investigate genes with significant expression variability.
diff --git a/scbsp/scbsp.py b/scbsp/scbsp.py
@@ -204,7 +204,7 @@ def granp(
     d1: float = 1.0,
     d2: float = 3.0,
     leaf_size: int = 80,
-) -> List[float]:
+) -> pd.DataFrame:
     """
     Calculates the p-values for genomic data.
 
@@ -216,8 +216,15 @@ def granp(
         leaf_size: An integer that determines the maximum number of points after which the Ball Tree algorithm opts for a brute-force search approach.
 
     Returns:
-        A list of p-values.
+        A Pandas DataFrame with columns ['gene_names', 'p_values'].
     """
+    # Extract column names if input_exp_mat_raw is a Pandas DataFrame, else use indices
+    if isinstance(input_exp_mat_raw, pd.DataFrame):
+        gene_names = input_exp_mat_raw.columns.astype(str).tolist()
+        input_exp_mat_raw = csr_matrix(input_exp_mat_raw)
+    else:
+        gene_names = [f'Gene_{i}' for i in range(input_exp_mat_raw.shape[1])]
+        input_exp_mat_raw = input_exp_mat_raw if isspmatrix_csr(input_exp_mat_raw) else csr_matrix(input_exp_mat_raw)
 
     # Scale the distance thresholds according to the geometric mean of data spread.
     scale_factor = (
@@ -231,10 +238,6 @@ def granp(
     d1 *= scale_factor
     d2 *= scale_factor
 
-    # Ensure the expression matrix is in csr_matrix format.
-    if not isspmatrix_csr(input_exp_mat_raw):
-        input_exp_mat_raw = csr_matrix(input_exp_mat_raw)
-
     t_matrix_sum = _get_test_scores(input_sp_mat, input_exp_mat_raw, d1, d2, leaf_size)
 
     # Calculate p-values
@@ -248,4 +251,7 @@ def granp(
         t_matrix_sum, scale=np.exp(log_norm_params[0]), s=log_norm_params[1]
     )
 
-    return p_values
+    return pd.DataFrame({
+        'gene_names': gene_names,
+        'p_values': p_values
+    })
diff --git a/test/test_scbsp.py b/test/test_scbsp.py
@@ -148,7 +148,7 @@ def test_p_value_calculation(self):
 
         p_values = granp(input_sp_mat, input_exp_mat_raw)
 
-        self.assertEqual(sum([(i < 0.0001).astype(int) for i in p_values[0:999]]), 996)
+        self.assertEqual((p_values['p_values'].iloc[0:999] < 0.0001).sum(), 996)
 
 
 if __name__ == "__main__":