Replace very slow merges with concatenations (of df subsets), and VERY SLOW list comparisons with set overlaps.

Signed-off-by: smlmbrt <[email protected]>
PGScatalog · Mar 2, 2024 · a713443 · a713443
1 parent b1760da
commit a713443
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 11 deletions.
diff --git a/pgscatalog_utils/ancestry/ancestry_analysis.py b/pgscatalog_utils/ancestry/ancestry_analysis.py
@@ -55,18 +55,15 @@ def ancestry_analysis():
     scorecols = list(pgs.columns)
 
     ## There should be perfect target sample overlap
-    assert all(
-        [
-            x in pgs.loc["reference"].index
-            for x in reference_df.index.get_level_values(1)
-        ]
-    ), "Error: PGS data missing for reference samples with PCA data."
-    reference_df = pd.merge(reference_df, pgs, left_index=True, right_index=True)
+    assert set(reference_df.index.get_level_values(1)).issubset(pgs.loc["reference"].index),\
+        "Error: PGS data missing for reference samples with PCA data."
+    reference_df = reference_df.sort_index()
+    reference_df = pd.concat([reference_df, pgs.loc[reference_df.index]], axis=1)
 
-    assert all(
-        [x in pgs.loc[args.d_target].index for x in target_df.index.get_level_values(1)]
-    ), "Error: PGS data missing for reference samples with PCA data."
-    target_df = pd.merge(target_df, pgs, left_index=True, right_index=True)
+    assert set(target_df.index.get_level_values(1)).issubset(pgs.loc[args.d_target].index), \
+        "Error: PGS data missing for target samples with PCA data."
+    target_df = target_df.sort_index()
+    target_df = pd.concat([target_df, pgs.loc[target_df.index]], axis=1)
     del pgs  # clear raw PGS from memory
 
     # Compare target sample ancestry/PCs to reference panel

diff --git a/pgscatalog_utils/ancestry/tools.py b/pgscatalog_utils/ancestry/tools.py
@@ -61,6 +61,7 @@ def compare_ancestry(ref_df: pd.DataFrame, ref_pop_col: str, target_df: pd.DataF
     :param p_threshold: used to define LowConfidence population assignments
     :return: dataframes for reference (predictions on training set) and target (predicted labels) datasets
     """
+    logger.debug("Starting ancestry comparison")
     # Check that datasets have the correct columns
     assert method in comparison_method_threshold.keys(), 'comparison method parameter must be Mahalanobis or RF'
     if method == 'Mahalanobis':