Skip to content
This repository has been archived by the owner on Jan 21, 2025. It is now read-only.

Commit

Permalink
Replace very slow merges with concatenations (of df subsets), and VERY SLOW list comparisons with set overlaps.
Browse files Browse the repository at this point in the history

Signed-off-by: smlmbrt <[email protected]>
  • Loading branch information
smlmbrt committed Mar 2, 2024
1 parent b1760da commit a713443
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 11 deletions.
19 changes: 8 additions & 11 deletions pgscatalog_utils/ancestry/ancestry_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,15 @@ def ancestry_analysis():
scorecols = list(pgs.columns)

## There should be perfect target sample overlap
assert all(
[
x in pgs.loc["reference"].index
for x in reference_df.index.get_level_values(1)
]
), "Error: PGS data missing for reference samples with PCA data."
reference_df = pd.merge(reference_df, pgs, left_index=True, right_index=True)
assert set(reference_df.index.get_level_values(1)).issubset(pgs.loc["reference"].index),\
"Error: PGS data missing for reference samples with PCA data."
reference_df = reference_df.sort_index()
reference_df = pd.concat([reference_df, pgs.loc[reference_df.index]], axis=1)

assert all(
[x in pgs.loc[args.d_target].index for x in target_df.index.get_level_values(1)]
), "Error: PGS data missing for reference samples with PCA data."
target_df = pd.merge(target_df, pgs, left_index=True, right_index=True)
assert set(target_df.index.get_level_values(1)).issubset(pgs.loc[args.d_target].index), \
"Error: PGS data missing for target samples with PCA data."
target_df = target_df.sort_index()
target_df = pd.concat([target_df, pgs.loc[target_df.index]], axis=1)
del pgs # clear raw PGS from memory

# Compare target sample ancestry/PCs to reference panel
Expand Down
1 change: 1 addition & 0 deletions pgscatalog_utils/ancestry/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def compare_ancestry(ref_df: pd.DataFrame, ref_pop_col: str, target_df: pd.DataF
:param p_threshold: used to define LowConfidence population assignments
:return: dataframes for reference (predictions on training set) and target (predicted labels) datasets
"""
logger.debug("Starting ancestry comparison")
# Check that datasets have the correct columns
assert method in comparison_method_threshold.keys(), 'comparison method parameter must be Mahalanobis or RF'
if method == 'Mahalanobis':
Expand Down

0 comments on commit a713443

Please sign in to comment.