diff --git a/pgscatalog_utils/ancestry/ancestry_analysis.py b/pgscatalog_utils/ancestry/ancestry_analysis.py index a7dbd8e..4795aeb 100644 --- a/pgscatalog_utils/ancestry/ancestry_analysis.py +++ b/pgscatalog_utils/ancestry/ancestry_analysis.py @@ -55,18 +55,15 @@ def ancestry_analysis(): scorecols = list(pgs.columns) ## There should be perfect target sample overlap - assert all( - [ - x in pgs.loc["reference"].index - for x in reference_df.index.get_level_values(1) - ] - ), "Error: PGS data missing for reference samples with PCA data." - reference_df = pd.merge(reference_df, pgs, left_index=True, right_index=True) + assert set(reference_df.index.get_level_values(1)).issubset(pgs.loc["reference"].index),\ + "Error: PGS data missing for reference samples with PCA data." + reference_df = reference_df.sort_index() + reference_df = pd.concat([reference_df, pgs.loc[reference_df.index]], axis=1) - assert all( - [x in pgs.loc[args.d_target].index for x in target_df.index.get_level_values(1)] - ), "Error: PGS data missing for reference samples with PCA data." - target_df = pd.merge(target_df, pgs, left_index=True, right_index=True) + assert set(target_df.index.get_level_values(1)).issubset(pgs.loc[args.d_target].index), \ + "Error: PGS data missing for target samples with PCA data." + target_df = target_df.sort_index() + target_df = pd.concat([target_df, pgs.loc[target_df.index]], axis=1) del pgs # clear raw PGS from memory # Compare target sample ancestry/PCs to reference panel diff --git a/pgscatalog_utils/ancestry/tools.py b/pgscatalog_utils/ancestry/tools.py index 47cffaa..8ed8af9 100644 --- a/pgscatalog_utils/ancestry/tools.py +++ b/pgscatalog_utils/ancestry/tools.py @@ -61,6 +61,7 @@ def compare_ancestry(ref_df: pd.DataFrame, ref_pop_col: str, target_df: pd.DataF :param p_threshold: used to define LowConfidence population assignments :return: dataframes for reference (predictions on training set) and target (predicted labels) datasets """ + logger.debug("Starting ancestry comparison") # Check that datasets have the correct columns assert method in comparison_method_threshold.keys(), 'comparison method parameter must be Mahalanobis or RF' if method == 'Mahalanobis':