Hotfix for using variant mapping with BGEN files

I changed the logic flow to check the whitelist first and then do the variant mapping. The reason is that, when a variant mapping file is used to map between the genotype SNPs and the model SNPs, varid gets updated to varid_ when there is a match, so varid takes a new ID that doesn't match the IDs in the whitelist. With this update, we ensure that the ID change made by the mapping does not affect the selection at the whitelisting step.
hakyimlab · Oct 31, 2024 · e7ecd1e · e7ecd1e
1 parent 4b3ad95
commit e7ecd1e
Showing 1 changed file with 4 additions and 5 deletions.
diff --git a/software/metax/genotype/BGENGenotype.py b/software/metax/genotype/BGENGenotype.py
@@ -35,6 +35,9 @@ def bgen_file_geno_lines(file, variant_mapping = None, force_colon = False, use_
             if chr == "NA" or pos == "NA":
                 continue
 
+        if whitelist and not varid in whitelist:
+            continue
+
         if variant_mapping:
             if dict_mapping:
                 if not varid in variant_mapping:
@@ -48,10 +51,6 @@ def bgen_file_geno_lines(file, variant_mapping = None, force_colon = False, use_
         # the alleles in the genotype might be swapped respect the variant in the mapping
         # You should verify if you must match it
 
-
-        if whitelist and not varid in whitelist:
-            continue
-
         v = bgen["genotype"][variant.Index].compute()
         if v["phased"]:
             d = numpy.apply_along_axis(lambda x: x[1] + x[3], 1, numpy.array(v["probs"], dtype=float))
@@ -74,4 +73,4 @@ def get_samples(path):
     bgen = bgen_reader.read_bgen(path, verbose=False)
     samples = bgen["samples"].values
     samples = pandas.DataFrame({"FID":samples, "IID":samples})[["FID", "IID"]]
-    return samples
+    return samples