diff --git a/pyproject.toml b/pyproject.toml index 3c94e19..dc14b6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "fraposa-pgsc" -version = "1.0.0" +version = "1.0.1" description = "Tools to perform ancestry projection to a reference dataset within the calculator pipeline (pgsc_calc)" homepage = "https://github.com/PGScatalog/fraposa_pgsc" authors = ["smlmbrt "] diff --git a/src/fraposa_pgsc/fraposa.py b/src/fraposa_pgsc/fraposa.py index 9616c60..d3be16b 100644 --- a/src/fraposa_pgsc/fraposa.py +++ b/src/fraposa_pgsc/fraposa.py @@ -22,7 +22,6 @@ from sklearn.utils.extmath import randomized_svd from typing import Union -from .sampleid import SampleID def create_logger(out_filepref='fraposa'): log = logging.getLogger() @@ -139,25 +138,24 @@ def read_bed(bed_filepref, dtype=np.int8, filt_iid=None): n = len(fam) if filt_iid: - fam_ids = set(SampleID(x, y) for x, y in zip(fam['fid'], fam['iid'], strict=True)) - matched_ids = filt_iid.intersection(fam_ids) - if len(matched_ids) == 0: + fam_ids = list(zip(fam['fid'], fam['iid'])) # create tuples of ids from genotyping files + fam_mask = [x in filt_iid for x in fam_ids] # T/F overlap of genotype data and filter IDs (tuples) + n_matched = sum(fam_mask) + if n_matched == 0: raise ValueError(f"ERROR: 0 / {len(filt_iid)} ids in filter list match the study dataset") - elif len(fam_ids) != len(fam): + elif len(set(fam_ids)) != len(fam): raise ValueError("Samples with duplicated FID + IID detected, please remove and retry") - bed = np.zeros(shape=(p, len(matched_ids)), dtype=dtype) - # in will call SampleID's __hash__ method which uses (fid, iid) - fam_mask = pd.Series((x in matched_ids for x in fam_ids), dtype=bool) - i_extract = np.where(fam_mask == True) + bed = np.zeros(shape=(p, n_matched), dtype=dtype) + i_extract = [i for i,x in enumerate(fam_mask) if x is True] # idx to extract from genotype matrix for (i, (snp, genotypes)) in enumerate(pyp): bed[i,:] = genotypes[i_extract] fam = fam.loc[fam_mask,:] - if len(matched_ids) < len(filt_iid): - logging.warning('Warning: only {} / {} ids in filter list match the study dataset'.format(len(matched_ids), + if n_matched < len(filt_iid): + logging.warning('Warning: only {} / {} ids in filter list match the study dataset'.format(n_matched, len(filt_iid))) else: - logging.info('Extracted {} samples from study genotyping data'.format(len(matched_ids))) + logging.info('Extracted {} samples from study genotyping data'.format(n_matched)) else: bed = np.zeros(shape=(p, n), dtype=dtype) for (i, (snp, genotypes)) in enumerate(pyp): diff --git a/src/fraposa_pgsc/fraposa_runner.py b/src/fraposa_pgsc/fraposa_runner.py index 1c440ca..c5a42bb 100755 --- a/src/fraposa_pgsc/fraposa_runner.py +++ b/src/fraposa_pgsc/fraposa_runner.py @@ -2,7 +2,6 @@ import csv import fraposa_pgsc.fraposa as fp -from fraposa_pgsc.sampleid import SampleID import argparse @@ -38,8 +37,17 @@ def main(): try: with open(args.stu_filt_iid) as f: - reader = csv.reader(f, delimiter="\t") - stu_filt_iid = set(SampleID(x[0], x[1]) for x in list(reader)) + reader = csv.reader(f, delimiter="\t") # reads columns as str + stu_filt_iid = [] + for x in reader: + if x[0] == "0": + stu_filt_iid.append((x[1],x[1])) # replace missing FID with IID + else: + stu_filt_iid.append((x[0], x[1])) # return FID, IID + l_input = len(stu_filt_iid) + stu_filt_iid = set(stu_filt_iid) + if l_input != len(stu_filt_iid): + raise ValueError("Duplicate IDs found in filter list") except TypeError: stu_filt_iid = None except IndexError: diff --git a/src/fraposa_pgsc/sampleid.py b/src/fraposa_pgsc/sampleid.py deleted file mode 100644 index b4e04f0..0000000 --- a/src/fraposa_pgsc/sampleid.py +++ /dev/null @@ -1,28 +0,0 @@ -class SampleID: - """ A sample ID from a plink fam file, including FID and IID """ - def __init__(self, fid, iid): - self._fid = fid - self._iid = iid - - def __repr__(self): - return f"{self.__class__.__name__}(fid={repr(self.fid)}, iid={repr(self.iid)})" - - @property - def fid(self): - if self._fid == "0": # 0 means missing :) - return self._iid - else: - return self._fid - - @property - def iid(self): - return self._iid - - def __hash__(self): - return hash((self.fid, self.iid)) - - def __eq__(self, other): - if not isinstance(other, SampleID): - return NotImplemented - - return self.fid == other.fid and self.iid == other.iid