From ff8c31acef3da0fb438dbefb43eb853f20f1c39e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 May 2024 14:01:40 +0000 Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/ml/datasets/pdb_data.py | 73 ++++++++++++++++++++------------ graphein/protein/utils.py | 4 +- 2 files changed, 49 insertions(+), 28 deletions(-) diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py index e1482af4..f4ebc261 100644 --- a/graphein/ml/datasets/pdb_data.py +++ b/graphein/ml/datasets/pdb_data.py @@ -93,8 +93,6 @@ def __init__( self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz" - - self.pdb_dir = self.root_dir / "pdb" if not os.path.exists(self.pdb_dir): os.makedirs(self.pdb_dir) @@ -111,9 +109,13 @@ def __init__( self.pdb_deposition_date_url ).name self.pdb_availability_filename = Path(self.pdb_availability_url).name - self.pdb_chain_cath_uniprot_filename = Path(self.pdb_chain_cath_uniprot_url).name + self.pdb_chain_cath_uniprot_filename = Path( + self.pdb_chain_cath_uniprot_url + ).name self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name - self.pdb_chain_ec_number_filename = Path(self.pdb_chain_ec_number_url).name + self.pdb_chain_ec_number_filename = Path( + self.pdb_chain_ec_number_url + ).name self.list_columns = ["ligands"] @@ -428,16 +430,20 @@ def _download_pdb_availability(self): log.info("Downloading PDB availability map...") wget.download(self.pdb_availability_url, out=str(self.root_dir)) log.debug("Downloaded PDB availability map") - + def _download_pdb_chain_cath_uniprot_map(self): """Download mapping from PDB chain to uniprot accession and CATH ID from https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz """ - if not os.path.exists(self.root_dir / self.pdb_chain_cath_uniprot_filename): + if not os.path.exists( + self.root_dir / self.pdb_chain_cath_uniprot_filename + ): log.info("Downloading Uniprot CATH map...") - wget.download(self.pdb_chain_cath_uniprot_url, out=str(self.root_dir)) + wget.download( + self.pdb_chain_cath_uniprot_url, out=str(self.root_dir) + ) log.debug("Downloaded Uniprot CATH map") - + def _download_cath_id_cath_code_map(self): """Download mapping from CATH IDs to CATH code from http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz @@ -451,7 +457,9 @@ def _download_pdb_chain_ec_number_map(self): """Download mapping from PDB chains to EC number from https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz """ - if not os.path.exists(self.root_dir / self.pdb_chain_ec_number_filename): + if not os.path.exists( + self.root_dir / self.pdb_chain_ec_number_filename + ): log.info("Downloading EC number map...") wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir)) log.debug("Downloaded EC number map") @@ -589,16 +597,18 @@ def _parse_uniprot_id(self) -> Dict[str, str]: :rtype: Dict[str, str] """ uniprot_mapping = {} - with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f: + with gzip.open( + self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt" + ) as f: for line in f: try: - pdb, chain, uniprot_id, cath_id = line.strip().split('\t') + pdb, chain, uniprot_id, cath_id = line.strip().split("\t") key = f"{pdb}_{chain}" uniprot_mapping[key] = uniprot_id except ValueError: continue return uniprot_mapping - + def _parse_cath_id(self) -> Dict[str, str]: """Parse the CATH ID for all PDB chains. @@ -607,17 +617,19 @@ def _parse_cath_id(self) -> Dict[str, str]: :rtype: Dict[str, str] """ cath_mapping = {} - with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f: - next(f) # Skip header line + with gzip.open( + self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt" + ) as f: + next(f) # Skip header line for line in f: try: - pdb, chain, uniprot_id, cath_id = line.strip().split('\t') + pdb, chain, uniprot_id, cath_id = line.strip().split("\t") key = f"{pdb}_{chain}" cath_mapping[key] = cath_id except ValueError: continue return cath_mapping - + def _parse_cath_code(self) -> Dict[str, str]: """Parse the CATH code for all CATH IDs. @@ -626,18 +638,22 @@ def _parse_cath_code(self) -> Dict[str, str]: :rtype: Dict[str, str] """ cath_mapping = {} - with gzip.open(self.root_dir / self.cath_id_cath_code_filename, 'rt') as f: + with gzip.open( + self.root_dir / self.cath_id_cath_code_filename, "rt" + ) as f: print(f) for line in f: print(line) try: - cath_id, cath_version, cath_code, cath_segment = line.strip().split() + cath_id, cath_version, cath_code, cath_segment = ( + line.strip().split() + ) cath_mapping[cath_id] = cath_code print(cath_id, cath_code) except ValueError: continue return cath_mapping - + def _parse_ec_number(self) -> Dict[str, str]: """Parse the CATH ID for all PDB chains and adds None when no EC number is present. @@ -646,13 +662,17 @@ def _parse_ec_number(self) -> Dict[str, str]: :rtype: Dict[str, str] """ ec_mapping = {} - with gzip.open(self.root_dir / self.pdb_chain_ec_number_filename, 'rt') as f: - next(f) # Skip header line + with gzip.open( + self.root_dir / self.pdb_chain_ec_number_filename, "rt" + ) as f: + next(f) # Skip header line for line in f: try: - pdb, chain, uniprot_id, ec_number = line.strip().split('\t') + pdb, chain, uniprot_id, ec_number = line.strip().split( + "\t" + ) key = f"{pdb}_{chain}" - ec_number = None if ec_number == '?' else ec_number + ec_number = None if ec_number == "?" else ec_number ec_mapping[key] = ec_number except ValueError: continue @@ -1303,13 +1323,12 @@ def has_uniprot_id( :rtype: pd.DataFrame """ splits_df = self.get_splits(splits) - df = splits_df.dropna(subset=['uniprot_id']) + df = splits_df.dropna(subset=["uniprot_id"]) if update: self.df = df return df - def has_cath_code( self, splits: Optional[List[str]] = None, @@ -1329,7 +1348,7 @@ def has_cath_code( :rtype: pd.DataFrame """ splits_df = self.get_splits(splits) - df = splits_df.dropna(subset=['cath_code']) + df = splits_df.dropna(subset=["cath_code"]) if update: self.df = df @@ -1354,7 +1373,7 @@ def has_ec_number( :rtype: pd.DataFrame """ splits_df = self.get_splits(splits) - df = splits_df.dropna(subset=['ec_number']) + df = splits_df.dropna(subset=["ec_number"]) if update: self.df = df diff --git a/graphein/protein/utils.py b/graphein/protein/utils.py index f69a0684..c16669f1 100644 --- a/graphein/protein/utils.py +++ b/graphein/protein/utils.py @@ -190,7 +190,9 @@ def download_pdb( BASE_URL = "https://models.rcsb.org/" extension = ".bcif.gz" else: - raise ValueError(f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'.") + raise ValueError( + f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'." + ) # Make output directory if it doesn't exist or set it to tempdir if None if out_dir is not None: