[feat] add uniprot ecnumber and cath label options to pdb manager (#398)
* [feat] add uniprot ecnumber and cath label options to pdb manager

* [doc] added to changelog

* [fix] address pandas warnings

* [feat] improve type hinting and add subselecting based on labels

* [fix] pin setuptools in setup.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* pin setuptools version for CI

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Arian Jamasb <[email protected]>
3 people authored Jun 5, 2024
1 parent 53a76be commit 27463a5
Showing 5 changed files with 286 additions and 13 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build.yaml
@@ -40,6 +40,8 @@ jobs:
channels: "conda-forge, salilab, pytorch, pyg"
python-version: ${{ matrix.python-version }}
use-mamba: true
- name: Install setuptools
run: pip install setuptools==69.5.1
- name: Install Boost 1.73.0 (for DSSP)
run: conda install -c anaconda libboost=1.73.0
- name: Install DSSP
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@
* Fix cluster file loading bug in `pdb_data.py` [#396](https://github.com/a-r-j/graphein/pull/396)

#### Misc
* add metadata options for UniProt ID, EC number and CATH code to the PDB manager [#398](https://github.com/a-r-j/graphein/pull/398)
* bumped logging level down from `INFO` to `DEBUG` in several places to reduce output length [#391](https://github.com/a-r-j/graphein/pull/391)
* exposed `fill_value` and `bfactor` option to `protein_to_pyg` function. [#385](https://github.com/a-r-j/graphein/pull/385) and [#388](https://github.com/a-r-j/graphein/pull/388)
* Updated Foldcomp datasets with improved setup function and updated database choices such as ESMAtlas. [#382](https://github.com/a-r-j/graphein/pull/382)
278 changes: 269 additions & 9 deletions graphein/ml/datasets/pdb_data.py
@@ -6,7 +6,7 @@
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from typing import Any, Callable, Dict, List, Literal, Optional, Union

import numpy as np
import pandas as pd
@@ -36,13 +36,16 @@ def __init__(
split_ratios: Optional[List[float]] = None,
split_time_frames: Optional[List[np.datetime64]] = None,
assign_leftover_rows_to_split_n: int = 0,
labels: Optional[
List[Literal["uniprot_id", "cath_code", "ec_number"]]
] = None,
):
"""Instantiate a selection of experimental PDB structures.
:param root_dir: The directory in which to store all PDB entries,
defaults to ``"."``.
:type root_dir: str, optional
:param structure_format: Whether to use ``.pdb`` or ``.mmtf`` file.
:param structure_format: Whether to use ``.pdb``, ``.mmtf`` or ``.mmcif`` files.
Defaults to ``"pdb"``.
:type structure_format: str, optional
:param splits: A list of names corresponding to each dataset split,
@@ -58,6 +61,9 @@ def __init__(
to assign any rows remaining after creation of new dataset splits,
defaults to ``0``.
:type assign_leftover_rows_to_split_n: int, optional
:param labels: A list of names corresponding to metadata labels that should be included in the PDB manager dataframe,
defaults to ``None``.
:type labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]], optional
"""
# Arguments
self.root_dir = Path(root_dir)
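
As a quick orientation before the rest of the diff, a minimal usage sketch of the new ``labels`` option (not part of this commit; it assumes the class is importable as below and that the metadata downloads succeed):

from graphein.ml.datasets.pdb_data import PDBManager

# Hypothetical usage of the new `labels` argument added in this PR.
manager = PDBManager(
    root_dir=".",
    labels=["uniprot_id", "cath_code", "ec_number"],
)
# The parsed dataframe should now carry uniprot_id, cath_id/cath_code
# and ec_number columns alongside the usual metadata.
print(manager.df.columns)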
@@ -83,6 +89,12 @@ def __init__(
)
self.pdb_availability_url = "https://files.wwpdb.org/pub/pdb/compatible/pdb_bundle/pdb_bundle_index.txt"

self.pdb_chain_cath_uniprot_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz"

self.cath_id_cath_code_url = "http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz"

self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz"

self.pdb_dir = self.root_dir / "pdb"
if not os.path.exists(self.pdb_dir):
os.makedirs(self.pdb_dir)
@@ -99,12 +111,19 @@ def __init__(
self.pdb_deposition_date_url
).name
self.pdb_availability_filename = Path(self.pdb_availability_url).name
self.pdb_chain_cath_uniprot_filename = Path(
self.pdb_chain_cath_uniprot_url
).name
self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name
self.pdb_chain_ec_number_filename = Path(
self.pdb_chain_ec_number_url
).name

self.list_columns = ["ligands"]

# Data
self.download_metadata()
self.df = self.parse()
self.df = self.parse(labels)
self.source = self.df.copy()

# Splits
@@ -146,6 +165,9 @@ def download_metadata(self):
self._download_entry_metadata()
self._download_exp_type()
self._download_pdb_availability()
self._download_pdb_chain_cath_uniprot_map()
self._download_cath_id_cath_code_map()
self._download_pdb_chain_ec_number_map()

def get_unavailable_pdb_files(
self, splits: Optional[List[str]] = None
@@ -411,6 +433,39 @@ def _download_pdb_availability(self):
wget.download(self.pdb_availability_url, out=str(self.root_dir))
log.debug("Downloaded PDB availability map")

def _download_pdb_chain_cath_uniprot_map(self):
"""Download mapping from PDB chain to uniprot accession and CATH ID from
https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz
"""
if not os.path.exists(
self.root_dir / self.pdb_chain_cath_uniprot_filename
):
log.info("Downloading Uniprot CATH map...")
wget.download(
self.pdb_chain_cath_uniprot_url, out=str(self.root_dir)
)
log.debug("Downloaded Uniprot CATH map")

def _download_cath_id_cath_code_map(self):
"""Download mapping from CATH IDs to CATH code from
http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz
"""
if not os.path.exists(self.root_dir / self.cath_id_cath_code_filename):
log.info("Downloading CATH ID to CATH code map...")
wget.download(self.cath_id_cath_code_url, out=str(self.root_dir))
log.debug("Downloaded CATH ID to CATH code map")

def _download_pdb_chain_ec_number_map(self):
"""Download mapping from PDB chains to EC number from
https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz
"""
if not os.path.exists(
self.root_dir / self.pdb_chain_ec_number_filename
):
log.info("Downloading EC number map...")
wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir))
log.debug("Downloaded EC number map")

def _parse_ligand_map(self) -> Dict[str, List[str]]:
"""Parse the ligand maps for all PDB records.
@@ -508,7 +563,7 @@ def _parse_entries(self) -> Dict[str, datetime]:
df.dropna(subset=["id"], inplace=True)

df.id = df.id.str.lower()
df.date = pd.to_datetime(df.date)
df.date = pd.to_datetime(df.date, format="%m/%d/%y")
return pd.Series(df["date"].values, index=df["id"]).to_dict()

def _parse_experiment_type(self) -> Dict[str, str]:
@@ -536,9 +591,107 @@ def _parse_pdb_availability(self) -> Dict[str, bool]:
ids = {id: False for id in ids}
return ids

def parse(self) -> pd.DataFrame:
def _parse_uniprot_id(self) -> Dict[str, str]:
"""Parse the uniprot ID for all PDB chains.
:return: Dictionary of PDB chain ID with their
corresponding uniprot ID.
:rtype: Dict[str, str]
"""
uniprot_mapping = {}
with gzip.open(
self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
) as f:
next(f)  # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
key = f"{pdb}_{chain}"
uniprot_mapping[key] = uniprot_id
except ValueError:
continue
return uniprot_mapping

def _parse_cath_id(self) -> Dict[str, str]:
"""Parse the CATH ID for all PDB chains.
:return: Dictionary of PDB chain IDs with their
corresponding CATH IDs.
:rtype: Dict[str, str]
"""
cath_mapping = {}
with gzip.open(
self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
) as f:
next(f) # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
key = f"{pdb}_{chain}"
cath_mapping[key] = cath_id
except ValueError:
continue
return cath_mapping

def _parse_cath_code(self) -> Dict[str, str]:
"""Parse the CATH code for all CATH IDs.
:return: Dictionary of CATH IDs with their
corresponding CATH codes.
:rtype: Dict[str, str]
"""
cath_mapping = {}
with gzip.open(
self.root_dir / self.cath_id_cath_code_filename, "rt"
) as f:
for line in f:
try:
cath_id, cath_version, cath_code, cath_segment = (
line.strip().split()
)
cath_mapping[cath_id] = cath_code
except ValueError:
continue
return cath_mapping

def _parse_ec_number(self) -> Dict[str, str]:
"""Parse the CATH ID for all PDB chains and adds None when no EC number is present.
:return: Dictionary of PDB chain ID with their
corresponding EC number.
:rtype: Dict[str, str]
"""
ec_mapping = {}
with gzip.open(
self.root_dir / self.pdb_chain_ec_number_filename, "rt"
) as f:
next(f) # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, ec_number = line.strip().split(
"\t"
)
key = f"{pdb}_{chain}"
ec_number = None if ec_number == "?" else ec_number
ec_mapping[key] = ec_number
except ValueError:
continue
return ec_mapping
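
For intuition, a self-contained sketch of the same parsing rule applied to two made-up rows in the pdb_chain_enzyme.tsv layout (tab-separated PDB, CHAIN, ACCESSION, EC_NUMBER, with ``?`` marking chains without an EC annotation; the values are illustrative, not real SIFTS data):

# Illustrative rows only; the real data comes from pdb_chain_enzyme.tsv.gz.
sample = "101m\tA\tP02185\t?\n102l\tA\tP00720\t3.2.1.17\n"

ec_mapping = {}
for line in sample.splitlines():
    pdb, chain, uniprot_id, ec_number = line.strip().split("\t")
    # "?" means no EC number is assigned to this chain.
    ec_mapping[f"{pdb}_{chain}"] = None if ec_number == "?" else ec_number

print(ec_mapping)  # {'101m_A': None, '102l_A': '3.2.1.17'}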

def parse(
self,
labels: Optional[
List[Literal["uniprot_id", "cath_code", "ec_number"]]
] = None,
) -> pd.DataFrame:
"""Parse all PDB sequence records.
:param labels: A list of names corresponding to metadata labels that should be included in the PDB manager dataframe,
defaults to ``None``.
:type labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]], optional
:return: DataFrame containing PDB sequence entries
with their corresponding metadata.
:rtype: pd.DataFrame
@@ -578,7 +731,15 @@ def parse(self) -> pd.DataFrame:
df["deposition_date"] = df.pdb.map(self._parse_entries())
df["experiment_type"] = df.pdb.map(self._parse_experiment_type())
df["pdb_file_available"] = df.pdb.map(self._parse_pdb_availability())
df.pdb_file_available.fillna(True, inplace=True)
df["pdb_file_available"] = df["pdb_file_available"].fillna(True)
if labels:
if "uniprot_id" in labels:
df["uniprot_id"] = df.id.map(self._parse_uniprot_id())
if "cath_code" in labels:
df["cath_id"] = df.id.map(self._parse_cath_id())
df["cath_code"] = df.cath_id.map(self._parse_cath_code())
if "ec_number" in labels:
df["ec_number"] = df.id.map(self._parse_ec_number())

return df
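
Note the two-step CATH lookup above: chains are first mapped to CATH domain IDs via the SIFTS file, and those IDs are then mapped to CATH codes via the CATH daily release. A toy pandas sketch of the same chained ``map`` calls (all IDs and codes below are illustrative):

import pandas as pd

df = pd.DataFrame({"id": ["101m_A", "102l_A"]})
chain_to_cath_id = {"101m_A": "101mA00"}      # hypothetical SIFTS mapping
cath_id_to_code = {"101mA00": "1.10.490.10"}  # hypothetical CATH release mapping

df["cath_id"] = df.id.map(chain_to_cath_id)        # chain -> CATH ID
df["cath_code"] = df.cath_id.map(cath_id_to_code)  # CATH ID -> CATH code
# Chains without a CATH entry (here 102l_A) end up as NaN in both columns.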

Expand Down Expand Up @@ -1150,6 +1311,105 @@ def select_complexes_with_grouped_molecule_types(
if update:
self.df = df

def has_uniprot_id(
self,
select_ids: Optional[List[str]] = None,
splits: Optional[List[str]] = None,
update: bool = False,
) -> pd.DataFrame:
"""
Select entries that have a UniProt ID.
:param select_ids: If present, filter for only these UniProt IDs. If not present, filter for entries
that have any UniProt ID. Defaults to ``None``.
:type select_ids: Optional[List[str]], optional
:param splits: Names of splits for which to perform the operation,
defaults to ``None``.
:type splits: Optional[List[str]], optional
:param update: Whether to modify the DataFrame in place, defaults to
``False``.
:type update: bool, optional
:return: DataFrame of selected molecules.
:rtype: pd.DataFrame
"""
splits_df = self.get_splits(splits)
df = splits_df.dropna(subset=["uniprot_id"])

if select_ids:
df = df[df["uniprot_id"].isin(select_ids)]

if update:
self.df = df
return df

def has_cath_code(
self,
select_ids: Optional[List[str]] = None,
splits: Optional[List[str]] = None,
update: bool = False,
) -> pd.DataFrame:
"""
Select entries that have a CATH code.
:param select_ids: If present, filter for only these CATH codes. If not present, filter for entries
that have any CATH code. Defaults to ``None``.
:type select_ids: Optional[List[str]], optional
:param splits: Names of splits for which to perform the operation,
defaults to ``None``.
:type splits: Optional[List[str]], optional
:param update: Whether to modify the DataFrame in place, defaults to
``False``.
:type update: bool, optional
:return: DataFrame of selected molecules.
:rtype: pd.DataFrame
"""
splits_df = self.get_splits(splits)
df = splits_df.dropna(subset=["cath_code"])

if select_ids:
df = df[df["cath_code"].isin(select_ids)]

if update:
self.df = df
return df

def has_ec_number(
self,
select_ids: Optional[List[str]] = None,
splits: Optional[List[str]] = None,
update: bool = False,
) -> pd.DataFrame:
"""
Select entries that have an EC number.
:param select_ids: If present, filter for only these EC numbers. If not present, filter for entries
that have any EC number. Defaults to ``None``.
:type select_ids: Optional[List[str]], optional
:param splits: Names of splits for which to perform the operation,
defaults to ``None``.
:type splits: Optional[List[str]], optional
:param update: Whether to modify the DataFrame in place, defaults to
``False``.
:type update: bool, optional
:return: DataFrame of selected molecules.
:rtype: pd.DataFrame
"""
splits_df = self.get_splits(splits)
df = splits_df.dropna(subset=["ec_number"])

if select_ids:
df = df[df["ec_number"].isin(select_ids)]

if update:
self.df = df
return df
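
Putting the three new helpers together, a hedged usage sketch (``manager`` as in the instantiation sketch near the top of this diff, built with all three labels; the filter values are illustrative):

# Keep only chains annotated with a given EC number.
adh = manager.has_ec_number(select_ids=["1.1.1.1"])

# Keep only chains with a given CATH code.
globin_like = manager.has_cath_code(select_ids=["1.10.490.10"])

# Drop every chain without a UniProt ID, modifying manager.df in place.
manager.has_uniprot_id(update=True)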

def split_df_proportionally(
self,
df: pd.DataFrame,
@@ -1561,8 +1821,8 @@ def reset(self) -> pd.DataFrame:

def download_pdbs(
self,
out_dir=".",
format="pdb",
out_dir: str = ".",
format: str = "pdb",
splits: Optional[List[str]] = None,
overwrite: bool = False,
max_workers: int = 8,
@@ -1572,7 +1832,7 @@ def download_pdbs(
:param out_dir: Output directory, defaults to ``"."``
:type out_dir: str, optional
:param format: Filetype to download. ``pdb`` or ``mmtf``.
:param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
:type format: str
:param splits: Names of splits for which to perform the operation,
defaults to ``None``.