[feat] add uniprot ecnumber and cath label options to pdb manager (#398)
* [feat] add uniprot ecnumber and cath label options to pdb manager

* [doc] added to changelog

* [fix] address pandas warnings

* [feat] improve type hinting and add subselecting based on labels

* [fix] pin setuptools in setup.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* pin setuptools version for CI

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Arian Jamasb <[email protected]>
3 people authored Jun 5, 2024
1 parent 53a76be commit 27463a5
Showing 5 changed files with 286 additions and 13 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build.yaml
@@ -40,6 +40,8 @@ jobs:
channels: "conda-forge, salilab, pytorch, pyg"
python-version: ${{ matrix.python-version }}
use-mamba: true
- name: Install setuptools
run: pip install setuptools==69.5.1
- name: Install Boost 1.73.0 (for DSSP)
run: conda install -c anaconda libboost=1.73.0
- name: Install DSSP
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@
* Fix cluster file loading bug in `pdb_data.py` [#396](https://github.com/a-r-j/graphein/pull/396)

#### Misc
* add metadata options for UniProt ID, EC number and CATH code to the PDB manager [#398](https://github.com/a-r-j/graphein/pull/398)
* bumped logging level down from `INFO` to `DEBUG` in several places to reduce output length [#391](https://github.com/a-r-j/graphein/pull/391)
* exposed `fill_value` and `bfactor` option to `protein_to_pyg` function. [#385](https://github.com/a-r-j/graphein/pull/385) and [#388](https://github.com/a-r-j/graphein/pull/388)
* Updated Foldcomp datasets with improved setup function and updated database choices such as ESMAtlas. [#382](https://github.com/a-r-j/graphein/pull/382)
278 changes: 269 additions & 9 deletions graphein/ml/datasets/pdb_data.py
@@ -6,7 +6,7 @@
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from typing import Any, Callable, Dict, List, Literal, Optional, Union

import numpy as np
import pandas as pd
@@ -36,13 +36,16 @@ def __init__(
split_ratios: Optional[List[float]] = None,
split_time_frames: Optional[List[np.datetime64]] = None,
assign_leftover_rows_to_split_n: int = 0,
labels: Optional[
List[Literal["uniprot_id", "cath_code", "ec_number"]]
] = None,
):
"""Instantiate a selection of experimental PDB structures.
:param root_dir: The directory in which to store all PDB entries,
defaults to ``"."``.
:type root_dir: str, optional
:param structure_format: Whether to use ``.pdb`` or ``.mmtf`` file.
:param structure_format: Whether to use ``.pdb``, ``.mmtf`` or ``.mmcif`` files.
Defaults to ``"pdb"``.
:type structure_format: str, optional
:param splits: A list of names corresponding to each dataset split,
@@ -58,6 +61,9 @@ def __init__(
to assign any rows remaining after creation of new dataset splits,
defaults to ``0``.
:type assign_leftover_rows_to_split_n: int, optional
:param labels: A list of names corresponding to metadata labels that should be included in the PDB manager dataframe,
defaults to ``None``.
:type labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]], optional
"""
# Arguments
self.root_dir = Path(root_dir)
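
As a quick orientation before the rest of the diff, a minimal usage sketch of the new ``labels`` option (not part of this commit; it assumes the class is importable as below and that the metadata downloads succeed):

from graphein.ml.datasets.pdb_data import PDBManager

# Hypothetical usage of the new `labels` argument added in this PR.
manager = PDBManager(
    root_dir=".",
    labels=["uniprot_id", "cath_code", "ec_number"],
)
# The parsed dataframe should now carry uniprot_id, cath_id/cath_code
# and ec_number columns alongside the usual metadata.
print(manager.df.columns)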
@@ -83,6 +89,12 @@ def __init__(
)
self.pdb_availability_url = "https://files.wwpdb.org/pub/pdb/compatible/pdb_bundle/pdb_bundle_index.txt"

self.pdb_chain_cath_uniprot_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz"

self.cath_id_cath_code_url = "http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz"

self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz"

self.pdb_dir = self.root_dir / "pdb"
if not os.path.exists(self.pdb_dir):
os.makedirs(self.pdb_dir)
@@ -99,12 +111,19 @@ def __init__(
self.pdb_deposition_date_url
).name
self.pdb_availability_filename = Path(self.pdb_availability_url).name
self.pdb_chain_cath_uniprot_filename = Path(
self.pdb_chain_cath_uniprot_url
).name
self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name
self.pdb_chain_ec_number_filename = Path(
self.pdb_chain_ec_number_url
).name

self.list_columns = ["ligands"]

# Data
self.download_metadata()
self.df = self.parse()
self.df = self.parse(labels)
self.source = self.df.copy()

# Splits
@@ -146,6 +165,9 @@ def download_metadata(self):
self._download_entry_metadata()
self._download_exp_type()
self._download_pdb_availability()
self._download_pdb_chain_cath_uniprot_map()
self._download_cath_id_cath_code_map()
self._download_pdb_chain_ec_number_map()

def get_unavailable_pdb_files(
self, splits: Optional[List[str]] = None
@@ -411,6 +433,39 @@ def _download_pdb_availability(self):
wget.download(self.pdb_availability_url, out=str(self.root_dir))
log.debug("Downloaded PDB availability map")

def _download_pdb_chain_cath_uniprot_map(self):
"""Download mapping from PDB chain to uniprot accession and CATH ID from
https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz
"""
if not os.path.exists(
self.root_dir / self.pdb_chain_cath_uniprot_filename
):
log.info("Downloading Uniprot CATH map...")
wget.download(
self.pdb_chain_cath_uniprot_url, out=str(self.root_dir)
)
log.debug("Downloaded Uniprot CATH map")

def _download_cath_id_cath_code_map(self):
"""Download mapping from CATH IDs to CATH code from
http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz
"""
if not os.path.exists(self.root_dir / self.cath_id_cath_code_filename):
log.info("Downloading CATH ID to CATH code map...")
wget.download(self.cath_id_cath_code_url, out=str(self.root_dir))
log.debug("Downloaded CATH ID to CATH code map")

def _download_pdb_chain_ec_number_map(self):
"""Download mapping from PDB chains to EC number from
https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz
"""
if not os.path.exists(
self.root_dir / self.pdb_chain_ec_number_filename
):
log.info("Downloading EC number map...")
wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir))
log.debug("Downloaded EC number map")

def _parse_ligand_map(self) -> Dict[str, List[str]]:
"""Parse the ligand maps for all PDB records.
@@ -508,7 +563,7 @@ def _parse_entries(self) -> Dict[str, datetime]:
df.dropna(subset=["id"], inplace=True)

df.id = df.id.str.lower()
df.date = pd.to_datetime(df.date)
df.date = pd.to_datetime(df.date, format="%m/%d/%y")
return pd.Series(df["date"].values, index=df["id"]).to_dict()

def _parse_experiment_type(self) -> Dict[str, str]:
@@ -536,9 +591,107 @@ def _parse_pdb_availability(self) -> Dict[str, bool]:
ids = {id: False for id in ids}
return ids

def parse(self) -> pd.DataFrame:
def _parse_uniprot_id(self) -> Dict[str, str]:
"""Parse the uniprot ID for all PDB chains.
:return: Dictionary of PDB chain ID with their
corresponding uniprot ID.
:rtype: Dict[str, str]
"""
uniprot_mapping = {}
with gzip.open(
self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
) as f:
next(f)  # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
key = f"{pdb}_{chain}"
uniprot_mapping[key] = uniprot_id
except ValueError:
continue
return uniprot_mapping

def _parse_cath_id(self) -> Dict[str, str]:
"""Parse the CATH ID for all PDB chains.
:return: Dictionary of PDB chain IDs with their
corresponding CATH IDs.
:rtype: Dict[str, str]
"""
cath_mapping = {}
with gzip.open(
self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
) as f:
next(f) # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
key = f"{pdb}_{chain}"
cath_mapping[key] = cath_id
except ValueError:
continue
return cath_mapping

def _parse_cath_code(self) -> Dict[str, str]:
"""Parse the CATH code for all CATH IDs.
:return: Dictionary of CATH IDs with their
corresponding CATH codes.
:rtype: Dict[str, str]
"""
cath_mapping = {}
with gzip.open(
self.root_dir / self.cath_id_cath_code_filename, "rt"
) as f:
for line in f:
try:
cath_id, cath_version, cath_code, cath_segment = (
line.strip().split()
)
cath_mapping[cath_id] = cath_code
except ValueError:
continue
return cath_mapping

def _parse_ec_number(self) -> Dict[str, str]:
"""Parse the CATH ID for all PDB chains and adds None when no EC number is present.
:return: Dictionary of PDB chain ID with their
corresponding EC number.
:rtype: Dict[str, str]
"""
ec_mapping = {}
with gzip.open(
self.root_dir / self.pdb_chain_ec_number_filename, "rt"
) as f:
next(f) # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, ec_number = line.strip().split(
"\t"
)
key = f"{pdb}_{chain}"
ec_number = None if ec_number == "?" else ec_number
ec_mapping[key] = ec_number
except ValueError:
continue
return ec_mapping
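
For intuition, a self-contained sketch of the same parsing rule applied to two made-up rows in the pdb_chain_enzyme.tsv layout (tab-separated PDB, CHAIN, ACCESSION, EC_NUMBER, with ``?`` marking chains without an EC annotation; the values are illustrative, not real SIFTS data):

# Illustrative rows only; the real data comes from pdb_chain_enzyme.tsv.gz.
sample = "101m\tA\tP02185\t?\n102l\tA\tP00720\t3.2.1.17\n"

ec_mapping = {}
for line in sample.splitlines():
    pdb, chain, uniprot_id, ec_number = line.strip().split("\t")
    # "?" means no EC number is assigned to this chain.
    ec_mapping[f"{pdb}_{chain}"] = None if ec_number == "?" else ec_number

print(ec_mapping)  # {'101m_A': None, '102l_A': '3.2.1.17'}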

def parse(
self,
labels: Optional[
List[Literal["uniprot_id", "cath_code", "ec_number"]]
] = None,
) -> pd.DataFrame:
"""Parse all PDB sequence records.
:param labels: A list of names corresponding to metadata labels that should be included in the PDB manager dataframe,
defaults to ``None``.
:type labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]], optional
:return: DataFrame containing PDB sequence entries
with their corresponding metadata.
:rtype: pd.DataFrame
@@ -578,7 +731,15 @@ def parse(self) -> pd.DataFrame:
df["deposition_date"] = df.pdb.map(self._parse_entries())
df["experiment_type"] = df.pdb.map(self._parse_experiment_type())
df["pdb_file_available"] = df.pdb.map(self._parse_pdb_availability())
df.pdb_file_available.fillna(True, inplace=True)
df["pdb_file_available"] = df["pdb_file_available"].fillna(True)
if labels:
if "uniprot_id" in labels:
df["uniprot_id"] = df.id.map(self._parse_uniprot_id())
if "cath_code" in labels:
df["cath_id"] = df.id.map(self._parse_cath_id())
df["cath_code"] = df.cath_id.map(self._parse_cath_code())
if "ec_number" in labels:
df["ec_number"] = df.id.map(self._parse_ec_number())

return df
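
Note the two-step CATH lookup above: chains are first mapped to CATH domain IDs via the SIFTS file, and those IDs are then mapped to CATH codes via the CATH daily release. A toy pandas sketch of the same chained ``map`` calls (all IDs and codes below are illustrative):

import pandas as pd

df = pd.DataFrame({"id": ["101m_A", "102l_A"]})
chain_to_cath_id = {"101m_A": "101mA00"}      # hypothetical SIFTS mapping
cath_id_to_code = {"101mA00": "1.10.490.10"}  # hypothetical CATH release mapping

df["cath_id"] = df.id.map(chain_to_cath_id)        # chain -> CATH ID
df["cath_code"] = df.cath_id.map(cath_id_to_code)  # CATH ID -> CATH code
# Chains without a CATH entry (here 102l_A) end up as NaN in both columns.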

Expand Down Expand Up @@ -1150,6 +1311,105 @@ def select_complexes_with_grouped_molecule_types(
if update:
self.df = df

def has_uniprot_id(
self,
select_ids: Optional[List[str]] = None,
splits: Optional[List[str]] = None,
update: bool = False,
) -> pd.DataFrame:
"""
Select entries that have a UniProt ID.
:param select_ids: If present, filter for only these UniProt IDs. If not present, filter for entries
that have any UniProt ID. Defaults to ``None``.
:type select_ids: Optional[List[str]], optional
:param splits: Names of splits for which to perform the operation,
defaults to ``None``.
:type splits: Optional[List[str]], optional
:param update: Whether to modify the DataFrame in place, defaults to
``False``.
:type update: bool, optional
:return: DataFrame of selected molecules.
:rtype: pd.DataFrame
"""
splits_df = self.get_splits(splits)
df = splits_df.dropna(subset=["uniprot_id"])

if select_ids:
df = df[df["uniprot_id"].isin(select_ids)]

if update:
self.df = df
return df

def has_cath_code(
self,
select_ids: Optional[List[str]] = None,
splits: Optional[List[str]] = None,
update: bool = False,
) -> pd.DataFrame:
"""
Select entries that have a CATH code.
:param select_ids: If present, filter for only these CATH codes. If not present, filter for entries
that have any CATH code. Defaults to ``None``.
:type select_ids: Optional[List[str]], optional
:param splits: Names of splits for which to perform the operation,
defaults to ``None``.
:type splits: Optional[List[str]], optional
:param update: Whether to modify the DataFrame in place, defaults to
``False``.
:type update: bool, optional
:return: DataFrame of selected molecules.
:rtype: pd.DataFrame
"""
splits_df = self.get_splits(splits)
df = splits_df.dropna(subset=["cath_code"])

if select_ids:
df = df[df["cath_code"].isin(select_ids)]

if update:
self.df = df
return df

def has_ec_number(
self,
select_ids: Optional[List[str]] = None,
splits: Optional[List[str]] = None,
update: bool = False,
) -> pd.DataFrame:
"""
Select entries that have an EC number.
:param select_ids: If present, filter for only these EC numbers. If not present, filter for entries
that have any EC number. Defaults to ``None``.
:type select_ids: Optional[List[str]], optional
:param splits: Names of splits for which to perform the operation,
defaults to ``None``.
:type splits: Optional[List[str]], optional
:param update: Whether to modify the DataFrame in place, defaults to
``False``.
:type update: bool, optional
:return: DataFrame of selected molecules.
:rtype: pd.DataFrame
"""
splits_df = self.get_splits(splits)
df = splits_df.dropna(subset=["ec_number"])

if select_ids:
df = df[df["ec_number"].isin(select_ids)]

if update:
self.df = df
return df
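
Putting the three new helpers together, a hedged usage sketch (``manager`` as in the instantiation sketch near the top of this diff, built with all three labels; the filter values are illustrative):

# Keep only chains annotated with a given EC number.
adh = manager.has_ec_number(select_ids=["1.1.1.1"])

# Keep only chains with a given CATH code.
globin_like = manager.has_cath_code(select_ids=["1.10.490.10"])

# Drop every chain without a UniProt ID, modifying manager.df in place.
manager.has_uniprot_id(update=True)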

def split_df_proportionally(
self,
df: pd.DataFrame,
@@ -1561,8 +1821,8 @@ def reset(self) -> pd.DataFrame:

def download_pdbs(
self,
out_dir=".",
format="pdb",
out_dir: str = ".",
format: str = "pdb",
splits: Optional[List[str]] = None,
overwrite: bool = False,
max_workers: int = 8,
@@ -1572,7 +1832,7 @@ def download_pdbs(
:param out_dir: Output directory, defaults to ``"."``
:type out_dir: str, optional
:param format: Filetype to download. ``pdb`` or ``mmtf``.
:param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
:type format: str
:param splits: Names of splits for which to perform the operation,
defaults to ``None``.