-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #77 from Cellular-Semantics/add-translator_api_mapper
Add translator api mapper
- Loading branch information
Showing
7 changed files
with
219 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,5 @@ | ||
name: Docker | ||
|
||
# This workflow uses actions that are not certified by GitHub. | ||
# They are provided by a third-party and are governed by | ||
# separate terms of service, privacy policy, and support | ||
# documentation. | ||
|
||
on: | ||
workflow_dispatch: | ||
release: | ||
|
@@ -13,13 +8,15 @@ on: | |
env: | ||
# Use docker.io for Docker Hub if empty | ||
REGISTRY: ghcr.io | ||
# github.repository as <account>/<repo> | ||
# github.repository is in the format <account>/<repo> | ||
IMAGE_NAME: ${{ github.repository }} | ||
|
||
|
||
jobs: | ||
build-and-push-image: | ||
runs-on: ubuntu-latest | ||
strategy: | ||
matrix: | ||
component: [anndata2rdf, translator_api_mapper] | ||
permissions: | ||
contents: read | ||
packages: write | ||
|
@@ -35,12 +32,6 @@ jobs: | |
username: ${{ github.actor }} | ||
password: ${{ secrets.GITHUB_TOKEN }} | ||
|
||
- name: Extract metadata (tags, labels) for Docker | ||
id: meta | ||
uses: docker/[email protected] | ||
with: | ||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} | ||
|
||
- name: Setup environment | ||
run: echo "BRANCH=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV | ||
|
||
|
@@ -50,11 +41,17 @@ jobs: | |
- name: Set up Docker Buildx | ||
id: buildx | ||
uses: docker/setup-buildx-action@v1 | ||
|
||
- name: Build and push Docker image | ||
|
||
- name: Extract metadata for ${{ matrix.component }} | ||
id: meta | ||
uses: docker/[email protected] | ||
with: | ||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/${{ matrix.component }} | ||
|
||
- name: Build and push Docker image for ${{ matrix.component }} | ||
uses: docker/[email protected] | ||
with: | ||
context: "./anndata2rdf/" | ||
context: "./${{ matrix.component }}/" | ||
push: true | ||
platforms: linux/amd64, linux/arm64 | ||
tags: ${{ steps.meta.outputs.tags }} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Container image for the translator_api_mapper pipeline.
FROM python:3.10

# Use bash for RUN instructions.
SHELL ["/bin/bash", "-c"]

WORKDIR /app

# Install Python dependencies first so this layer is cached
# unless requirements.txt changes.
COPY requirements.txt ./
RUN pip3 install --upgrade pip && \
    pip3 install --no-cache-dir -r requirements.txt

COPY src/ ./src/

# Run the mapping pipeline on container start.
CMD ["python", "src/pipeline_mapper.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
requests==2.31.0 | ||
SPARQLWrapper==2.0.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from uniprot_ensembl_mapper.uniprot_ensembl_mapper import uniprot_ensembl_mapper

# Entry point for the translator_api_mapper container (see Dockerfile CMD).
# Guarded so importing this module does not trigger the pipeline as a side
# effect; running it as a script behaves exactly as before.
if __name__ == "__main__":
    # Links from Proteins (uniprot) to Genes (Ensembl) CL_KG#73
    uniprot_ensembl_mapper()
Empty file.
158 changes: 158 additions & 0 deletions
158
translator_api_mapper/src/uniprot_ensembl_mapper/uniprot_ensembl_mapper.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
import os
import logging
import requests
from SPARQLWrapper import SPARQLWrapper, JSON, POST
from typing import Dict, List

# Root logging defaults to WARNING; this module's own logger reports at INFO.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Node Normalization Endpoint (Translator SRI service mapping equivalent CURIEs)
NODE_NORMALIZATION_URL = "https://nodenormalization-sri.renci.org/get_normalized_nodes"
# RDF4J local endpoint configuration; overridable via the ENDPOINT_URL env var
ENDPOINT_URL = os.getenv("ENDPOINT_URL", "http://triplestore:8080/rdf4j-server/repositories/obask")

# RO Relation RO:0003000 ("produces"): Gene produces Protein
RO_0003000 = "http://purl.obolibrary.org/obo/RO_0003000"

# Select every subject whose URI falls in the identifiers.org UniProt namespace.
# NOTE(review): this filter matches the "uniprot:" (colon) URI spelling, while
# UNIPROT_PREFIX below ends in "uniprot/" (slash) — confirm which form the
# repository actually stores; uri_to_curie must accept the matched form.
UNIPROT_SPARQL_QUERY = """
SELECT DISTINCT ?s
WHERE {
?s a ?o. FILTER(contains(str(?s), "https://identifiers.org/uniprot:"))
}
"""
# Select every subject whose URI falls in the identifiers.org Ensembl namespace.
ENSEMBL_SPARQL_QUERY = """
SELECT DISTINCT ?s
WHERE {
?s a ?o. FILTER(contains(str(?s), "http://identifiers.org/ensembl/"))
}
"""

# Configure the SPARQL query endpoint
sparql_query = SPARQLWrapper(ENDPOINT_URL)
sparql_query.setReturnFormat(JSON)
# Configure the SPARQL update endpoint
sparql_update = SPARQLWrapper(ENDPOINT_URL)
sparql_update.setMethod(POST)

# URI prefixes used when converting between repository URIs and CURIEs.
UNIPROT_PREFIX = "https://identifiers.org/uniprot/"
ENSEMBL_PREFIX = "http://identifiers.org/ensembl/"
# Number of triples accumulated before each SPARQL INSERT request.
BATCH_SIZE = 1000
|
||
|
||
def run_query(query: str) -> List[str]:
    """Run a SPARQL SELECT query and return the bound ``?s`` values.

    On any failure the error is logged and an empty list is returned,
    so callers never see an exception from the triplestore.
    """
    sparql_query.setQuery(query)

    try:
        response = sparql_query.query().convert()
        bindings = response["results"]["bindings"]
        return [row["s"]["value"] for row in bindings]
    except Exception as exc:
        logger.error(f"An error occurred: {exc}")
        return []
|
||
|
||
# Accepted URI prefix spellings per CURIE namespace.  identifiers.org URIs
# appear with both "/" and ":" separators — the UniProt SPARQL query above
# filters on "uniprot:" while UNIPROT_PREFIX ends in "uniprot/" — so both
# spellings are recognised here.  The two forms are the same length, so one
# slice width serves both.
_CURIE_PREFIXES = (
    ("UniProtKB", ("https://identifiers.org/uniprot/",
                   "https://identifiers.org/uniprot:")),
    ("ENSEMBL", ("http://identifiers.org/ensembl/",
                 "http://identifiers.org/ensembl:")),
)


def uri_to_curie(uri_list: List[str]) -> List[str]:
    """Convert identifiers.org RDF URIs to CURIEs for the Translator API.

    Args:
        uri_list: RDF term URIs, e.g. "https://identifiers.org/uniprot/P12345".

    Returns:
        CURIEs such as "UniProtKB:P12345" / "ENSEMBL:ENSG...".  URIs that
        match neither namespace are silently dropped (as before).
    """
    curie_list: List[str] = []
    for uri in uri_list:
        for namespace, prefixes in _CURIE_PREFIXES:
            if uri.startswith(prefixes):
                curie_list.append(f"{namespace}:{uri[len(prefixes[0]):]}")
                break
    return curie_list
|
||
|
||
def get_normalized_curies(curie_list: List[str]) -> Dict[str, List[str]]:
    """Look up Ensembl equivalents for UniProt CURIEs via Node Normalization.

    Args:
        curie_list: CURIEs (e.g. "UniProtKB:P12345") to normalise.

    Returns:
        Mapping from each input CURIE to the ENSEMBL CURIEs found among its
        equivalent identifiers.  CURIEs with no ENSEMBL equivalent, or any
        non-200 response, yield no entry (the error is logged).
    """
    normalized_curies: Dict[str, List[str]] = {}
    # timeout so a hung service cannot stall the pipeline forever
    result = requests.post(
        NODE_NORMALIZATION_URL, json={"curies": curie_list}, timeout=300
    )
    if result.status_code == 200:
        result_json = result.json()
        for curie in curie_list:
            # The service may return null for an unknown CURIE, and
            # "equivalent_identifiers" may be absent — treat both as empty.
            entry = result_json.get(curie) or {}
            for identifier in entry.get("equivalent_identifiers") or []:
                if "ENSEMBL" in identifier["identifier"]:
                    normalized_curies.setdefault(curie, []).append(
                        identifier["identifier"]
                    )
    else:
        logger.error(
            "Error fetching normalized CURIEs (status %s)", result.status_code
        )
    return normalized_curies
|
||
|
||
def insert_triples_in_batch(triples: List[str]):
    """Insert a batch of RDF triples into the RDF4J repository.

    Sends one SPARQL ``INSERT DATA`` update containing every triple in
    *triples* to the repository's /statements endpoint.  An empty batch is a
    no-op; request failures are logged, never raised.

    Args:
        triples: N-Triples-style statements, each terminated with " .".
    """
    # Guard first: skip building the request for an empty batch.
    if not triples:
        return

    endpoint_url = f"{ENDPOINT_URL}/statements"
    headers = {
        "Content-Type": "application/sparql-update"
    }

    insert_query = f"""
    PREFIX RO: <http://purl.obolibrary.org/obo/RO_>
    PREFIX ensembl: <http://identifiers.org/ensembl/>
    PREFIX uniprot: <https://identifiers.org/uniprot/>
    INSERT DATA {{
    {' '.join(triples)}
    }}
    """

    try:
        # timeout so a hung triplestore cannot stall the pipeline forever
        response = requests.post(
            endpoint_url, data=insert_query, headers=headers, timeout=300
        )

        # RDF4J answers 204 No Content on a successful update.
        if response.status_code == 204:
            logger.info("Successfully inserted %s triples.", len(triples))
        else:
            logger.error(
                "Failed to insert triples. Status Code: %s", response.status_code
            )

    except requests.exceptions.RequestException as e:
        logger.error("Request failed: %s", e)
|
||
|
||
def uniprot_ensembl_mapper():
    """Link UniProt protein terms to Ensembl gene terms in the triplestore.

    Queries the repository for UniProt and Ensembl terms, asks the Translator
    Node Normalization service for UniProt -> Ensembl equivalences, and
    inserts ``<ensembl> RO:0003000 <uniprot>`` ("produces") triples for every
    mapping whose Ensembl CURIE already exists in the repository.  Triples
    are written in batches of BATCH_SIZE.
    """
    # Retrieve UniProt and Ensembl terms
    uniprot_terms = run_query(UNIPROT_SPARQL_QUERY)
    uniprot_curie_list = uri_to_curie(uniprot_terms)
    ensembl_terms = run_query(ENSEMBL_SPARQL_QUERY)
    # Set for O(1) membership tests in the loop below (a list made every
    # lookup O(n), i.e. O(n*m) overall).
    ensembl_curie_set = set(uri_to_curie(ensembl_terms))

    # Get mappings
    normalized_curie_dict = get_normalized_curies(uniprot_curie_list)

    triples_batch = []
    count = 0
    missing_count = 0

    for uni, ensembl_list in normalized_curie_dict.items():
        for ensembl_id in ensembl_list:
            count += 1
            ensembl_uri = ensembl_id.replace("ENSEMBL:", ENSEMBL_PREFIX)
            uniprot_uri = uni.replace("UniProtKB:", UNIPROT_PREFIX)

            if ensembl_id in ensembl_curie_set:
                triple = f"<{ensembl_uri}> <{RO_0003000}> <{uniprot_uri}> ."
                triples_batch.append(triple)

                # Insert in batches
                if len(triples_batch) >= BATCH_SIZE:
                    insert_triples_in_batch(triples_batch)
                    triples_batch = []

            else:
                # Mapped Ensembl ID not present in the repository.
                missing_count += 1

    # Insert remaining triples (if any)
    if triples_batch:
        insert_triples_in_batch(triples_batch)

    logger.info(f"Out of {count} ENSEMBL IDs, {missing_count} are missing")
|
||
|
||
# Allow running this module directly as a script.
if __name__ == "__main__":
    uniprot_ensembl_mapper()