Skip to content

Commit

Permalink
Merge pull request #77 from Cellular-Semantics/add-translator_api_mapper
Browse files Browse the repository at this point in the history
Add translator api mapper
  • Loading branch information
ubyndr authored Feb 26, 2025
2 parents 6544f83 + e22850d commit 2e3b2be
Show file tree
Hide file tree
Showing 7 changed files with 219 additions and 24 deletions.
29 changes: 13 additions & 16 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
name: Docker

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

on:
workflow_dispatch:
release:
Expand All @@ -13,13 +8,15 @@ on:
env:
# Use docker.io for Docker Hub if empty
REGISTRY: ghcr.io
# github.repository as <account>/<repo>
# github.repository is in the format <account>/<repo>
IMAGE_NAME: ${{ github.repository }}


jobs:
build-and-push-image:
runs-on: ubuntu-latest
strategy:
matrix:
component: [anndata2rdf, translator_api_mapper]
permissions:
contents: read
packages: write
Expand All @@ -35,12 +32,6 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/[email protected]
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

- name: Setup environment
run: echo "BRANCH=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV

Expand All @@ -50,11 +41,17 @@ jobs:
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v1

- name: Build and push Docker image

- name: Extract metadata for ${{ matrix.component }}
id: meta
uses: docker/[email protected]
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/${{ matrix.component }}

- name: Build and push Docker image for ${{ matrix.component }}
uses: docker/[email protected]
with:
context: "./anndata2rdf/"
context: "./${{ matrix.component }}/"
push: true
platforms: linux/amd64, linux/arm64
tags: ${{ steps.meta.outputs.tags }}
Expand Down
37 changes: 29 additions & 8 deletions cl_kb_pipeline/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@ version: '3.8'

services:
anndata2rdf:
image: ghcr.io/cellular-semantics/cl_kg:latest
image: ghcr.io/cellular-semantics/cl_kg/anndata2rdf:latest
container_name: anndata_to_rdf
volumes:
- ../anndata2rdf/src/config:/app/src/config
- ../anndata2rdf/src/curated_data:/app/src/curated_data
- ../anndata2rdf/src/dataset:/app/src/dataset
- ./config/collectdata/local_ontologies:/app/src/graph/
command: /bin/sh -c "python src/process.py"

triplestore:
image: eclipse/rdf4j-workbench:3.7.7
container_name: triplestore
Expand All @@ -19,6 +20,7 @@ services:
- 8080:8080
volumes:
- triplestore_data:/var/rdf4j

obask-kb:
image: ghcr.io/obasktools/obask-kb:latest
container_name: obask-kb
Expand All @@ -43,6 +45,7 @@ services:
timeout: 12s
retries: 20
start_period: 3s

obask-collectdata:
image: ghcr.io/obasktools/pipeline-collectdata:latest
container_name: collectdata
Expand All @@ -54,6 +57,7 @@ services:
volumes:
- ./config/collectdata:/opt/conf_base/config/collectdata
- obask_data:/out

obask-updatetriplestore:
image: ghcr.io/obasktools/pipeline-updatetriplestore:latest
container_name: updatetriplestore
Expand All @@ -67,17 +71,19 @@ services:
volumes:
- ./config/updatetriplestore:/opt/conf_base/config/updatetriplestore
- obask_data:/data

obask-dumps:
image: ghcr.io/obasktools/pipeline-dumps:latest
container_name: dumps
depends_on:
obask-updatetriplestore:
pipeline-mapper:
condition: service_completed_successfully
links:
- triplestore
volumes:
- ./config/dumps:/opt/conf_base/config/dumps
- obask_data:/out

obask-updateprod:
image: ghcr.io/obasktools/pipeline-updateprod:latest
container_name: updateprod
Expand All @@ -92,21 +98,23 @@ services:
volumes:
- ./config/update-prod:/opt/conf_base/config/update-prod
- obask_data:/input

solr:
image: solr:8.11
container_name: solr
ports:
- 8993:8983
- 8993:8983
depends_on:
- obask-dumps
links:
- obask-dumps
volumes:
- solr_data:/var/solr
entrypoint:
- bash
- "-c"
- "precreate-core ontology; precreate-core bdsdump; exec solr -f"
- bash
- "-c"
- "precreate-core ontology; precreate-core bdsdump; exec solr -f"

obask-updatesolr:
image: ghcr.io/obasktools/pipeline-updatesolr:latest
container_name: updatesolr
Expand All @@ -117,11 +125,12 @@ services:
depends_on:
obask-dumps:
condition: service_completed_successfully

obask-ontology-search:
image: ghcr.io/obasktools/ontology-search:latest
container_name: ontology-search
ports:
- 8007:8007
ports:
- 8007:8007
environment:
- PYTHON_ENV=production
- OBASK_URL_PREFIX=/cl-kg
Expand All @@ -130,6 +139,18 @@ services:
- obask-updatesolr
links:
- solr

pipeline-mapper:
image: ghcr.io/cellular-semantics/cl_kg/translator_api_mapper:latest
container_name: pipeline_mapper
depends_on:
obask-updatetriplestore:
condition: service_completed_successfully
environment:
- ENDPOINT_URL=http://triplestore:8080/rdf4j-server/repositories/obask
links:
- triplestore

volumes:
obask_data:
solr_data:
Expand Down
13 changes: 13 additions & 0 deletions translator_api_mapper/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Container for the translator_api_mapper pipeline step.
# Runs src/pipeline_mapper.py once and exits (batch job, not a service).
FROM python:3.10

# Use bash for RUN steps instead of the default /bin/sh.
SHELL ["/bin/bash", "-c"]

WORKDIR /app

# Install dependencies before copying sources so the pip layer is cached
# across source-only changes.
COPY requirements.txt ./
RUN pip3 install --upgrade pip && \
    pip3 install --no-cache-dir -r requirements.txt

COPY src/ ./src/

CMD ["python", "src/pipeline_mapper.py"]
2 changes: 2 additions & 0 deletions translator_api_mapper/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
requests==2.31.0
SPARQLWrapper==2.0.0
4 changes: 4 additions & 0 deletions translator_api_mapper/src/pipeline_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Entry point for the translator_api_mapper pipeline container: runs each
# mapping step in sequence (currently only the UniProt -> Ensembl mapper).
from uniprot_ensembl_mapper.uniprot_ensembl_mapper import uniprot_ensembl_mapper

# Links from Proteins (uniprot) to Genes (Ensembl) CL_KG#73
uniprot_ensembl_mapper()
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import os
import logging
import requests
from SPARQLWrapper import SPARQLWrapper, JSON, POST
from typing import Dict, List

# Module logger at INFO; every other logger stays at WARNING.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Node Normalization Endpoint
NODE_NORMALIZATION_URL = "https://nodenormalization-sri.renci.org/get_normalized_nodes"
# RDF4J local endpoint configuration
ENDPOINT_URL = os.getenv("ENDPOINT_URL", "http://triplestore:8080/rdf4j-server/repositories/obask")

# RO Relation: Gene produces Protein
RO_0003000 = "http://purl.obolibrary.org/obo/RO_0003000"

# Selects every subject whose URI contains the identifiers.org UniProt stem.
# NOTE(review): this filter matches the colon form ("uniprot:") while
# UNIPROT_PREFIX below uses the slash form ("uniprot/"); uri_to_curie()
# only converts URIs starting with the slash form, so URIs matched here
# may be silently dropped there. TODO confirm which URI style the loaded
# data actually uses and make the two agree.
UNIPROT_SPARQL_QUERY = """
SELECT DISTINCT ?s
WHERE {
?s a ?o. FILTER(contains(str(?s), "https://identifiers.org/uniprot:"))
}
"""
# Selects every subject whose URI contains the identifiers.org Ensembl stem.
ENSEMBL_SPARQL_QUERY = """
SELECT DISTINCT ?s
WHERE {
?s a ?o. FILTER(contains(str(?s), "http://identifiers.org/ensembl/"))
}
"""

# Configure the SPARQL query endpoint
sparql_query = SPARQLWrapper(ENDPOINT_URL)
sparql_query.setReturnFormat(JSON)
# Configure the SPARQL update endpoint
sparql_update = SPARQLWrapper(ENDPOINT_URL)
sparql_update.setMethod(POST)

# URI stems used when converting between URIs and CURIEs.
# NOTE(review): UniProt uses https while Ensembl uses http — intentional? TODO confirm.
UNIPROT_PREFIX = "https://identifiers.org/uniprot/"
ENSEMBL_PREFIX = "http://identifiers.org/ensembl/"
# Number of triples sent per SPARQL INSERT DATA request.
BATCH_SIZE = 1000


def run_query(query: str) -> List[str]:
    """Run a SPARQL SELECT query and collect the ?s binding of each row.

    Uses the module-level ``sparql_query`` client. On any failure the
    error is logged and an empty list is returned instead of raising.
    """
    sparql_query.setQuery(query)

    try:
        payload = sparql_query.query().convert()
        uris = []
        for row in payload["results"]["bindings"]:
            uris.append(row["s"]["value"])
        return uris
    except Exception as exc:
        logger.error(f"An error occurred: {exc}")
        return []


def uri_to_curie(uri_list: List[str]) -> List[str]:
    """Convert identifiers.org URIs into Translator-style CURIEs.

    URIs under UNIPROT_PREFIX map to ``UniProtKB:<acc>`` and URIs under
    ENSEMBL_PREFIX map to ``ENSEMBL:<id>``. URIs matching neither prefix
    are silently skipped.
    """
    curies: List[str] = []
    for uri in uri_list:
        if uri.startswith(UNIPROT_PREFIX):
            local_id = uri.replace(UNIPROT_PREFIX, '')
            curies.append("UniProtKB:" + local_id)
        elif uri.startswith(ENSEMBL_PREFIX):
            local_id = uri.replace(ENSEMBL_PREFIX, '')
            curies.append("ENSEMBL:" + local_id)
    return curies


def get_normalized_curies(curie_list: List[str]) -> Dict[str, List[str]]:
    """Retrieve ENSEMBL equivalents for the given UniProt CURIEs.

    Posts the CURIEs to the SRI Node Normalization service and returns a
    mapping from each input CURIE to the list of equivalent identifiers
    whose CURIE contains "ENSEMBL". Inputs with no ENSEMBL equivalent are
    omitted. Returns an empty dict on transport or HTTP failure (the
    error is logged), matching the best-effort style of
    insert_triples_in_batch.
    """
    normalized_curies: Dict[str, List[str]] = {}
    try:
        # Timeout so a stalled normalization service cannot hang the pipeline.
        result = requests.post(
            NODE_NORMALIZATION_URL, json={"curies": curie_list}, timeout=300
        )
    except requests.exceptions.RequestException as e:
        logger.error(f"Request to node normalization service failed: {e}")
        return normalized_curies

    if result.status_code == 200:
        result_json = result.json()
        for curie in curie_list:
            entry = result_json.get(curie)
            if not entry:
                continue
            # "equivalent_identifiers" may be absent for some entries;
            # default to an empty list instead of crashing on None.
            for identifier in entry.get("equivalent_identifiers", []):
                if "ENSEMBL" in identifier["identifier"]:
                    normalized_curies.setdefault(curie, []).append(
                        identifier["identifier"]
                    )
    else:
        logger.error(
            f"Error fetching normalized CURIEs (HTTP {result.status_code})"
        )
    return normalized_curies


def insert_triples_in_batch(triples: List[str]):
    """Insert a batch of RDF triples into the RDF4J repository.

    Sends one SPARQL ``INSERT DATA`` update containing every triple in
    ``triples`` to the repository's /statements endpoint. No-op for an
    empty batch. RDF4J answers 204 on success; any other status or a
    transport error is logged and swallowed so a single failed batch
    does not abort the pipeline.
    """
    # Early exit before doing any setup work for an empty batch.
    if not triples:
        return

    endpoint_url = f"{ENDPOINT_URL}/statements"
    headers = {
        "Content-Type": "application/sparql-update"
    }

    insert_query = f"""
    PREFIX RO: <http://purl.obolibrary.org/obo/RO_>
    PREFIX ensembl: <http://identifiers.org/ensembl/>
    PREFIX uniprot: <https://identifiers.org/uniprot/>
    INSERT DATA {{
    {' '.join(triples)}
    }}
    """

    try:
        # Timeout so a wedged triplestore cannot hang the pipeline.
        response = requests.post(
            endpoint_url, data=insert_query, headers=headers, timeout=300
        )

        if response.status_code == 204:
            logger.info(f"Successfully inserted {len(triples)} triples.")
        else:
            # Include the server's reply so failures are diagnosable.
            logger.error(
                f"Failed to insert triples. Status Code: {response.status_code}, "
                f"Response: {response.text}"
            )

    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed: {e}")


def uniprot_ensembl_mapper():
    """Link Ensembl genes to the UniProt proteins they produce (CL_KG#73).

    Steps:
      1. Query the triplestore for UniProt and Ensembl subjects and
         convert them to CURIEs.
      2. Ask the Node Normalization service for the ENSEMBL equivalents
         of each UniProt CURIE.
      3. For each equivalent that is also present in the triplestore,
         insert an ``<ensembl> RO_0003000 <uniprot>`` triple, flushing
         in batches of BATCH_SIZE.
    """
    # Retrieve UniProt and Ensembl terms
    uniprot_terms = run_query(UNIPROT_SPARQL_QUERY)
    uniprot_curie_list = uri_to_curie(uniprot_terms)
    ensembl_terms = run_query(ENSEMBL_SPARQL_QUERY)
    ensembl_curie_list = uri_to_curie(ensembl_terms)
    # Set for O(1) membership tests; the list lookup inside the double
    # loop below was accidentally quadratic.
    known_ensembl_curies = set(ensembl_curie_list)

    # Get mappings: UniProt CURIE -> equivalent ENSEMBL CURIEs.
    normalized_curie_dict = get_normalized_curies(uniprot_curie_list)

    triples_batch = []
    count = 0
    missing_count = 0

    for uni, ensembl_list in normalized_curie_dict.items():
        for ensembl_id in ensembl_list:
            count += 1
            ensembl_uri = ensembl_id.replace("ENSEMBL:", ENSEMBL_PREFIX)
            uniprot_uri = uni.replace("UniProtKB:", UNIPROT_PREFIX)

            if ensembl_id in known_ensembl_curies:
                triple = f"<{ensembl_uri}> <{RO_0003000}> <{uniprot_uri}> ."
                triples_batch.append(triple)

                # Insert in batches
                if len(triples_batch) >= BATCH_SIZE:
                    insert_triples_in_batch(triples_batch)
                    triples_batch = []

            else:
                # Equivalent exists per the normalizer but is not in the
                # triplestore, so there is nothing to link to.
                missing_count += 1

    # Insert remaining triples (if any)
    if triples_batch:
        insert_triples_in_batch(triples_batch)

    logger.info(f"Out of {count} ENSEMBL IDs, {missing_count} are missing")


# Allow running this module directly (outside the pipeline_mapper entry point).
if __name__ == "__main__":
    uniprot_ensembl_mapper()

0 comments on commit 2e3b2be

Please sign in to comment.