Skip to content

Commit

Permalink
Corrections to Allele/Var ID logic (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
MattWellie authored Jun 28, 2024
1 parent 6611e68 commit 7b7bb44
Show file tree
Hide file tree
Showing 13 changed files with 261 additions and 195 deletions.
4 changes: 3 additions & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
[bumpversion]
current_version = 1.0.0
current_version = 1.2.0
commit = True
tag = False

[bumpversion:file:setup.py]

[bumpversion:file:Dockerfile]
20 changes: 15 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
FROM hailgenetics/hail:0.2.127-py3.11
FROM python:3.10-bullseye

COPY scripts /scripts
COPY requirements.txt /scripts/
# take as a command line argument, or
ARG RELEASE=${RELEASE:-1.2.0}

RUN pip install --no-cache-dir -r /scripts/requirements.txt
RUN apt update && apt install -y \
apt-transport-https \
bzip2 \
ca-certificates \
git \
gnupg \
openjdk-11-jdk-headless \
wget \
zip && \
rm -r /var/lib/apt/lists/* && \
rm -r /var/cache/apt/*

WORKDIR /scripts
RUN pip install --no-cache-dir git+https://github.com/populationgenomics/ClinvArbitration.git@${RELEASE}
122 changes: 122 additions & 0 deletions clinvarbitration/clinvar_by_codon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""
Method file for re-sorting clinvar annotations by codon
Takes a VCF of annotated Pathogenic Clinvar Variants
re-indexes the data to be queryable on Transcript and Codon
writes the resulting Hail Table to the specified path
Data as input for this script should be a VCF, annotated by VEP 110
Compatibility with other versions of VEP is not guaranteed
This makes the assumption that the annotated data here
has been generated by summarise_clinvar_entries.py:
- SNV only
- Clinvar Pathogenic only
- ClinVar decision/alleles/gold stars are in INFO
"""

import json
import logging
from argparse import ArgumentParser
from collections import defaultdict

import hail as hl
from cyvcf2 import VCF, Variant


def pull_vep_from_header(vcf: VCF) -> list[str]:
"""
yank the CSQ line out of the VCF header
"""
for element in vcf.header_iter():
if element['HeaderType'] == 'INFO' and element['ID'] == 'CSQ':
return list(entry.lower() for entry in element['Description'].split('Format: ')[-1].rstrip('"').split('|'))
raise IndexError('CSQ element not found in header')


def variant_consequences(variant: Variant, csq_header: list[str]) -> list[dict[str, str]]:
"""
extracts the consequences for each transcript in this variant
Args:
variant (Variant):
csq_header ():
Returns:
a list of all CSQ entries, cast as a dict
"""

consequences: list[dict[str, str]] = []
for csq in variant.INFO['CSQ'].split(','):
csq_dict = dict(zip(csq_header, csq.split('|'), strict=True))
if 'missense_variant' in csq_dict['consequence']:
consequences.append(csq_dict)
return consequences


def cli_main():
"""
alternative access point with CLI arguments
"""
logging.basicConfig(level=logging.INFO)
parser = ArgumentParser()
parser.add_argument('-i', help='Path to the annotated VCF')
parser.add_argument('-o', help='Root to export PM5 table and JSON to')
args = parser.parse_args()
main(input_vcf=args.i, output_root=args.o)


def main(input_vcf: str, output_root: str):
"""
Args:
input_vcf (str): path to an input vcf
output_root ():
"""

# crack open a cold VCF, and have a sip
vcf_reader = VCF(input_vcf)

# find the header encoding all the VEP fields
header_csq = pull_vep_from_header(vcf_reader)

clinvar_dict = defaultdict(set)

# iterate over the variants
for variant in vcf_reader:
# extract the clinvar details (added in previous script)
clinvar_allele = variant.INFO['allele_id']
clinvar_stars = variant.INFO['gold_stars']
clinvar_key = f'{clinvar_allele}:{clinvar_stars}'

# iterate over all missense consequences
for csq_dict in variant_consequences(variant, header_csq):
# add this clinvar entry in relation to the protein consequence
protein_key = f"{csq_dict['ensp']}:{csq_dict['protein_position']}"
clinvar_dict[protein_key].add(clinvar_key)

# save the dictionary locally
json_out_path = f'{output_root}.json'
with open(json_out_path, 'w') as f:
for key, value in clinvar_dict.items():
new_dict = {'newkey': key, 'clinvar_alleles': '+'.join(value)}
f.write(f'{json.dumps(new_dict)}\n')

logging.info(f'JSON written to {json_out_path}')

# now set a schema to read that into a table... if you want hail
schema = hl.dtype('struct{newkey:str,clinvar_alleles:str}')

# import the table, and transmute to top-level attributes
ht = hl.import_table(json_out_path, no_header=True, types={'f0': schema})
ht = ht.transmute(**ht.f0)
ht = ht.key_by(ht.newkey)

# write out
ht.write(f'{output_root}.ht', overwrite=True)
logging.info(f'Hail Table written to {output_root}.ht')


if __name__ == '__main__':
cli_main()
9 changes: 4 additions & 5 deletions clinvarbitration/clinvar_by_codon_from_mt.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
"""
Method file for re-sorting clinvar annotations by codon
Method file for re-sorting clinvar annotations by codon (taking annotated MatrixTable as input)
This makes the assumption that the annotated data here
has been generated by summarise_clinvar_entries.py:
This makes the assumption that the annotated data here has been generated by summarise_clinvar_entries.py:
- SNV only
- Clinvar Pathogenic only
- ClinVar decision/alleles/gold stars are in INFO
In almost all use-cases the alternative form based on annotated VCFs
will be used in place of this, but it's retained here just in case.
In almost all use-cases the alternative form based on annotated VCFs will be used in place of this,
but it's retained here just in case.
"""

from argparse import ArgumentParser
Expand Down
116 changes: 0 additions & 116 deletions clinvarbitration/clinvar_by_codon_from_vcf.py

This file was deleted.

Loading

0 comments on commit 7b7bb44

Please sign in to comment.