generated from populationgenomics/cpg-python-template-repo
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Corrections to Allele/Var ID logic (#4)
- Loading branch information
1 parent
6611e68
commit 7b7bb44
Showing
13 changed files
with
261 additions
and
195 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
[bumpversion] | ||
current_version = 1.0.0 | ||
current_version = 1.2.0 | ||
commit = True | ||
tag = False | ||
|
||
[bumpversion:file:setup.py] | ||
|
||
[bumpversion:file:Dockerfile] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,18 @@ | ||
FROM hailgenetics/hail:0.2.127-py3.11 | ||
FROM python:3.10-bullseye | ||
|
||
COPY scripts /scripts | ||
COPY requirements.txt /scripts/ | ||
# take as a command line argument, or | ||
ARG RELEASE=${RELEASE:-1.2.0} | ||
|
||
RUN pip install --no-cache-dir -r /scripts/requirements.txt | ||
RUN apt update && apt install -y \ | ||
apt-transport-https \ | ||
bzip2 \ | ||
ca-certificates \ | ||
git \ | ||
gnupg \ | ||
openjdk-11-jdk-headless \ | ||
wget \ | ||
zip && \ | ||
rm -r /var/lib/apt/lists/* && \ | ||
rm -r /var/cache/apt/* | ||
|
||
WORKDIR /scripts | ||
RUN pip install --no-cache-dir git+https://github.com/populationgenomics/ClinvArbitration.git@${RELEASE} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
""" | ||
Method file for re-sorting clinvar annotations by codon | ||
Takes a VCF of annotated Pathogenic Clinvar Variants | ||
re-indexes the data to be queryable on Transcript and Codon | ||
writes the resulting Hail Table to the specified path | ||
Data as input for this script should be a VCF, annotated by VEP 110 | ||
Compatibility with other versions of VEP is not guaranteed | ||
This makes the assumption that the annotated data here | ||
has been generated by summarise_clinvar_entries.py: | ||
- SNV only | ||
- Clinvar Pathogenic only | ||
- ClinVar decision/alleles/gold stars are in INFO | ||
""" | ||
|
||
import json | ||
import logging | ||
from argparse import ArgumentParser | ||
from collections import defaultdict | ||
|
||
import hail as hl | ||
from cyvcf2 import VCF, Variant | ||
|
||
|
||
def pull_vep_from_header(vcf: VCF) -> list[str]: | ||
""" | ||
yank the CSQ line out of the VCF header | ||
""" | ||
for element in vcf.header_iter(): | ||
if element['HeaderType'] == 'INFO' and element['ID'] == 'CSQ': | ||
return list(entry.lower() for entry in element['Description'].split('Format: ')[-1].rstrip('"').split('|')) | ||
raise IndexError('CSQ element not found in header') | ||
|
||
|
||
def variant_consequences(variant: Variant, csq_header: list[str]) -> list[dict[str, str]]: | ||
""" | ||
extracts the consequences for each transcript in this variant | ||
Args: | ||
variant (Variant): | ||
csq_header (): | ||
Returns: | ||
a list of all CSQ entries, cast as a dict | ||
""" | ||
|
||
consequences: list[dict[str, str]] = [] | ||
for csq in variant.INFO['CSQ'].split(','): | ||
csq_dict = dict(zip(csq_header, csq.split('|'), strict=True)) | ||
if 'missense_variant' in csq_dict['consequence']: | ||
consequences.append(csq_dict) | ||
return consequences | ||
|
||
|
||
def cli_main(): | ||
""" | ||
alternative access point with CLI arguments | ||
""" | ||
logging.basicConfig(level=logging.INFO) | ||
parser = ArgumentParser() | ||
parser.add_argument('-i', help='Path to the annotated VCF') | ||
parser.add_argument('-o', help='Root to export PM5 table and JSON to') | ||
args = parser.parse_args() | ||
main(input_vcf=args.i, output_root=args.o) | ||
|
||
|
||
def main(input_vcf: str, output_root: str): | ||
""" | ||
Args: | ||
input_vcf (str): path to an input vcf | ||
output_root (): | ||
""" | ||
|
||
# crack open a cold VCF, and have a sip | ||
vcf_reader = VCF(input_vcf) | ||
|
||
# find the header encoding all the VEP fields | ||
header_csq = pull_vep_from_header(vcf_reader) | ||
|
||
clinvar_dict = defaultdict(set) | ||
|
||
# iterate over the variants | ||
for variant in vcf_reader: | ||
# extract the clinvar details (added in previous script) | ||
clinvar_allele = variant.INFO['allele_id'] | ||
clinvar_stars = variant.INFO['gold_stars'] | ||
clinvar_key = f'{clinvar_allele}:{clinvar_stars}' | ||
|
||
# iterate over all missense consequences | ||
for csq_dict in variant_consequences(variant, header_csq): | ||
# add this clinvar entry in relation to the protein consequence | ||
protein_key = f"{csq_dict['ensp']}:{csq_dict['protein_position']}" | ||
clinvar_dict[protein_key].add(clinvar_key) | ||
|
||
# save the dictionary locally | ||
json_out_path = f'{output_root}.json' | ||
with open(json_out_path, 'w') as f: | ||
for key, value in clinvar_dict.items(): | ||
new_dict = {'newkey': key, 'clinvar_alleles': '+'.join(value)} | ||
f.write(f'{json.dumps(new_dict)}\n') | ||
|
||
logging.info(f'JSON written to {json_out_path}') | ||
|
||
# now set a schema to read that into a table... if you want hail | ||
schema = hl.dtype('struct{newkey:str,clinvar_alleles:str}') | ||
|
||
# import the table, and transmute to top-level attributes | ||
ht = hl.import_table(json_out_path, no_header=True, types={'f0': schema}) | ||
ht = ht.transmute(**ht.f0) | ||
ht = ht.key_by(ht.newkey) | ||
|
||
# write out | ||
ht.write(f'{output_root}.ht', overwrite=True) | ||
logging.info(f'Hail Table written to {output_root}.ht') | ||
|
||
|
||
if __name__ == '__main__': | ||
cli_main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.