diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 7646069..3d79891 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,6 +1,8 @@ [bumpversion] -current_version = 1.0.0 +current_version = 1.2.0 commit = True tag = False [bumpversion:file:setup.py] + +[bumpversion:file:Dockerfile] diff --git a/Dockerfile b/Dockerfile index c3b96a5..5134ee2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,18 @@ -FROM hailgenetics/hail:0.2.127-py3.11 +FROM python:3.10-bullseye -COPY scripts /scripts -COPY requirements.txt /scripts/ +# take as a command line argument, or +ARG RELEASE=${RELEASE:-1.2.0} -RUN pip install --no-cache-dir -r /scripts/requirements.txt +RUN apt update && apt install -y \ + apt-transport-https \ + bzip2 \ + ca-certificates \ + git \ + gnupg \ + openjdk-11-jdk-headless \ + wget \ + zip && \ + rm -r /var/lib/apt/lists/* && \ + rm -r /var/cache/apt/* -WORKDIR /scripts +RUN pip install --no-cache-dir git+https://github.com/populationgenomics/ClinvArbitration.git@${RELEASE} diff --git a/clinvarbitration/clinvar_by_codon.py b/clinvarbitration/clinvar_by_codon.py new file mode 100644 index 0000000..a1b4053 --- /dev/null +++ b/clinvarbitration/clinvar_by_codon.py @@ -0,0 +1,122 @@ +""" +Method file for re-sorting clinvar annotations by codon + +Takes a VCF of annotated Pathogenic Clinvar Variants +re-indexes the data to be queryable on Transcript and Codon +writes the resulting Hail Table to the specified path + +Data as input for this script should be a VCF, annotated by VEP 110 +Compatibility with other versions of VEP is not guaranteed + +This makes the assumption that the annotated data here +has been generated by summarise_clinvar_entries.py: + +- SNV only +- Clinvar Pathogenic only +- ClinVar decision/alleles/gold stars are in INFO +""" + +import json +import logging +from argparse import ArgumentParser +from collections import defaultdict + +import hail as hl +from cyvcf2 import VCF, Variant + + +def pull_vep_from_header(vcf: VCF) -> list[str]: + 
""" + yank the CSQ line out of the VCF header + """ + for element in vcf.header_iter(): + if element['HeaderType'] == 'INFO' and element['ID'] == 'CSQ': + return list(entry.lower() for entry in element['Description'].split('Format: ')[-1].rstrip('"').split('|')) + raise IndexError('CSQ element not found in header') + + +def variant_consequences(variant: Variant, csq_header: list[str]) -> list[dict[str, str]]: + """ + extracts the consequences for each transcript in this variant + + Args: + variant (Variant): + csq_header (): + + Returns: + a list of all CSQ entries, cast as a dict + """ + + consequences: list[dict[str, str]] = [] + for csq in variant.INFO['CSQ'].split(','): + csq_dict = dict(zip(csq_header, csq.split('|'), strict=True)) + if 'missense_variant' in csq_dict['consequence']: + consequences.append(csq_dict) + return consequences + + +def cli_main(): + """ + alternative access point with CLI arguments + """ + logging.basicConfig(level=logging.INFO) + parser = ArgumentParser() + parser.add_argument('-i', help='Path to the annotated VCF') + parser.add_argument('-o', help='Root to export PM5 table and JSON to') + args = parser.parse_args() + main(input_vcf=args.i, output_root=args.o) + + +def main(input_vcf: str, output_root: str): + """ + + Args: + input_vcf (str): path to an input vcf + output_root (): + """ + + # crack open a cold VCF, and have a sip + vcf_reader = VCF(input_vcf) + + # find the header encoding all the VEP fields + header_csq = pull_vep_from_header(vcf_reader) + + clinvar_dict = defaultdict(set) + + # iterate over the variants + for variant in vcf_reader: + # extract the clinvar details (added in previous script) + clinvar_allele = variant.INFO['allele_id'] + clinvar_stars = variant.INFO['gold_stars'] + clinvar_key = f'{clinvar_allele}:{clinvar_stars}' + + # iterate over all missense consequences + for csq_dict in variant_consequences(variant, header_csq): + # add this clinvar entry in relation to the protein consequence + protein_key = 
f"{csq_dict['ensp']}:{csq_dict['protein_position']}" + clinvar_dict[protein_key].add(clinvar_key) + + # save the dictionary locally + json_out_path = f'{output_root}.json' + with open(json_out_path, 'w') as f: + for key, value in clinvar_dict.items(): + new_dict = {'newkey': key, 'clinvar_alleles': '+'.join(value)} + f.write(f'{json.dumps(new_dict)}\n') + + logging.info(f'JSON written to {json_out_path}') + + # now set a schema to read that into a table... if you want hail + schema = hl.dtype('struct{newkey:str,clinvar_alleles:str}') + + # import the table, and transmute to top-level attributes + ht = hl.import_table(json_out_path, no_header=True, types={'f0': schema}) + ht = ht.transmute(**ht.f0) + ht = ht.key_by(ht.newkey) + + # write out + ht.write(f'{output_root}.ht', overwrite=True) + logging.info(f'Hail Table written to {output_root}.ht') + + +if __name__ == '__main__': + cli_main() diff --git a/clinvarbitration/clinvar_by_codon_from_mt.py b/clinvarbitration/clinvar_by_codon_from_mt.py index 64e921e..171b0b4 100644 --- a/clinvarbitration/clinvar_by_codon_from_mt.py +++ b/clinvarbitration/clinvar_by_codon_from_mt.py @@ -1,15 +1,14 @@ """ -Method file for re-sorting clinvar annotations by codon +Method file for re-sorting clinvar annotations by codon (taking annotated MatrixTable as input) -This makes the assumption that the annotated data here -has been generated by summarise_clinvar_entries.py: +This makes the assumption that the annotated data here has been generated by summarise_clinvar_entries.py: - SNV only - Clinvar Pathogenic only - ClinVar decision/alleles/gold stars are in INFO -In almost all use-cases the alternative form based on annotated VCFs -will be used in place of this, but it's retained here just in case. +In almost all use-cases the alternative form based on annotated VCFs will be used in place of this, +but it's retained here just in case. 
""" from argparse import ArgumentParser diff --git a/clinvarbitration/clinvar_by_codon_from_vcf.py b/clinvarbitration/clinvar_by_codon_from_vcf.py deleted file mode 100644 index d5e639e..0000000 --- a/clinvarbitration/clinvar_by_codon_from_vcf.py +++ /dev/null @@ -1,116 +0,0 @@ -""" -Method file for re-sorting clinvar annotations by codon - -Data as input for this script should be a VCF, annotated by VEP 110 -Compatibility with other versions of VEP is not guaranteed - -This makes the assumption that the annotated data here -has been generated by summarise_clinvar_entries.py: - -- SNV only -- Clinvar Pathogenic only -- ClinVar decision/alleles/gold stars are in INFO -""" - -import json -from argparse import ArgumentParser -from collections import defaultdict - -import hail as hl -from cyvcf2 import VCF, Variant - -""" -Takes a VCF of annotated Pathogenic Clinvar Variants -re-indexes the data to be queryable on Transcript and Codon -writes the resulting Hail Table to the specified path -""" - -# update these values to reflect the VEP version in use -# the first should be the ENSP ID -# the second should be the residue in the protein -PROTEIN_ID = 'ensp' -PROTEIN_POSITION = 'protein_position' - - -def pull_vep_from_header(vcf: VCF) -> list[str]: - """ - yank the CSQ line out of the VCF header - """ - for element in vcf.header_iter(): - if element['HeaderType'] == 'INFO' and element['ID'] == 'CSQ': - return list( - map( - str.lower, - element['Description'].split('Format: ')[-1].rstrip('"').split('|'), - ), - ) - raise IndexError('CSQ element not found in header') - - -def variant_consequences( - variant: Variant, - csq_header: list[str], -) -> list[dict[str, str]]: - """ - extracts the consequences for each transcript in this variant - - Args: - variant (Variant): - csq_header (): - - Returns: - a list of all CSQ entries, cast as a dict - """ - - consequences: list[dict[str, str]] = [] - for csq in variant.INFO['CSQ'].split(','): - csq_dict = dict(zip(csq_header, 
csq.split('|'), strict=True)) - if 'missense_variant' in csq_dict['consequence']: - consequences.append(csq_dict) - return consequences - - -parser = ArgumentParser() -parser.add_argument('-i', help='Path to the annotated VCF') -parser.add_argument('-o', help='Root to export PM5 table and JSON to') -args = parser.parse_args() - -# crack open a cold VCF, and have a sip -vcf_reader = VCF(args.i) -# find the header encoding all the VEP fields -header_csq = pull_vep_from_header(vcf_reader) - -clinvar_dict = defaultdict(set) - -# iterate over the variants -for variant in vcf_reader: - # extract the clinvar details (added in previous script) - clinvar_allele = variant.INFO['allele_id'] - clinvar_stars = variant.INFO['gold_stars'] - clinvar_key = f'{clinvar_allele}:{clinvar_stars}' - - # iterate over all missense consequences - for csq_dict in variant_consequences(variant, header_csq): - # add this clinvar entry in relation to the protein consequence - protein_key = f"{csq_dict[PROTEIN_ID]}:{csq_dict[PROTEIN_POSITION]}" - clinvar_dict[protein_key].add(clinvar_key) - -# save the dictionary locally -json_out_path = f'{args.o}.json' -with open(json_out_path, 'w') as f: - for key, value in clinvar_dict.items(): - new_dict = {'newkey': key, 'clinvar_alleles': '+'.join(value)} - f.write(f'{json.dumps(new_dict)}\n') -print(f'JSON written to {json_out_path}') - -# now set a schema to read that into a table... 
if you want hail -schema = hl.dtype('struct{newkey:str,clinvar_alleles:str}') - -# import the table, and transmute to top-level attributes -ht = hl.import_table(json_out_path, no_header=True, types={'f0': schema}) -ht = ht.transmute(**ht.f0) -ht = ht.key_by(ht.newkey) - -# write out -ht.write(f'{args.o}.ht', overwrite=True) -print(f'Hail Table written to {args.o}.ht') diff --git a/clinvarbitration/resummarise.py b/clinvarbitration/resummarise_clinvar.py similarity index 83% rename from clinvarbitration/resummarise.py rename to clinvarbitration/resummarise_clinvar.py index d9b8ff2..a610275 100644 --- a/clinvarbitration/resummarise.py +++ b/clinvarbitration/resummarise_clinvar.py @@ -52,16 +52,19 @@ # I really want the linter to just tolerate naive datetimes, but it won't TIMEZONE = zoneinfo.ZoneInfo('Australia/Brisbane') + # published Nov 2015, available pre-print since March 2015 # assumed to be influential since 2016 - ACMG_THRESHOLD = datetime(year=2016, month=1, day=1, tzinfo=TIMEZONE) + +# a default date assigned to un-dated entries VERY_OLD = datetime(year=1970, month=1, day=1, tzinfo=TIMEZONE) + LARGEST_COMPLEX_INDELS = 40 BASES = re.compile(r'[ACGTN]+') # add the exact name of any submitters whose evidence is not trusted -BLACKLIST: list[str] = [] +BLACKLIST: set[str] = set() class Consequence(Enum): @@ -130,14 +133,8 @@ def get_allele_locus_map(summary_file: str) -> dict: if chromosome not in ORDERED_ALLELES: continue - # skip chromosomal deletions and insertions, mito, or massive indels - if ( - ref == 'na' - or alt == 'na' - or ref == alt - or 'm' in chromosome.lower() - or (len(ref) + len(alt)) > LARGEST_COMPLEX_INDELS - ): + # skip chromosomal deletions and insertions, or massive indels + if ref == 'na' or alt == 'na' or ref == alt or (len(ref) + len(alt)) > LARGEST_COMPLEX_INDELS: continue # don't include any of the trash bases in ClinVar @@ -187,12 +184,7 @@ def consequence_decision(subs: list[Submission]) -> Consequence: decision = 
Consequence.UNCERTAIN # establish counts for this allele - counts = { - Consequence.BENIGN: 0, - Consequence.PATHOGENIC: 0, - Consequence.UNCERTAIN: 0, - 'total': 0, - } + counts = {Consequence.BENIGN: 0, Consequence.PATHOGENIC: 0, Consequence.UNCERTAIN: 0, 'total': 0} for each_sub in subs: # for 3/4-star ratings, don't look any further @@ -200,11 +192,7 @@ def consequence_decision(subs: list[Submission]) -> Consequence: return each_sub.classification counts['total'] += 1 - if each_sub.classification in [ - Consequence.PATHOGENIC, - Consequence.BENIGN, - Consequence.UNCERTAIN, - ]: + if each_sub.classification in [Consequence.PATHOGENIC, Consequence.BENIGN, Consequence.UNCERTAIN]: counts[each_sub.classification] += 1 if counts[Consequence.PATHOGENIC] and counts[Consequence.BENIGN]: @@ -263,7 +251,7 @@ def check_stars(subs: list[Submission]) -> int: return minimum -def process_line(data: list[str]) -> tuple[int, Submission]: +def process_submission_line(data: list[str]) -> tuple[int, Submission]: """ takes a line, strips out useful content as a 'Submission' @@ -273,7 +261,7 @@ def process_line(data: list[str]) -> tuple[int, Submission]: Returns: the allele ID and corresponding Submission details """ - allele_id = int(data[0]) + var_id = int(data[0]) if data[1] in PATH_SIGS: classification = Consequence.PATHOGENIC elif data[1] in BENIGN_SIGS: @@ -286,7 +274,7 @@ def process_line(data: list[str]) -> tuple[int, Submission]: sub = data[9].lower() rev_status = data[6].lower() - return allele_id, Submission(date, sub, classification, rev_status) + return var_id, Submission(date, sub, classification, rev_status) def dict_list_to_ht(list_of_dicts: list) -> hl.Table: @@ -307,7 +295,7 @@ def dict_list_to_ht(list_of_dicts: list) -> hl.Table: return hl.Table.from_pandas(pdf, key=['locus', 'alleles']) -def get_all_decisions(submission_file: str, allele_ids: set[int]) -> dict[int, list[Submission]]: +def get_all_decisions(submission_file: str, var_ids: set[int]) -> dict[int, 
list[Submission]]: """ obtains all submissions per-allele which pass basic criteria - not a blacklisted submitter @@ -315,23 +303,22 @@ def get_all_decisions(submission_file: str, allele_ids: set[int]) -> dict[int, l Args: submission_file (): file containing submission-per-line - allele_ids (): only process alleleIDs we have pos data for + var_ids (): only process Var IDs we have pos data for Returns: - dictionary of alleles and their corresponding submissions + dictionary of var IDs and their corresponding submissions """ submission_dict = defaultdict(list) for line in lines_from_gzip(submission_file): - a_id, line_sub = process_line(line) + var_id, line_sub = process_submission_line(line) # skip rows where the variantID isn't in this mapping # this saves a little effort on haplotypes, CNVs, and SVs if ( - (a_id not in allele_ids) + (var_id not in var_ids) or (line_sub.submitter in BLACKLIST) - or (line_sub.review_status in USELESS_RATINGS) or (line_sub.classification == Consequence.UNKNOWN) ): continue @@ -341,7 +328,7 @@ def get_all_decisions(submission_file: str, allele_ids: set[int]) -> dict[int, l if line_sub.classification == consequence and line_sub.submitter in submitters: continue - submission_dict[a_id].append(line_sub) + submission_dict[var_id].append(line_sub) return submission_dict @@ -402,9 +389,7 @@ def parse_into_table(json_path: str, out_path: str) -> hl.Table: # start a hail runtime hl.init(default_reference='GRCh38') - - # # may need this as a subsequent line, depending on the Hail version being used - # hl.default_reference(hl.get_reference('GRCh38')) # noqa: ERA001 + # hl.context.init_local(default_reference='GRCh38') # define the schema for each written line schema = hl.dtype( @@ -428,7 +413,9 @@ def parse_into_table(json_path: str, out_path: str) -> hl.Table: # write out to the specified location ht.write(f'{out_path}.ht', overwrite=True) - return ht + + # read the localised version + return hl.read_table(f'{out_path}.ht') def 
write_vep_vcf(clinvar_table: hl.Table, output_root: str): @@ -486,7 +473,45 @@ def snv_missense_filter(clinvar_table: hl.Table, output_root: str): logging.info(f'Wrote SNV VCF to {vcf_path}') -def main(subs: str, variants: str, output_root: str): +def cli_main(): + logging.basicConfig(level=logging.INFO) + parser = ArgumentParser() + parser.add_argument('-s', help='submission_summary.txt.gz from NCBI', required=True) + parser.add_argument('-v', help='variant_summary.txt.gz from NCBI', required=True) + parser.add_argument('-o', help='output root, for table, json, and path-only VCF', required=True) + parser.add_argument('--minimal', help='only keep path. and 1+ star benign', action='store_true') + parser.add_argument('-b', help='submitters to blacklist', nargs='+', default=[]) + args = parser.parse_args() + + # if submitters are blacklisted on the CLI, update the global BLACKLIST value + # temporary solution while we continue to validate Talos + if args.b: + BLACKLIST.update(args.b) + + main(subs=args.s, variants=args.v, output_root=args.o, minimal=args.minimal) + + +def only_keep_talos_relevant_entries(results: list[dict]) -> list[dict]: + """ + filters the results to only those used in Talos: + - all Pathogenic ratings + - all Benign with >= 1 Star + + Args: + results (list[dict]): all results + + Returns: + the same results, but reduced + """ + return [ + result + for result in results + if (result['clinical_significance'] == Consequence.PATHOGENIC.value) + or ((result['clinical_significance'] == Consequence.BENIGN.value) and (result['gold_stars'] > 0)) + ] + + +def main(subs: str, variants: str, output_root: str, minimal: bool): + """ + Redefines what it is to be a clinvar summary + @@ -494,21 +519,23 @@ def main(subs: str, variants: str, output_root: str): subs (str): file path to all submissions (gzipped) variants (str): file path to variant summary (gzipped) output_root (str): path to write JSON out to + minimal (bool): only keep the talos-relevant entries """ 
logging.info('Getting alleleID-VariantID-Loci from variant summary') allele_map = get_allele_locus_map(variants) - logging.info('Getting all decisions, indexed on clinvar AlleleID') + logging.info('Getting all decisions, indexed on clinvar Var ID') + # the raw IDs - some have ambiguous X/Y mappings all_uniq_ids = {x['var_id'] for x in allele_map.values()} - decision_dict = get_all_decisions(submission_file=subs, allele_ids=all_uniq_ids) + decision_dict = get_all_decisions(submission_file=subs, var_ids=all_uniq_ids) # placeholder to fill wth per-allele decisions all_decisions = {} # now filter each set of decisions per allele - for allele_id, submissions in decision_dict.items(): + for var_id, submissions in decision_dict.items(): # filter against ACMG date, if appropriate filtered_submissions = acmg_filter_submissions(submissions) @@ -525,11 +552,12 @@ def main(subs: str, variants: str, output_root: str): if rating in [Consequence.UNCERTAIN, Consequence.UNKNOWN]: continue - all_decisions[allele_id] = (rating, stars) + all_decisions[var_id] = (rating, stars) # now match those up with the variant coordinates + logging.info('Matching decisions to variant coordinates') complete_decisions = [] - for var_details in allele_map.values(): + for uniq_var_id, var_details in allele_map.items(): var_id = var_details['var_id'] # we may have found no relevant submissions for this variant @@ -544,10 +572,17 @@ def main(subs: str, variants: str, output_root: str): 'position': var_details['pos'], 'clinical_significance': all_decisions[var_id][0].value, 'gold_stars': all_decisions[var_details['var_id']][1], - 'allele_id': var_id, + 'allele_id': allele_map[uniq_var_id]['allele'], }, ) + # optionally, filter to just minimal useful entries + if minimal: + logging.info('Producing the reduced output set - Pathogenic and Strong Benign') + complete_decisions = only_keep_talos_relevant_entries(complete_decisions) + + logging.info(f'{len(complete_decisions)} ClinVar entries remain') + # sort 
all collected decisions, trying to reduce overhead in HT later complete_decisions_sorted = sort_decisions(complete_decisions) @@ -561,6 +596,7 @@ def main(subs: str, variants: str, output_root: str): for each_dict in complete_decisions_sorted: handle.write(f'{json.dumps(each_dict)}\n') + logging.info('JSON written to file, parsing into a Hail Table') ht = parse_into_table(json_path=json_output, out_path=output_root) # export this table of decisions as a tabix-indexed VCF @@ -572,12 +608,4 @@ def main(subs: str, variants: str, output_root: str): if __name__ == '__main__': - logging.basicConfig(level=logging.INFO) - - parser = ArgumentParser() - parser.add_argument('-s', help='submission_summary.txt.gz from NCBI', required=True) - parser.add_argument('-v', help='variant_summary.txt.gz from NCBI', required=True) - parser.add_argument('-o', help='output root, for table, json, and pathogenic-variants-only VCF', required=True) - args = parser.parse_args() - - main(subs=args.s, variants=args.v, output_root=args.o) + cli_main() diff --git a/data/pathogenic_annotated.vcf.bgz b/data/pathogenic_annotated.vcf.bgz index a233858..20030d8 100644 Binary files a/data/pathogenic_annotated.vcf.bgz and b/data/pathogenic_annotated.vcf.bgz differ diff --git a/data/pathogenic_annotated.vcf.bgz.tbi b/data/pathogenic_annotated.vcf.bgz.tbi index f0beb00..98eb0f7 100644 Binary files a/data/pathogenic_annotated.vcf.bgz.tbi and b/data/pathogenic_annotated.vcf.bgz.tbi differ diff --git a/example_script.sh b/example_script.sh index 85294c5..b6575f7 100644 --- a/example_script.sh +++ b/example_script.sh @@ -3,19 +3,19 @@ set -ex # create a docker image from this repository -docker build -t hail_clinvar:example --platform linux/amd64 . +docker build -t clinvarbitration:example --platform linux/amd64 . # make local copies of the NCBI data files required as input using wget # create a directory called data, if one doesn't already exist if [ ! 
-d data ]; then mkdir data fi -wget -O data/variant_summary.txt.gz https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz -wget -O data/submission_summary.txt.gz https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz +wget -O data/variant_summary.txt.gz https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz +wget -O data/submission_summary.txt.gz https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz # run the docker image to generate the summarised output -docker run --platform linux/amd64 -v "$(pwd)/data":/data hail_clinvar:example \ - /bin/bash -c "python3 /scripts/resummarise.py -v /data/variant_summary.txt.gz -s /data/submission_summary.txt.gz -o /data/clinvar_summary" +docker run -v "$(pwd)/data":/data clinvarbitration:example \ + resummary -v "/data/variant_summary.txt.gz" -s "/data/submission_summary.txt.gz" -o "/data/clinvar_summary" --minimal # upon completion, this will have generated files in the data directory: # - data/clinvar_summary.json - a JSON file containing the summarised data entries, one json object per line
data/clinvar_summary.vcf.bgz, with protein consequence annotation per transcript +## Let's imagine you did that, and the result is in data/pathogenic_annotated.vcf.bgz +## I've enclosed a 10-variant example of this, as annotated by https://www.ensembl.org/Homo_sapiens/Tools/VEP +#docker run --platform linux/amd64 -v "$(pwd)/data":/data clinvarbitration:example \ +# /bin/bash -c "python3 /clinvarbitration/clinvar_by_codon.py -i /data/pathogenic_annotated.vcf.bgz -o /data/pm5" # upon completion, this will generate files in the data directory: # - data/pm5.json - a JSON file containing the PM5 results, one JSON object per line diff --git a/pull_request_template.md b/pull_request_template.md new file mode 100644 index 0000000..2dd1fb9 --- /dev/null +++ b/pull_request_template.md @@ -0,0 +1,14 @@ +# Purpose + + - < The reason for this PR > + +## Proposed Changes + + - + - + +## Checklist + +- [ ] Related GitHub Issue created +- [ ] Tests covering new change +- [ ] Linting checks pass diff --git a/requirements.txt b/requirements.txt index 3990e25..f6460ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ cyvcf2>=0.30.28 -hail==0.2.127 +hail>=0.2.128 pandas>=2.0.3 +pyspark>=3.3.3 diff --git a/setup.py b/setup.py index 5da99d9..83eb3ce 100644 --- a/setup.py +++ b/setup.py @@ -23,12 +23,12 @@ def read_reqs(filename: str) -> list[str]: setup( - name='ClinvArbitration', + name='clinvarbitration', description='CPG ClinVar Re-interpretation', long_description=readme, - version='1.0.0', + version='1.2.0', author='Matthew Welland, CPG', - author_email=('matthew.welland@populationgenomics.org.au, ' 'cas.simons@populationgenomics.org.au'), + author_email='matthew.welland@populationgenomics.org.au, cas.simons@populationgenomics.org.au', url='https://github.com/populationgenomics/ClinvArbitration', license='MIT', classifiers=[ @@ -46,7 +46,13 @@ def read_reqs(filename: str) -> list[str]: packages=find_packages(), include_package_data=True, 
install_requires=read_reqs('requirements.txt'), - extras_require={ - 'test': read_reqs('requirements-dev.txt'), + extras_require={'test': read_reqs('requirements-dev.txt')}, + entry_points={ + 'console_scripts': [ + # Step 1; re-summarise ClinVar using altered conflict resolution + 'resummary = clinvarbitration.resummarise_clinvar:cli_main', + # Step 2, post-annotation; obtain PM5 annotations from VEP annotated clinvar + 'pm5_table = clinvarbitration.clinvar_by_codon:cli_main', + ], }, ) diff --git a/test/test_resummarise.py b/test/test_resummarise.py index c485ef7..11dc5a5 100644 --- a/test/test_resummarise.py +++ b/test/test_resummarise.py @@ -4,7 +4,7 @@ import pytest import zoneinfo -from clinvarbitration.resummarise import Consequence, Submission, consequence_decision +from clinvarbitration.resummarise_clinvar import Consequence, Submission, consequence_decision TIMEZONE = zoneinfo.ZoneInfo('Australia/Brisbane') BASIC_SUB = Submission(datetime.now(tz=TIMEZONE), 'foo', Consequence.UNKNOWN, 'review')