-
Notifications
You must be signed in to change notification settings - Fork 598
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
VS-815: Add Support for YNG to VQSR Lite #8206
Changes from 15 commits
7274e01
e89990f
ba8ccca
7d728f6
1cb5152
6afe873
4a6ae49
effa0f9
ff6aee6
3eb092c
71c3179
21983a1
c6bf06a
8b39a3b
3eb44c1
fe92f0e
821b4b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
version 1.0 | ||
|
||
# I am a comment. | ||
|
||
workflow GvsCalculatePrecisionAndSensitivity { | ||
input { | ||
Array[File] input_vcfs | ||
|
@@ -13,6 +15,8 @@ workflow GvsCalculatePrecisionAndSensitivity { | |
Array[File] truth_beds | ||
|
||
File ref_fasta | ||
|
||
Boolean use_classic_VQSR = true | ||
} | ||
|
||
parameter_meta { | ||
|
@@ -24,6 +28,7 @@ workflow GvsCalculatePrecisionAndSensitivity { | |
truth_vcf_indices: "A list of the VCF indices for the truth data VCFs supplied above." | ||
truth_beds: "A list of the bed files for the truth data used for analyzing the samples in `sample_names`." | ||
ref_fasta: "The cloud path for the reference fasta sequence." | ||
use_classic_VQSR: "Flag to indicate whether the input VCFs were generated using classic (standard) VQSR or 'VQSR Lite'." | ||
} | ||
|
||
# WDL 1.0 trick to set a variable ('none') to be undefined. | ||
|
@@ -75,15 +80,16 @@ workflow GvsCalculatePrecisionAndSensitivity { | |
output_basename = output_sample_basename | ||
} | ||
|
||
call Add_AS_MAX_VQSLOD_ToVcf { | ||
call Add_AS_MAX_VQS_Score_ToVcf { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it VQS score, not VQSR? Doesn't the S already stand for "score"? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah - This label is very specific to this WDL, and I was just looking for a generic term to cover both lod and calibration_sensitivity. I'll change it up. |
||
input: | ||
input_vcf = SelectVariants.output_vcf, | ||
output_basename = output_sample_basename + ".maxas" | ||
output_basename = output_sample_basename + ".maxas", | ||
use_classic_VQSR = use_classic_VQSR | ||
} | ||
|
||
call BgzipAndTabix { | ||
input: | ||
input_vcf = Add_AS_MAX_VQSLOD_ToVcf.output_vcf, | ||
input_vcf = Add_AS_MAX_VQS_Score_ToVcf.output_vcf, | ||
output_basename = output_sample_basename + ".maxas" | ||
} | ||
|
||
|
@@ -96,6 +102,7 @@ workflow GvsCalculatePrecisionAndSensitivity { | |
truth_bed = truth_beds[i], | ||
contig = contig, | ||
output_basename = sample_name + "-bq_roc_filtered", | ||
use_classic_VQSR = use_classic_VQSR, | ||
ref_fasta = ref_fasta | ||
} | ||
|
||
|
@@ -109,6 +116,7 @@ workflow GvsCalculatePrecisionAndSensitivity { | |
contig = contig, | ||
all_records = true, | ||
output_basename = sample_name + "-bq_all", | ||
use_classic_VQSR = use_classic_VQSR, | ||
ref_fasta = ref_fasta | ||
} | ||
} | ||
|
@@ -259,21 +267,25 @@ task SelectVariants { | |
} | ||
} | ||
|
||
task Add_AS_MAX_VQSLOD_ToVcf { | ||
task Add_AS_MAX_VQS_Score_ToVcf { | ||
input { | ||
File input_vcf | ||
String output_basename | ||
Boolean use_classic_VQSR | ||
|
||
String docker = "us.gcr.io/broad-dsde-methods/variantstore:2023-01-23-alpine" | ||
String docker = "us.gcr.io/broad-dsde-methods/variantstore:2023-02-21-alpine" | ||
Int cpu = 1 | ||
Int memory_mb = 3500 | ||
Int disk_size_gb = ceil(2*size(input_vcf, "GiB")) + 50 | ||
} | ||
|
||
String score_tool = if (use_classic_VQSR == true) then 'add_max_as_vqslod.py' else 'add_max_as_vqs_sens.py' | ||
|
||
|
||
command <<< | ||
set -e | ||
|
||
python3 /app/add_max_as_vqslod.py ~{input_vcf} > ~{output_basename}.vcf | ||
python3 /app/~{score_tool} ~{input_vcf} > ~{output_basename}.vcf | ||
>>> | ||
runtime { | ||
docker: docker | ||
|
@@ -333,12 +345,16 @@ task EvaluateVcf { | |
|
||
String output_basename | ||
|
||
Boolean use_classic_VQSR = true | ||
|
||
String docker = "docker.io/realtimegenomics/rtg-tools:latest" | ||
Int cpu = 1 | ||
Int memory_mb = 3500 | ||
Int disk_size_gb = ceil(2 * size(ref_fasta, "GiB")) + 50 | ||
} | ||
|
||
String max_score_tag = if (use_classic_VQSR == true) then 'MAX_AS_VQSLOD' else 'MAX_AS_VQS_SENS' | ||
|
||
command <<< | ||
set -e -o pipefail | ||
|
||
|
@@ -348,7 +364,7 @@ task EvaluateVcf { | |
~{"--region " + contig} \ | ||
~{if all_records then "--all-records" else ""} \ | ||
--roc-subset snp,indel \ | ||
--vcf-score-field=INFO.MAX_AS_VQSLOD \ | ||
--vcf-score-field=INFO.~{max_score_tag} \ | ||
-t human_REF_SDF \ | ||
-b ~{truth_vcf} \ | ||
-e ~{truth_bed}\ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,8 @@ import "GvsWarpTasks.wdl" as Tasks | |
import "GvsUtils.wdl" as Utils | ||
import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VQSRLite | ||
|
||
# This is a reminder to set the priors back to non-zero in GvsWarpTasks before you merge this PR!! TODO | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can this be done with a passed argument? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that'd be a good idea. I'm still working with Sam to make sure I run classic in the way he wants in order to remove YNG issues. So if what I've done here isn't the way (which I think is what's happening) I may remove this all - but I will try to parameterize it for future work. |
||
workflow GvsCreateFilterSet { | ||
input { | ||
Boolean go = true | ||
|
@@ -60,8 +62,8 @@ workflow GvsCreateFilterSet { | |
String fq_tranches_destination_table = "~{project_id}.~{dataset_name}.filter_set_tranches" | ||
String fq_filter_sites_destination_table = "~{project_id}.~{dataset_name}.filter_set_sites" | ||
|
||
String fq_info_destination_table_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string" | ||
String fq_info_destination_table_vqsr_lite_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string,calibration_sensitivity:float" | ||
String fq_info_destination_table_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string" | ||
String fq_info_destination_table_vqsr_lite_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,calibration_sensitivity:float,culprit:string,training_label:string,yng_status:string" | ||
|
||
call Utils.GetBQTableLastModifiedDatetime as SamplesTableDatetimeCheck { | ||
input: | ||
|
@@ -425,7 +427,7 @@ task ExtractFilterTask { | |
>>> | ||
|
||
runtime { | ||
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae" | ||
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_7274e012706cb2fa15ed3fb1e12d7e9ae28aa4a1" | ||
memory: "7 GB" | ||
disks: "local-disk 10 HDD" | ||
bootDiskSizeGb: 15 | ||
|
@@ -454,13 +456,14 @@ task PopulateFilterSetInfo { | |
|
||
String project_id | ||
|
||
File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh" | ||
File? gatk_override | ||
} | ||
meta { | ||
# Not `volatile: true` since there shouldn't be a need to re-run this if there has already been a successful execution. | ||
} | ||
|
||
File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh" | ||
|
||
command <<< | ||
set -eo pipefail | ||
|
||
|
@@ -505,7 +508,7 @@ task PopulateFilterSetInfo { | |
>>> | ||
|
||
runtime { | ||
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae" | ||
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_7274e012706cb2fa15ed3fb1e12d7e9ae28aa4a1" | ||
memory: "3500 MB" | ||
disks: "local-disk 250 HDD" | ||
bootDiskSizeGb: 15 | ||
|
@@ -561,7 +564,7 @@ task PopulateFilterSetSites { | |
>>> | ||
|
||
runtime { | ||
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae" | ||
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_7274e012706cb2fa15ed3fb1e12d7e9ae28aa4a1" | ||
memory: "3500 MB" | ||
disks: "local-disk 200 HDD" | ||
bootDiskSizeGb: 15 | ||
|
@@ -608,7 +611,7 @@ task PopulateFilterSetTranches { | |
>>> | ||
|
||
runtime { | ||
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae" | ||
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_7274e012706cb2fa15ed3fb1e12d7e9ae28aa4a1" | ||
memory: "3500 MB" | ||
disks: "local-disk 200 HDD" | ||
bootDiskSizeGb: 15 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import sys | ||
import gzip | ||
import itertools | ||
|
||
# Add new header for MAX_AS_VQS_SENS | ||
|
||
with gzip.open(sys.argv[1], 'rt') as file1: | ||
for line in file1: | ||
line = line.strip() | ||
|
||
if "##" in line: | ||
print(line) | ||
continue | ||
|
||
if "#CHROM" in line: | ||
print('##INFO=<ID=MAX_AS_VQS_SENS,Number=1,Type=Float,Description="Maximum of AS_VQS_SENS scores">') | ||
print(line) | ||
continue | ||
|
||
parts = line.split("\t") | ||
|
||
# strip out hard filtered sites, so vcfeval can use "all-records" to plot the ROC curves | ||
if ("ExcessHet" in parts[6] or "LowQual" in parts[6] or "NO_HQ_GENOTYPES" in parts[6]): | ||
continue | ||
|
||
info = parts[7] | ||
d = dict([ tuple(x.split("=")) for x in info.split(";") if "=" in x]) | ||
|
||
format_key = [x for x in parts[8].split(":")] | ||
sample_data = dict(zip(format_key, parts[9].split(":"))) | ||
|
||
gt = sample_data['GT'] | ||
|
||
if gt == "0/0" or gt == "./.": | ||
continue | ||
|
||
if 'FT' in sample_data: | ||
ft = sample_data['FT'] | ||
|
||
# if there is a non-passing FT value | ||
if not (ft == "PASS" or ft == "."): | ||
|
||
# overwrite FILTER if it was PASS or "." | ||
if (parts[6] == "PASS" or parts[6] == "."): | ||
parts[6] = ft | ||
# otherwise append it to the end | ||
else: | ||
parts[6] = parts[6] + "," + ft | ||
|
||
if "AS_VQS_SENS" not in d: | ||
sys.exit(f"Cannot find AS_VQS_SENS in {line}") | ||
else: | ||
if "," in d['AS_VQS_SENS']: | ||
pieces = [x for x in d['AS_VQS_SENS'].split(",") if (x != "." and x != "NaN") ] | ||
|
||
if (len(pieces) == 1): | ||
m = pieces[0] | ||
elif (len(pieces) == 0): | ||
m = "." | ||
else: | ||
m = max([float(x) for x in pieces]) | ||
else: | ||
m = d['AS_VQS_SENS'] | ||
|
||
parts[7] = f"{info};MAX_AS_VQS_SENS={m}" | ||
print("\t".join(parts)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is this here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sorry - this is temp debugging code. A stupid way I use to verify that the WDL I'm about to run in Terra has the change I just pushed. Will be removed shortly.