VS-815: Add Support for YNG to VQSR Lite #8206

Merged (17 commits, Mar 1, 2023)
Changes from 15 commits
6 changes: 3 additions & 3 deletions .dockstore.yml
@@ -95,6 +95,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-815_PullThroughYNGInVQSRLite
- name: GvsPopulateAltAllele
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
@@ -118,6 +119,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-815_PullThroughYNGInVQSRLite
- name: GvsImportGenomes
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -202,14 +204,14 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-815_PullThroughYNGInVQSRLite
- name: GvsQuickstartVcfIntegration
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
filters:
branches:
- master
- ah_var_store
- gg_ConsistentlyUseDatasetNameAsInputParameter
- name: GvsQuickstartHailIntegration
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
@@ -218,7 +220,6 @@
- master
- ah_var_store
- vs_639_hail_testing_spike
- gg_ConsistentlyUseDatasetNameAsInputParameter
- name: GvsQuickstartIntegration
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
@@ -227,7 +228,6 @@
- master
- ah_var_store
- vs_639_hail_testing_spike
- gg_ConsistentlyUseDatasetNameAsInputParameter
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
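A note on these .dockstore.yml edits: the branches listed under each workflow's filters control which branches Dockstore registers that workflow from, so adding gg_VS-815_PullThroughYNGInVQSRLite is what makes the in-progress WDLs runnable from Terra. A throwaway check along these lines (a sketch assuming PyYAML; not part of the PR) confirms which workflows will pick up a given branch:

```python
import yaml  # assumes PyYAML is installed; illustrative sketch, not PR code

# List the workflows whose Dockstore branch filters include a given branch.
with open(".dockstore.yml") as f:
    config = yaml.safe_load(f)

branch = "gg_VS-815_PullThroughYNGInVQSRLite"
synced = [wf["name"] for wf in config["workflows"]
          if branch in wf.get("filters", {}).get("branches", [])]
print(synced)  # expect the workflows this PR touches
```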
30 changes: 23 additions & 7 deletions scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
@@ -1,5 +1,7 @@
version 1.0

# I am a comment.

why is this here?

Collaborator Author:
Sorry - this is temporary debugging code: a crude way I use to verify that the WDL I'm about to run in Terra has the change I just pushed. It will be removed shortly.

workflow GvsCalculatePrecisionAndSensitivity {
input {
Array[File] input_vcfs
@@ -13,6 +15,8 @@ workflow GvsCalculatePrecisionAndSensitivity {
Array[File] truth_beds

File ref_fasta

Boolean use_classic_VQSR = true
}

parameter_meta {
@@ -24,6 +28,7 @@ workflow GvsCalculatePrecisionAndSensitivity {
truth_vcf_indices: "A list of the VCF indices for the truth data VCFs supplied above."
truth_beds: "A list of the bed files for the truth data used for analyzing the samples in `sample_names`."
ref_fasta: "The cloud path for the reference fasta sequence."
use_classic_VQSR: "Flag to indicate whether the input VCFs were generated using classic (standard) VQSR or 'VQSR Lite'."
}

# WDL 1.0 trick to set a variable ('none') to be undefined.
@@ -75,15 +80,16 @@ workflow GvsCalculatePrecisionAndSensitivity {
output_basename = output_sample_basename
}

call Add_AS_MAX_VQSLOD_ToVcf {
call Add_AS_MAX_VQS_Score_ToVcf {
Contributor:
Is it VQS score, not VQSR score? Doesn't the S already stand for "score"?

Collaborator Author:
Yeah - this label is very specific to this WDL, and I was just looking for a generic term that covers both the lod and calibration_sensitivity values. I'll change it up.

input:
input_vcf = SelectVariants.output_vcf,
output_basename = output_sample_basename + ".maxas"
output_basename = output_sample_basename + ".maxas",
use_classic_VQSR = use_classic_VQSR
}

call BgzipAndTabix {
input:
input_vcf = Add_AS_MAX_VQSLOD_ToVcf.output_vcf,
input_vcf = Add_AS_MAX_VQS_Score_ToVcf.output_vcf,
output_basename = output_sample_basename + ".maxas"
}

@@ -96,6 +102,7 @@
truth_bed = truth_beds[i],
contig = contig,
output_basename = sample_name + "-bq_roc_filtered",
use_classic_VQSR = use_classic_VQSR,
ref_fasta = ref_fasta
}

@@ -109,6 +116,7 @@
contig = contig,
all_records = true,
output_basename = sample_name + "-bq_all",
use_classic_VQSR = use_classic_VQSR,
ref_fasta = ref_fasta
}
}
@@ -259,21 +267,25 @@ task SelectVariants {
}
}

task Add_AS_MAX_VQSLOD_ToVcf {
task Add_AS_MAX_VQS_Score_ToVcf {
input {
File input_vcf
String output_basename
Boolean use_classic_VQSR

String docker = "us.gcr.io/broad-dsde-methods/variantstore:2023-01-23-alpine"
String docker = "us.gcr.io/broad-dsde-methods/variantstore:2023-02-21-alpine"
Int cpu = 1
Int memory_mb = 3500
Int disk_size_gb = ceil(2*size(input_vcf, "GiB")) + 50
}

String score_tool = if (use_classic_VQSR == true) then 'add_max_as_vqslod.py' else 'add_max_as_vqs_sens.py'


command <<<
set -e

python3 /app/add_max_as_vqslod.py ~{input_vcf} > ~{output_basename}.vcf
python3 /app/~{score_tool} ~{input_vcf} > ~{output_basename}.vcf
>>>
runtime {
docker: docker
@@ -333,12 +345,16 @@ task EvaluateVcf {

String output_basename

Boolean use_classic_VQSR = true

String docker = "docker.io/realtimegenomics/rtg-tools:latest"
Int cpu = 1
Int memory_mb = 3500
Int disk_size_gb = ceil(2 * size(ref_fasta, "GiB")) + 50
}

String max_score_tag = if (use_classic_VQSR == true) then 'MAX_AS_VQSLOD' else 'MAX_AS_VQS_SENS'

command <<<
set -e -o pipefail

@@ -348,7 +364,7 @@
~{"--region " + contig} \
~{if all_records then "--all-records" else ""} \
--roc-subset snp,indel \
--vcf-score-field=INFO.MAX_AS_VQSLOD \
--vcf-score-field=INFO.~{max_score_tag} \
-t human_REF_SDF \
-b ~{truth_vcf} \
-e ~{truth_bed}\
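To summarize the change to this file: Add_AS_MAX_VQS_Score_ToVcf and EvaluateVcf now key off use_classic_VQSR, choosing between the MAX_AS_VQSLOD and MAX_AS_VQS_SENS annotations. A quick sanity check in that spirit (illustrative only - both helper names here are hypothetical, not code from this PR) might look like:

```python
import gzip

def expected_score_tag(use_classic_vqsr: bool) -> str:
    # Mirrors the WDL ternaries: classic VQSR scores with VQSLOD,
    # VQSR Lite with calibration sensitivity.
    return "MAX_AS_VQSLOD" if use_classic_vqsr else "MAX_AS_VQS_SENS"

def header_declares_tag(vcf_path: str, tag: str) -> bool:
    # Confirm the INFO annotation vcfeval will rank on is actually declared.
    with gzip.open(vcf_path, "rt") as vcf:
        for line in vcf:
            if not line.startswith("##"):
                break  # past the meta-information lines; tag never declared
            if line.startswith(f"##INFO=<ID={tag},"):
                return True
    return False

# e.g. header_declares_tag("sample.maxas.vcf.gz", expected_score_tag(False))
```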
17 changes: 10 additions & 7 deletions scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -4,6 +4,8 @@ import "GvsWarpTasks.wdl" as Tasks
import "GvsUtils.wdl" as Utils
import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VQSRLite

# This is a reminder to set the priors back to non-zero in GvsWarpTasks before you merge this PR!! TODO

Can this be done with a passed argument?

Collaborator Author:
Yeah, that'd be a good idea. I'm still working with Sam to make sure I run classic VQSR the way he wants in order to remove the YNG issues. If what I've done here isn't the right approach (which I suspect is the case) I may remove all of this - but I will try to parameterize it for future work.

workflow GvsCreateFilterSet {
input {
Boolean go = true
@@ -60,8 +62,8 @@ workflow GvsCreateFilterSet {
String fq_tranches_destination_table = "~{project_id}.~{dataset_name}.filter_set_tranches"
String fq_filter_sites_destination_table = "~{project_id}.~{dataset_name}.filter_set_sites"

String fq_info_destination_table_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string"
String fq_info_destination_table_vqsr_lite_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string,calibration_sensitivity:float"
String fq_info_destination_table_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string"
String fq_info_destination_table_vqsr_lite_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,calibration_sensitivity:float,culprit:string,training_label:string,yng_status:string"

call Utils.GetBQTableLastModifiedDatetime as SamplesTableDatetimeCheck {
input:
@@ -425,7 +427,7 @@ task ExtractFilterTask {
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae"
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_7274e012706cb2fa15ed3fb1e12d7e9ae28aa4a1"
memory: "7 GB"
disks: "local-disk 10 HDD"
bootDiskSizeGb: 15
@@ -454,13 +456,14 @@ task PopulateFilterSetInfo {

String project_id

File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh"
File? gatk_override
}
meta {
# Not `volatile: true` since there shouldn't be a need to re-run this if there has already been a successful execution.
}

File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh"

command <<<
set -eo pipefail

@@ -505,7 +508,7 @@
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae"
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_7274e012706cb2fa15ed3fb1e12d7e9ae28aa4a1"
memory: "3500 MB"
disks: "local-disk 250 HDD"
bootDiskSizeGb: 15
@@ -561,7 +564,7 @@ task PopulateFilterSetSites {
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae"
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_7274e012706cb2fa15ed3fb1e12d7e9ae28aa4a1"
memory: "3500 MB"
disks: "local-disk 200 HDD"
bootDiskSizeGb: 15
@@ -608,7 +611,7 @@ task PopulateFilterSetTranches {
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae"
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_7274e012706cb2fa15ed3fb1e12d7e9ae28aa4a1"
memory: "3500 MB"
disks: "local-disk 200 HDD"
bootDiskSizeGb: 15
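One more note on this file: the filter_set_info schema strings near the top are plain "name:type" lists, and the updated VQSR Lite variant drops vqslod:float in favor of calibration_sensitivity:float. A minimal sketch (not PR code) of how such a string expands into BigQuery-style field definitions:

```python
# Expand a comma-separated "name:type" schema string into field definitions.
def parse_schema(schema: str) -> list[dict]:
    fields = []
    for column in schema.split(","):
        name, col_type = column.split(":")
        fields.append({"name": name, "type": col_type.upper()})
    return fields

lite_schema = ("filter_set_name:string,type:string,location:integer,"
               "ref:string,alt:string,calibration_sensitivity:float,"
               "culprit:string,training_label:string,yng_status:string")
fields = parse_schema(lite_schema)
assert {"name": "calibration_sensitivity", "type": "FLOAT"} in fields
assert all(f["name"] != "vqslod" for f in fields)  # dropped for VQSR Lite
```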
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractCallset.wdl
@@ -348,7 +348,7 @@ task ExtractTask {
echo ~{interval_index},${OUTPUT_FILE_DEST},${OUTPUT_FILE_BYTES},${OUTPUT_FILE_INDEX_DEST},${OUTPUT_FILE_INDEX_BYTES} >> manifest.txt
>>>
runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae"
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_7274e012706cb2fa15ed3fb1e12d7e9ae28aa4a1"
memory: "12 GB"
disks: "local-disk 150 HDD"
bootDiskSizeGb: 15
3 changes: 1 addition & 2 deletions scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
@@ -111,11 +111,10 @@ task PrepareRangesCallsetTask {
>>>
output {
String fq_cohort_extract_table_prefix = "~{fq_destination_dataset}.~{destination_cohort_table_prefix}" # implementation detail of create_ranges_cohort_extract_data_table.py
String fq_cohort_extract_table_prefix = "~{fq_destination_dataset}.~{destination_cohort_table_prefix}" # implementation detail of create_ranges_cohort_extract_data_table.py
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2023-01-23-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2023-02-21-alpine"
memory: "3 GB"
disks: "local-disk 100 HDD"
bootDiskSizeGb: 15
31 changes: 20 additions & 11 deletions scripts/variantstore/wdl/GvsWarpTasks.wdl
@@ -53,12 +53,15 @@ task SNPsVariantRecalibratorCreateModel {
--maximum-training-variants ~{maximum_training_variants} \
--output-model ~{model_report_filename} \
--max-gaussians ~{max_gaussians} \
-resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \
-resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \
-resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \
-resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf}
-resource:hapmap,known=false,training=true,truth=true,prior=0 ~{hapmap_resource_vcf} \
-resource:omni,known=false,training=true,truth=true,prior=0 ~{omni_resource_vcf} \
-resource:1000G,known=false,training=true,truth=false,prior=0 ~{one_thousand_genomes_resource_vcf} \
-resource:dbsnp,known=true,training=false,truth=false,prior=0 ~{dbsnp_resource_vcf}
>>>

## TODO ## You have set the priors above to 0 for doing precision and sensitivity
## YOU MUST UNDO THIS PRIOR TO MERGING TO ah_var_store

runtime {
memory: "~{machine_mem} GiB"
cpu: "2"
@@ -189,11 +192,14 @@ task IndelsVariantRecalibrator {
~{"--input-model " + model_report} \
--max-gaussians ~{max_gaussians} \
--maximum-training-variants ~{maximum_training_variants} \
-resource:mills,known=false,training=true,truth=true,prior=12 ~{mills_resource_vcf} \
-resource:axiomPoly,known=false,training=true,truth=false,prior=10 ~{axiomPoly_resource_vcf} \
-resource:dbsnp,known=true,training=false,truth=false,prior=2 ~{dbsnp_resource_vcf}
-resource:mills,known=false,training=true,truth=true,prior=0 ~{mills_resource_vcf} \
-resource:axiomPoly,known=false,training=true,truth=false,prior=0 ~{axiomPoly_resource_vcf} \
-resource:dbsnp,known=true,training=false,truth=false,prior=0 ~{dbsnp_resource_vcf}
>>>

## TODO ## You have set the priors above to 0 for doing precision and sensitivity
## YOU MUST UNDO THIS PRIOR TO MERGING TO ah_var_store

runtime {
memory: "~{machine_mem} GiB"
cpu: "2"
@@ -268,12 +274,15 @@ task SNPsVariantRecalibrator {
--sample-every-Nth-variant 1 \
~{model_report_arg} \
--max-gaussians ~{max_gaussians} \
-resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \
-resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \
-resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \
-resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf}
-resource:hapmap,known=false,training=true,truth=true,prior=0 ~{hapmap_resource_vcf} \
-resource:omni,known=false,training=true,truth=true,prior=0 ~{omni_resource_vcf} \
-resource:1000G,known=false,training=true,truth=false,prior=0 ~{one_thousand_genomes_resource_vcf} \
-resource:dbsnp,known=true,training=false,truth=false,prior=0 ~{dbsnp_resource_vcf}
>>>

## TODO ## You have set the priors above to 0 for doing precision and sensitivity
## YOU MUST UNDO THIS PRIOR TO MERGING TO ah_var_store

runtime {
memory: "~{machine_mem} GiB"
cpu: 2
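For context on why the zeroed priors above are flagged so loudly: VariantRecalibrator's -resource priors are Phred-scaled confidences that variants in the resource are true positives, so prior=0 asserts no confidence at all in the training resources. A small conversion sketch (not PR code) makes the difference concrete:

```python
# Convert a Phred-scaled resource prior Q to the implied probability
# that a variant in that resource is a true positive: P = 1 - 10^(-Q/10).
def phred_prior_to_probability(q: float) -> float:
    return 1.0 - 10.0 ** (-q / 10.0)

for q in (15, 12, 10, 7, 2, 0):
    print(f"prior={q:>2} -> P(true) ~= {phred_prior_to_probability(q):.4f}")
# prior=15 -> 0.9684, prior=12 -> 0.9369, ..., prior=0 -> 0.0000
```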
66 changes: 66 additions & 0 deletions scripts/variantstore/wdl/extract/add_max_as_vqs_sens.py
@@ -0,0 +1,66 @@
import sys
import gzip

# Add new header for MAX_AS_VQS_SENS

with gzip.open(sys.argv[1], 'rt') as file1:
for line in file1:
line = line.strip()

if "##" in line:
print(line)
continue

if "#CHROM" in line:
print('##INFO=<ID=MAX_AS_VQS_SENS,Number=1,Type=Float,Description="Maximum of AS_VQS_SENS scores">')
print(line)
continue

parts = line.split("\t")

# strip out hard filtered sites, so vcfeval can use "all-records" to plot the ROC curves
if ("ExcessHet" in parts[6] or "LowQual" in parts[6] or "NO_HQ_GENOTYPES" in parts[6]):
continue

info = parts[7]
d = dict([ tuple(x.split("=")) for x in info.split(";") if "=" in x])

format_key = parts[8].split(":")
sample_data = dict(zip(format_key, parts[9].split(":")))

gt = sample_data['GT']

if gt == "0/0" or gt == "./.":
continue

if 'FT' in sample_data:
ft = sample_data['FT']

# if there is a non-passing FT value
if not (ft == "PASS" or ft == "."):

# overwrite FILTER if it was PASS or "."
if (parts[6] == "PASS" or parts[6] == "."):
parts[6] = ft
# otherwise append it to the end
else:
parts[6] = parts[6] + "," + ft

if "AS_VQS_SENS" not in d:
sys.exit(f"Cannot find AS_VQS_SENS in {line}")
else:
if "," in d['AS_VQS_SENS']:
pieces = [x for x in d['AS_VQS_SENS'].split(",") if (x != "." and x != "NaN") ]

if (len(pieces) == 1):
m = pieces[0]
elif (len(pieces) == 0):
m = "."
else:
m = max([float(x) for x in pieces])
else:
m = d['AS_VQS_SENS']

parts[7] = f"{info};MAX_AS_VQS_SENS={m}"
print("\t".join(parts))