Change drop_state to NONE for Ingest/Extract [VS-607] #8000

Merged · 7 commits · Aug 24, 2022

.dockstore.yml (7 changes: 5 additions & 2 deletions)

@@ -119,14 +119,15 @@ workflows:
branches:
- master
- ah_var_store
- vs_447_fixup_non_fq_invocations
- rsa_vs_607_drop_state
- name: GvsImportGenomes
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsImportGenomes.wdl
filters:
branches:
- master
- ah_var_store
- rsa_vs_607_drop_state
- name: GvsPrepareRangesCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
@@ -168,7 +169,7 @@ workflows:
branches:
- master
- ah_var_store
- kc_variant_search_extract_wdl
- rsa_vs_607_drop_state
- name: GvsWithdrawSamples
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsWithdrawSamples.wdl
@@ -183,6 +184,7 @@ workflows:
branches:
- master
- ah_var_store
- rsa_vs_607_drop_state
- name: GvsJointVariantCalling
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -212,6 +214,7 @@ workflows:
branches:
- master
- ah_var_store
- rsa_vs_607_drop_state
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
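
Context for reviewers: each workflow's `filters.branches` list in `.dockstore.yml` names the git branches Dockstore registers workflow versions from, so adding `rsa_vs_607_drop_state` makes these WDLs runnable from the feature branch while the PR is open. A minimal sketch of the assumed matching rule, as illustrative Python rather than Dockstore's implementation (real filters may also support regexes):

```python
# Illustrative only: Dockstore-style branch gating, assuming exact-name matching.
def should_register(branch: str, filter_branches: list[str]) -> bool:
    """A workflow version is published only for branches named in its filter."""
    return branch in filter_branches

filters = ["master", "ah_var_store", "rsa_vs_607_drop_state"]
assert should_register("rsa_vs_607_drop_state", filters)                # added by this PR
assert not should_register("vs_447_fixup_non_fq_invocations", filters)  # removed by this PR
```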

scripts/variantstore/AOU_DELIVERABLES.md (2 changes: 1 addition & 1 deletion)

@@ -56,7 +56,7 @@
- It will need to be run twice, once with `control_samples` set to "true" (the default value is `false`). See the [naming conventions doc](https://docs.google.com/document/d/1pNtuv7uDoiOFPbwe4zx5sAGH7MyxwKqXkyrpNmBxeow) for guidance on what to use for `extract_table_prefix` (the cohort prefix), which you will need to keep track of for the `GvsExtractCallset` WDL.
- This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
8. `GvsExtractCallset` workflow
- This workflow extracts the data in BigQuery and transforms it into a sharded, joint-called VCF incorporating the VQSR filter set data.
- This workflow extracts the data in BigQuery and transforms it into a sharded, joint-called VCF incorporating the VQSR filter set data. We will probably not run this on callsets of more than 100K samples.
- It also needs to be run twice, once with `control_samples` set to "true" and with the `filter_set_name` and `extract_table_prefix` from steps 5 and 6. Include a valid (and secure) `output_gcs_dir` parameter, which is where the VCF, interval list, manifest, and sample name list files will go.
- This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
9. **TBD VDS Prepare WDL/notebook/??**
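
Since `GvsExtractCallset` does not use the Terra Data Entity Model, its inputs arrive as a JSON file via the `Run workflow with inputs defined by file paths` option. A hedged sketch of assembling such a file: the input names come from the doc above, the `GvsExtractCallset.` key prefix follows the usual Cromwell convention, and every value is a placeholder.

```python
# Sketch only: write a Cromwell/Terra-style inputs JSON for GvsExtractCallset.
# All values below are placeholders; adjust for your own project before use.
import json

inputs = {
    "GvsExtractCallset.control_samples": True,              # doc: run once with "true", once with the default "false"
    "GvsExtractCallset.filter_set_name": "my_filter_set",   # from the filter-set creation step
    "GvsExtractCallset.extract_table_prefix": "my_cohort",  # the cohort prefix you have been tracking
    "GvsExtractCallset.output_gcs_dir": "gs://my-secure-bucket/extract",  # VCFs, interval list, manifest, sample names
    "GvsExtractCallset.drop_state": "NONE",                 # this PR's new default, shown for explicitness
}

with open("GvsExtractCallset.inputs.json", "w") as f:
    json.dump(inputs, f, indent=2)
```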

scripts/variantstore/wdl/GvsExtractCallset.wdl (7 changes: 5 additions & 2 deletions)

@@ -20,6 +20,9 @@ workflow GvsExtractCallset {
Int? scatter_count
Boolean zero_pad_output_vcf_filenames = true

# set to "NONE" if all the reference data was loaded into GVS in GvsImportGenomes
String drop_state = "NONE"

File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed"
File gatk_override = "gs://gvs_quickstart_storage/jars/gatk-package-4.2.0.0-552-g0f9780a-SNAPSHOT-local.jar"
@@ -154,7 +157,7 @@ workflow GvsExtractCallset {
fq_filter_set_tranches_table = fq_filter_set_tranches_table,
filter_set_name = filter_set_name,
filter_set_name_verified = select_first([ValidateFilterSetName.done, "done"]),
drop_state = "FORTY",
drop_state = drop_state,
output_file = vcf_filename,
output_gcs_dir = output_gcs_dir,
max_last_modified_timestamp = GetBQTablesMaxLastModifiedTimestamp.max_last_modified_timestamp,
@@ -386,7 +389,7 @@ task SumBytes {

command <<<
set -e
echo "~{sep=" " file_sizes_bytes}" | tr " " "\n" | python -c "
echo "~{sep=" " file_sizes_bytes}" | tr " " "\n" | python3 -c "
import sys;
total_bytes = sum(float(i.strip()) for i in sys.stdin);
total_mb = total_bytes/10**6;
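
The `SumBytes` hunk above swaps `python` for `python3`; the inline script itself is cut off by the collapsed diff. A standalone equivalent of the visible portion (the final `print` is an assumption, since the diff truncates after `total_mb`):

```python
# Standalone sketch of SumBytes' inline script: sum newline-separated byte
# counts from stdin and report decimal megabytes (the task divides by 10**6).
import sys

total_bytes = sum(float(line.strip()) for line in sys.stdin if line.strip())
total_mb = total_bytes / 10**6
print(total_mb)  # assumed output step
```

Saved as `sum_bytes.py`, the task's pipeline can be mirrored with `echo "1000000 2500000" | tr " " "\n" | python3 sum_bytes.py`, which prints `3.5`.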

scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl (3 changes: 3 additions & 0 deletions)

@@ -28,6 +28,8 @@ workflow GvsExtractCohortFromSampleNames {
Int scatter_count

String? output_gcs_dir
# set to "NONE" if all the reference data was loaded into GVS in GvsImportGenomes
String drop_state = "NONE"

Int? extract_preemptible_override
Int? extract_maxretries_override
@@ -79,6 +81,7 @@ workflow GvsExtractCohortFromSampleNames {
output_file_base_name = output_file_base_name,
output_gcs_dir = output_gcs_dir,

drop_state = drop_state,
extract_preemptible_override = extract_preemptible_override,
extract_maxretries_override = extract_maxretries_override,
split_intervals_disk_size_override = split_intervals_disk_size_override,

scripts/variantstore/wdl/GvsImportGenomes.wdl (5 changes: 4 additions & 1 deletion)

@@ -15,6 +15,9 @@ workflow GvsImportGenomes {

Boolean skip_loading_vqsr_fields = false

# set to "NONE" to ingest all the reference data into GVS for VDS (instead of VCF) output
String drop_state = "NONE"

File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
Int? load_data_batch_size
Int? load_data_preemptible_override
@@ -94,7 +97,7 @@ workflow GvsImportGenomes {
dataset_name = dataset_name,
project_id = project_id,
skip_loading_vqsr_fields = skip_loading_vqsr_fields,
drop_state = "FORTY",
drop_state = drop_state,
drop_state_includes_greater_than = false,
input_vcf_indexes = read_lines(CreateFOFNs.vcf_batch_vcf_index_fofns[i]),
input_vcfs = read_lines(CreateFOFNs.vcf_batch_vcf_fofns[i]),
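
This is the heart of the PR: `LoadData` previously hard-coded `drop_state = "FORTY"` and now takes the workflow-level input, which defaults to `"NONE"` so that all reference data is ingested (needed for VDS rather than VCF output). A rough mental model of the assumed semantics, as illustrative Python rather than the GATK tool's actual implementation (GQ state names other than `"FORTY"` and `"NONE"` are guesses):

```python
# Illustrative only: an assumed model of the reference-block filtering
# behind drop_state; not the actual ingest implementation.
GQ_STATES = ["TEN", "TWENTY", "THIRTY", "FORTY", "FIFTY", "SIXTY"]  # assumed GQ bins

def keep_ref_block(block_state: str, drop_state: str, includes_greater_than: bool) -> bool:
    """Return True if a reference block should be written during ingest."""
    if drop_state == "NONE":
        return True  # the new default: every reference block is ingested
    if includes_greater_than:
        # drop the named GQ state and every state above it
        return GQ_STATES.index(block_state) < GQ_STATES.index(drop_state)
    # the workflow passes drop_state_includes_greater_than = false:
    # drop only blocks binned at exactly drop_state
    return block_state != drop_state

# Old hard-coded behavior dropped GQ40 reference blocks; the new default keeps them.
assert not keep_ref_block("FORTY", "FORTY", includes_greater_than=False)
assert keep_ref_block("FORTY", "NONE", includes_greater_than=False)
```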

scripts/variantstore/wdl/GvsQuickstartIntegration.wdl (2 changes: 2 additions & 0 deletions)

@@ -49,6 +49,7 @@ workflow GvsQuickstartIntegration {
]

Int? extract_scatter_count
String drop_state = "NONE"
}
String project_id = "gvs-internal"

@@ -73,6 +74,7 @@
# Force filtering off as it is not deterministic and the initial version of this integration test does not
# allow for inexact matching of actual and expected results.
extract_do_not_filter_override = true,
drop_state = drop_state
}

call AssertIdenticalOutputs {

scripts/variantstore/wdl/GvsUnified.wdl (8 changes: 6 additions & 2 deletions)

@@ -25,6 +25,8 @@ workflow GvsUnified {
Array[File] input_vcf_indexes
File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"

# set to "NONE" to ingest all the reference data into GVS for VDS (instead of VCF) output
String drop_state = "NONE"

# The larger the `load_data_batch_size`, the greater the probability of preemptions and non-retryable
# BigQuery errors, so if specifying this adjust preemptible and maxretries accordingly. Or just take the defaults,
@@ -93,7 +95,8 @@ workflow GvsUnified {
load_data_preemptible_override = load_data_preemptible_override,
load_data_maxretries_override = load_data_maxretries_override,
load_data_gatk_override = gatk_override,
load_data_batch_size = load_data_batch_size
load_data_batch_size = load_data_batch_size,
drop_state = drop_state
}

call CreateAltAllele.GvsPopulateAltAllele {
@@ -155,7 +158,8 @@ workflow GvsUnified {
output_gcs_dir = extract_output_gcs_dir,
split_intervals_disk_size_override = split_intervals_disk_size_override,
split_intervals_mem_override = split_intervals_mem_override,
do_not_filter_override = extract_do_not_filter_override
do_not_filter_override = extract_do_not_filter_override,
drop_state = drop_state
}

output {