Ingest Error Handling Fixes [VS-261] #7841

Merged: 2 commits, May 13, 2022
8 changes: 8 additions & 0 deletions .dockstore.yml
@@ -203,6 +203,14 @@ workflows:
- master
- ah_var_store
- vs_357_quickstart_integration
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
filters:
branches:
- master
- ah_var_store
- vs_261_ingest_errors
- name: MitochondriaPipeline
subclass: WDL
primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
2 changes: 2 additions & 0 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -83,6 +83,7 @@ workflow GvsImportGenomes {

output {
Boolean done = true
Array[File] load_data_stderrs = LoadData.stderr
}
}

@@ -288,6 +289,7 @@ task LoadData {
}
output {
Boolean done = true
File stderr = stderr()
}
}

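For reference, the WDL pattern these two additions rely on: a task output defined as stderr(), gathered through a scatter, surfaces as an Array[File] at the workflow level. Below is a minimal sketch with hypothetical workflow and task names (GvsImportGenomes presumably calls LoadData inside a scatter in the unexpanded part of this file):

version 1.0

workflow ScatterStderrExample {
    input {
        Array[String] shards
    }

    scatter (shard in shards) {
        call DoWork { input: shard = shard }
    }

    output {
        # One stderr file per scatter shard, analogous to load_data_stderrs above.
        Array[File] work_stderrs = DoWork.stderr_file
    }
}

task DoWork {
    input {
        String shard
    }
    command <<<
        echo "processing ~{shard}" >&2
    >>>
    output {
        File stderr_file = stderr()
    }
}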
151 changes: 151 additions & 0 deletions scripts/variantstore/wdl/GvsIngestTieout.wdl
@@ -0,0 +1,151 @@
version 1.0

import "GvsAssignIds.wdl" as GvsAssignIds
import "GvsImportGenomes.wdl" as GvsImportGenomes
import "GvsUtils.wdl" as GvsUtils

workflow GvsIngestTieout {
input {
String project
String reference_dataset_name
String branch_name
Array[String] sample_names
Array[File] input_vcfs
Array[File] input_vcf_indexes
String? service_account_json_path
}

call GvsUtils.BuildGATKJarAndCreateDataset {
input:
branch_name = branch_name,
dataset_prefix = "ingest_tieout"
}

call GvsAssignIds.GvsAssignIds {
input:
project_id = project,
dataset_name = BuildGATKJarAndCreateDataset.dataset_name,
external_sample_names = sample_names,
assign_ids_gatk_override = BuildGATKJarAndCreateDataset.jar,
service_account_json_path = service_account_json_path
}

call GvsImportGenomes.GvsImportGenomes {
input:
go = GvsAssignIds.done,
dataset_name = BuildGATKJarAndCreateDataset.dataset_name,
project_id = project,
external_sample_names = sample_names,
input_vcfs = input_vcfs,
input_vcf_indexes = input_vcf_indexes,
load_data_gatk_override = BuildGATKJarAndCreateDataset.jar,
service_account_json_path = service_account_json_path
}

call IngestTieout {
input:
dataset_name = BuildGATKJarAndCreateDataset.dataset_name,
reference_dataset_name = reference_dataset_name,
project = project,
stderrs = GvsImportGenomes.load_data_stderrs
}
}


task IngestTieout {
input {
Boolean? go
String dataset_name
String reference_dataset_name
String project
Array[File] stderrs
}

parameter_meta {
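# localization_optional tells Cromwell it may pass these inputs through as gs:// paths rather
# than downloading them; the command below copies only the logs it needs with gsutil.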
stderrs: {
localization_optional: true
}
}

command <<<
set -o xtrace
fail=0

check_table() {
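# Compare per-sample row counts for ${table_name} between the newly ingested dataset and the
# reference dataset; any sample whose counts differ, or that appears on only one side of the
# full outer join, is written to differences.txt and marks the run as failed.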
local table_name=$1

bq query --location=US --project_id=~{project} --format=csv --use_legacy_sql=false \
"select actual.sample_id, expected.sample_id from
(select sample_id, count(*) as count from \`spec-ops-aou.~{dataset_name}.${table_name}\` group by sample_id) actual full outer join
(select sample_id, count(*) as count from \`spec-ops-aou.~{reference_dataset_name}.${table_name}\` group by sample_id) expected on actual.sample_id = expected.sample_id
where actual.count != expected.count OR actual.sample_id is null OR expected.sample_id is null" > differences.txt

if [[ -s differences.txt ]]; then
fail=1
echo "${table_name} row counts are mismatched for the following samples:"
cat differences.txt
fi
}

# This task is currently being called with a sample set of 2000 so it only needs to check a single vet and
# ref_ranges table each. If this code were to be called with a sample set of more than 4000 it should test all
# the additional vet and ref_ranges tables that would be introduced.
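# A hypothetical sketch of that generalization (assuming 4000 samples per vet/ref_ranges table):
#     num_tables=$(( ( ~{length(stderrs)} + 3999 ) / 4000 ))
#     for i in $(seq -f '%03g' 1 "${num_tables}"); do check_table "ref_ranges_${i}"; check_table "vet_${i}"; done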
if [[ ~{length(stderrs)} -gt 4000 ]]; then
echo "IngestTieout invoked with a sample set of size ~{length(stderrs)} but is currently limited to sample sets no larger than 4000."
exit 1
fi

check_table "ref_ranges_001"
check_table "vet_001"

mkdir logs
cd logs

# Get everything up to and including the /call-LoadData/ part of the GCS path.
gcs_prefix=$(echo ~{stderrs[0]} | sed -E 's!(.*/call-LoadData/).*!\1!')
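# e.g. (hypothetical path) gs://bucket/workflow-id/call-LoadData/shard-0/stderr -> gs://bucket/workflow-id/call-LoadData/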

# Recursively copy everything under this path.
gsutil -m cp -r ${gcs_prefix} .

# Find the stderr files in the call execution directories but not the ones in the pipelines-logs directories.
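# (-prune keeps find from descending into pipelines-logs directories; -print fires only for the stderr matches.)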
find call-LoadData -name pipelines-logs -prune -o -name stderr -print > stderr_fofn.txt
echo "Found $(wc -l stderr_fofn.txt | awk '{print $1}') stderr files in call directories"

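# The ingest run under test is expected to have hit, and recovered from, transient BigQuery
# StatusRuntimeExceptions; if none show up in the stderr logs, the retry/error-handling behavior
# this PR exercises was never actually triggered.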
cat stderr_fofn.txt | xargs grep StatusRuntimeException | tee exceptions.txt
# $? here would reflect tee's exit status, so check the xargs/grep stage of the pipeline instead.
if [[ ${PIPESTATUS[1]} -ne 0 ]]; then
fail=1
echo "Did not find any StatusRuntimeExceptions among the stderr files!"
else
grep UNAVAILABLE exceptions.txt
if [[ $? -ne 0 ]]; then
fail=1
echo "No UNAVAILABLE StatusRuntimeExceptions found for run!"
fi
grep -E 'ABORTED|INTERNAL|CANCELLED' exceptions.txt
if [[ $? -ne 0 ]]; then
fail=1
echo "No retryable StatusRuntimeExceptions (ABORTED, INTERNAL, CANCELLED) found for run!"
fi
fi

cd ..

if [[ $fail -ne 0 ]]; then
exit 1;
fi
>>>

output {
File stderr_fofn = "logs/stderr_fofn.txt"
File exceptions = "logs/exceptions.txt"
Boolean done = true
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:latest"
memory: "14 GB"
disks: "local-disk 2000 HDD"
preemptible: 3
cpu: 4
}
}
67 changes: 6 additions & 61 deletions scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
@@ -1,6 +1,7 @@
version 1.0

import "GvsUnified.wdl" as GvsUnified
+import "GvsUtils.wdl" as GvsUtils

workflow GvsQuickstartIntegration {

@@ -48,17 +49,18 @@ workflow GvsQuickstartIntegration {
]
}

-call Setup {
+call GvsUtils.BuildGATKJarAndCreateDataset {
input:
-branch_name = branch_name
+branch_name = branch_name,
+dataset_prefix = "quickit"
}

call GvsUnified.GvsUnified {
input:
-dataset_name = Setup.dataset_name,
+dataset_name = BuildGATKJarAndCreateDataset.dataset_name,
project_id = "spec-ops-aou",
external_sample_names = external_sample_names,
-gatk_override = Setup.jar,
+gatk_override = BuildGATKJarAndCreateDataset.jar,
input_vcfs = input_vcfs,
input_vcf_indexes = input_vcf_indexes,
filter_set_name = "quickit",
@@ -85,63 +87,6 @@ workflow GvsQuickstartIntegration {
}
}

-task Setup {
[Review comment from the PR author] This is almost completely a cut/paste of a task from the quickstart integration test into GvsUtils.wdl so it could be reused.

-input {
-String branch_name
-}
-
-command <<<
-# Much of this could/should be put into a Docker image which would be useful not only for integration test runs
-# but also for building nightly GATK jars. This wouldn't be that valuable for this current increment of work as
-# using a Docker image would save maybe 10 minutes from what is currently a ~4 hour workflow, but a Docker image
-# could become more compelling if a scaled down version of this test that could run more frequently was created,
-# in addition to the nightly build use case mentioned above.
-set -o errexit -o nounset -o pipefail
-
-# git and git-lfs
-apt-get -qq update
-apt-get -qq install git git-lfs
-
-# Java
-apt-get -qq install wget apt-transport-https gnupg
-wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add -
-echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list
-apt-get -qq update
-apt -qq install -y temurin-11-jdk
-
-# GATK
-git clone https://github.com/broadinstitute/gatk.git --depth 1 --branch ~{branch_name} --single-branch
-cd gatk
-./gradlew shadowJar
-
-branch=$(git symbolic-ref HEAD 2>/dev/null)
-branch=${branch#refs/heads/}
-
-hash=$(git rev-parse --short HEAD)
-
-# Rename the GATK jar to embed the branch and hash of the most recent commit on the branch.
-mv build/libs/gatk-package-unspecified-SNAPSHOT-local.jar "build/libs/gatk-${branch}-${hash}-SNAPSHOT-local.jar"
-
-# Build a dataset name based on the branch name and the git hash of the most recent commit on this branch.
-# Dataset names must be alphanumeric and underscores only. Convert any dashes to underscores, then delete
-# any remaining characters that are not alphanumeric or underscores.
-dataset="$(echo quickit_${branch}_${hash} | tr '-' '_' | tr -c -d '[:alnum:]_')"
-
-bq mk --project_id="spec-ops-aou" "$dataset"
-
-echo -n "$dataset" > dataset.txt
->>>
-
-output {
-File jar = glob("gatk/build/libs/*-SNAPSHOT-local.jar")[0]
-String dataset_name = read_string("gatk/dataset.txt")
-}
-
-runtime {
-docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:latest"
-disks: "local-disk 500 HDD"
-}
-}

task AssertIdenticalOutputs {
input {
56 changes: 56 additions & 0 deletions scripts/variantstore/wdl/GvsUtils.wdl
@@ -238,3 +238,59 @@ task GetBQTablesMaxLastModifiedTimestamp {
cpu: 1
}
}

task BuildGATKJarAndCreateDataset {
input {
String branch_name
String dataset_prefix
}

command <<<
# Much of this could/should be put into a Docker image.
set -o errexit -o nounset -o pipefail

# git and git-lfs
apt-get -qq update
apt-get -qq install git git-lfs

# Java
apt-get -qq install wget apt-transport-https gnupg
wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add -
echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list
apt-get -qq update
apt -qq install -y temurin-11-jdk

# GATK
git clone https://github.com/broadinstitute/gatk.git --depth 1 --branch ~{branch_name} --single-branch
cd gatk
./gradlew shadowJar

branch=$(git symbolic-ref HEAD 2>/dev/null)
branch=${branch#refs/heads/}

hash=$(git rev-parse --short HEAD)

# Rename the GATK jar to embed the branch and hash of the most recent commit on the branch.
mv build/libs/gatk-package-unspecified-SNAPSHOT-local.jar "build/libs/gatk-${branch}-${hash}-SNAPSHOT-local.jar"

# Build a dataset name based on the branch name and the git hash of the most recent commit on this branch.
# Dataset names must be alphanumeric and underscores only. Convert any dashes to underscores, then delete
# any remaining characters that are not alphanumeric or underscores.
dataset="$(echo ~{dataset_prefix}_${branch}_${hash} | tr '-' '_' | tr -c -d '[:alnum:]_')"
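# e.g. (hypothetical values) dataset_prefix "ingest_tieout", branch "vs-261-fix" and hash "ab12cd3"
# yield a dataset named "ingest_tieout_vs_261_fix_ab12cd3".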

bq mk --project_id="spec-ops-aou" "$dataset"

echo -n "$dataset" > dataset.txt
>>>

output {
Boolean done = true
File jar = glob("gatk/build/libs/*-SNAPSHOT-local.jar")[0]
String dataset_name = read_string("gatk/dataset.txt")
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:latest"
disks: "local-disk 500 HDD"
}
}