fix the check for duplicates in import genomes (#7470)

* new query to detect duplicates * allow more than 100 samples to be returne * document and use vet instead of pet for query
broadinstitute · Sep 17, 2021 · 185b5f4 · 185b5f4
1 parent 44c297a
commit 185b5f4
Showing 1 changed file with 21 additions and 17 deletions.
diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -344,6 +344,7 @@ task CheckForDuplicateData {
   }
 
   String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+  Int num_samples = length(sample_names)
 
 
   command <<<
@@ -368,20 +369,23 @@ task CheckForDuplicateData {
     fi
 
     INFO_SCHEMA_TABLE="~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS"
-    touch duplicates
-    cat ~{write_lines(sample_names)} | sort > sorted_names.txt
+    TEMP_TABLE="~{dataset_name}.sample_dupe_check"
+    SAMPLE_INFO_TABLE="~{dataset_name}.sample_info"
 
-    # Check that the table has been created yet
-    set +e
-    bq show --project_id ~{project_id} $TABLE > /dev/null
-    BQ_SHOW_RC=$?
-    set -e
-    if [ $BQ_SHOW_RC -eq 0 ]; then
-      bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
-        "SELECT s.sample_name FROM ${INFO_SCHEMA_TABLE} p JOIN ~{dataset_name}.sample_info s ON (p.partition_id = CAST(s.sample_id AS STRING)) WHERE table_name like 'pet_%'" | \
-        sed -e '/sample_name/d' | sort > bq_sorted_names.txt
-      comm -12 sorted_names.txt bq_sorted_names.txt > duplicates
-    fi
+    # create a temp table with the sample_names
+    bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING"
+    NAMES_FILE=~{write_lines(sample_names)}
+    bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING"
+
+    # check the INFORMATION_SCHEMA.PARTITIONS table to see if any of input sample names/ids have data loaded into their partitions
+    # this returns the list of sample names that do already have data loaded
+    bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false \
+      "WITH items as (SELECT s.sample_id, s.sample_name FROM ${TEMP_TABLE} t left outer join ${SAMPLE_INFO_TABLE} s on (s.sample_name = t.sample_name)) " \
+      "SELECT i.sample_name FROM ${INFO_SCHEMA_TABLE} p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND table_name like 'pet_%'" | \
+      sed -e '/sample_name/d' > duplicates
+
+      # remove the temp table
+      bq --project_id=~{project_id} rm -f -t ${TEMP_TABLE}
 
     # true if there is data in results
     if [ -s duplicates ]; then
@@ -400,6 +404,7 @@ task CheckForDuplicateData {
   }
   output {
       Boolean done = true
+      File? duplicates = "duplicates"
   }
 }
 
@@ -519,7 +524,7 @@ task CreateImportTsvs {
           if [ ~{call_cache_tsvs} = 'true' ]; then
             echo "Checking for files to call cache"
 
-            declare -a TABLETYPES=("sample_info" "pet" "vet")
+            declare -a TABLETYPES=("pet" "vet")
             ALL_FILES_EXIST='true'
             for TABLETYPE in ${TABLETYPES[@]}; do
                 FILEPATH="~{output_directory}/${TABLETYPE}_tsvs/**${TABLETYPE}_*_${input_vcf_basename}.tsv"
@@ -562,7 +567,6 @@ task CreateImportTsvs {
                 -SNM ~{sample_map} \
                 --ref-version 38
 
-              gsutil -m mv sample_info_*.tsv ~{output_directory}/sample_info_tsvs/
               gsutil -m mv pet_*.tsv ~{output_directory}/pet_tsvs/
               gsutil -m mv vet_*.tsv ~{output_directory}/vet_tsvs/
           fi
@@ -846,9 +850,9 @@ task SetIsLoadedColumn {
 
     echo "project_id = ~{project_id}" > ~/.bigqueryrc
 
-    # set is_loaded to true if there is a corresponding pet table partition with rows for that sample_id
+    # set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id
     bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
-    "UPDATE ~{dataset_name}.sample_info SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from ~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS WHERE partition_id in ('~{sep="\',\'" gvs_id_array}') AND total_logical_bytes > 0 AND table_name LIKE \"pet_%\")"
+    "UPDATE ~{dataset_name}.sample_info SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from ~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS WHERE partition_id in ('~{sep="\',\'" gvs_id_array}') AND total_logical_bytes > 0 AND table_name LIKE \"vet_%\")"
   >>>
 
   runtime {