Skip to content

Commit

Permalink
fix the check for duplicates in import genomes (#7470)
Browse files Browse the repository at this point in the history
* new query to detect duplicates
* allow more than 100 samples to be returne
* document and use vet instead of pet for query
  • Loading branch information
ahaessly authored Sep 17, 2021
1 parent 44c297a commit 185b5f4
Showing 1 changed file with 21 additions and 17 deletions.
38 changes: 21 additions & 17 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ task CheckForDuplicateData {
}
String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
Int num_samples = length(sample_names)
command <<<
Expand All @@ -368,20 +369,23 @@ task CheckForDuplicateData {
fi
INFO_SCHEMA_TABLE="~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS"
touch duplicates
cat ~{write_lines(sample_names)} | sort > sorted_names.txt
TEMP_TABLE="~{dataset_name}.sample_dupe_check"
SAMPLE_INFO_TABLE="~{dataset_name}.sample_info"
# Check that the table has been created yet
set +e
bq show --project_id ~{project_id} $TABLE > /dev/null
BQ_SHOW_RC=$?
set -e
if [ $BQ_SHOW_RC -eq 0 ]; then
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"SELECT s.sample_name FROM ${INFO_SCHEMA_TABLE} p JOIN ~{dataset_name}.sample_info s ON (p.partition_id = CAST(s.sample_id AS STRING)) WHERE table_name like 'pet_%'" | \
sed -e '/sample_name/d' | sort > bq_sorted_names.txt
comm -12 sorted_names.txt bq_sorted_names.txt > duplicates
fi
# create a temp table with the sample_names
bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING"
NAMES_FILE=~{write_lines(sample_names)}
bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING"
# check the INFORMATION_SCHEMA.PARTITIONS table to see if any of input sample names/ids have data loaded into their partitions
# this returns the list of sample names that do already have data loaded
bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false \
"WITH items as (SELECT s.sample_id, s.sample_name FROM ${TEMP_TABLE} t left outer join ${SAMPLE_INFO_TABLE} s on (s.sample_name = t.sample_name)) " \
"SELECT i.sample_name FROM ${INFO_SCHEMA_TABLE} p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND table_name like 'pet_%'" | \
sed -e '/sample_name/d' > duplicates
# remove the temp table
bq --project_id=~{project_id} rm -f -t ${TEMP_TABLE}
# true if there is data in results
if [ -s duplicates ]; then
Expand All @@ -400,6 +404,7 @@ task CheckForDuplicateData {
}
output {
Boolean done = true
File? duplicates = "duplicates"
}
}

Expand Down Expand Up @@ -519,7 +524,7 @@ task CreateImportTsvs {
if [ ~{call_cache_tsvs} = 'true' ]; then
echo "Checking for files to call cache"

declare -a TABLETYPES=("sample_info" "pet" "vet")
declare -a TABLETYPES=("pet" "vet")
ALL_FILES_EXIST='true'
for TABLETYPE in ${TABLETYPES[@]}; do
FILEPATH="~{output_directory}/${TABLETYPE}_tsvs/**${TABLETYPE}_*_${input_vcf_basename}.tsv"
Expand Down Expand Up @@ -562,7 +567,6 @@ task CreateImportTsvs {
-SNM ~{sample_map} \
--ref-version 38

gsutil -m mv sample_info_*.tsv ~{output_directory}/sample_info_tsvs/
gsutil -m mv pet_*.tsv ~{output_directory}/pet_tsvs/
gsutil -m mv vet_*.tsv ~{output_directory}/vet_tsvs/
fi
Expand Down Expand Up @@ -846,9 +850,9 @@ task SetIsLoadedColumn {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

# set is_loaded to true if there is a corresponding pet table partition with rows for that sample_id
# set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"UPDATE ~{dataset_name}.sample_info SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from ~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS WHERE partition_id in ('~{sep="\',\'" gvs_id_array}') AND total_logical_bytes > 0 AND table_name LIKE \"pet_%\")"
"UPDATE ~{dataset_name}.sample_info SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from ~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS WHERE partition_id in ('~{sep="\',\'" gvs_id_array}') AND total_logical_bytes > 0 AND table_name LIKE \"vet_%\")"
>>>

runtime {
Expand Down

0 comments on commit 185b5f4

Please sign in to comment.