Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix the check for duplicates in import genomes #7470

Merged
merged 6 commits into from
Sep 17, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ workflows:
branches:
- master
- ah_var_store
- ah_fix_dupes
- name: GvsPrepareCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareCallset.wdl
Expand Down
38 changes: 21 additions & 17 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ task CheckForDuplicateData {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
Int num_samples = length(sample_names)


command <<<
Expand All @@ -368,20 +369,23 @@ task CheckForDuplicateData {
fi

INFO_SCHEMA_TABLE="~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS"
touch duplicates
cat ~{write_lines(sample_names)} | sort > sorted_names.txt
TEMP_TABLE="~{dataset_name}.sample_dupe_check"
SAMPLE_INFO_TABLE="~{dataset_name}.sample_info"

# Check that the table has been created yet
set +e
bq show --project_id ~{project_id} $TABLE > /dev/null
BQ_SHOW_RC=$?
set -e
if [ $BQ_SHOW_RC -eq 0 ]; then
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"SELECT s.sample_name FROM ${INFO_SCHEMA_TABLE} p JOIN ~{dataset_name}.sample_info s ON (p.partition_id = CAST(s.sample_id AS STRING)) WHERE table_name like 'pet_%'" | \
sed -e '/sample_name/d' | sort > bq_sorted_names.txt
comm -12 sorted_names.txt bq_sorted_names.txt > duplicates
fi
# create a temp table with the sample_names
bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING"
NAMES_FILE=~{write_lines(sample_names)}
bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING"

# check the INFORMATION_SCHEMA.PARTITIONS table to see if any of input sample names/ids have data loaded into their partitions
# this returns the list of sample names that do already have data loaded
bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a comment here would be good saying how this works? My take (to see if I got it right) is "Check to see if data has been loaded for any of the provided sample names"?

"WITH items as (SELECT s.sample_id, s.sample_name FROM ${TEMP_TABLE} t left outer join ${SAMPLE_INFO_TABLE} s on (s.sample_name = t.sample_name)) " \
"SELECT i.sample_name FROM ${INFO_SCHEMA_TABLE} p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND table_name like 'pet_%'" | \
ahaessly marked this conversation as resolved.
Show resolved Hide resolved
sed -e '/sample_name/d' > duplicates

# remove the temp table
bq --project_id=~{project_id} rm -f -t ${TEMP_TABLE}

# true if there is data in results
if [ -s duplicates ]; then
Expand All @@ -400,6 +404,7 @@ task CheckForDuplicateData {
}
output {
Boolean done = true
File? duplicates = "duplicates"
}
}

Expand Down Expand Up @@ -519,7 +524,7 @@ task CreateImportTsvs {
if [ ~{call_cache_tsvs} = 'true' ]; then
echo "Checking for files to call cache"

declare -a TABLETYPES=("sample_info" "pet" "vet")
declare -a TABLETYPES=("pet" "vet")
ALL_FILES_EXIST='true'
for TABLETYPE in ${TABLETYPES[@]}; do
FILEPATH="~{output_directory}/${TABLETYPE}_tsvs/**${TABLETYPE}_*_${input_vcf_basename}.tsv"
Expand Down Expand Up @@ -562,7 +567,6 @@ task CreateImportTsvs {
-SNM ~{sample_map} \
--ref-version 38

gsutil -m mv sample_info_*.tsv ~{output_directory}/sample_info_tsvs/
gsutil -m mv pet_*.tsv ~{output_directory}/pet_tsvs/
gsutil -m mv vet_*.tsv ~{output_directory}/vet_tsvs/
fi
Expand Down Expand Up @@ -846,9 +850,9 @@ task SetIsLoadedColumn {

echo "project_id = ~{project_id}" > ~/.bigqueryrc

# set is_loaded to true if there is a corresponding pet table partition with rows for that sample_id
# set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"UPDATE ~{dataset_name}.sample_info SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from ~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS WHERE partition_id in ('~{sep="\',\'" gvs_id_array}') AND total_logical_bytes > 0 AND table_name LIKE \"pet_%\")"
"UPDATE ~{dataset_name}.sample_info SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from ~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS WHERE partition_id in ('~{sep="\',\'" gvs_id_array}') AND total_logical_bytes > 0 AND table_name LIKE \"vet_%\")"
>>>

runtime {
Expand Down