VS-694 - Extract Callset for VQSR Lite #8182

Merged 18 commits on Feb 15, 2023
2 changes: 0 additions & 2 deletions .dockstore.yml
@@ -95,7 +95,6 @@ workflows:
branches:
- master
- ah_var_store
- VS-693_VQSR_lite
- name: GvsPopulateAltAllele
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
@@ -151,7 +150,6 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-785_RegenerateTheVATTsv
- name: GvsValidateVat
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl
scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl
@@ -13,7 +13,7 @@ workflow GvsValidateVat {

call Utils.GetBQTableLastModifiedDatetime {
input:
query_project = project_id,
project_id = project_id,
fq_table = fq_vat_table
}

4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsCallsetStatistics.wdl
@@ -15,8 +15,8 @@ workflow GvsCallsetStatistics {

call Utils.ValidateFilterSetName {
input:
data_project = project_id,
dataset_name = dataset_name,
project_id = project_id,
fq_filter_set_info_table = "~{project_id}.~{dataset_name}.filter_set_info",
filter_set_name = filter_set_name
}
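Reviewer note: ValidateFilterSetName now takes the fully qualified filter_set_info table directly rather than assembling it from data_project and dataset_name. The task body is outside this diff, so the exact check is an assumption, but it presumably runs a query of roughly this shape (project, dataset, and set names below are illustrative):

    bq query --project_id="my-project" --use_legacy_sql=false \
      'SELECT COUNT(*) FROM `my-project.my_dataset.filter_set_info`
       WHERE filter_set_name = "my_filter_set"'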

79 changes: 51 additions & 28 deletions scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -56,7 +56,7 @@ workflow GvsCreateFilterSet {
String fq_sample_table = "~{project_id}.~{dataset_name}.sample_info"
String fq_alt_allele_table = "~{project_id}.~{dataset_name}.alt_allele"
String fq_info_destination_table = "~{project_id}.~{dataset_name}.filter_set_info"
String fq_info_destination_table_vqsr_lite = "~{project_id}.~{dataset_name}.vqsr_lite_filter_set_info"
String fq_info_destination_table_vqsr_lite = "~{project_id}.~{dataset_name}.filter_set_info_vqsr_lite"
String fq_tranches_destination_table = "~{project_id}.~{dataset_name}.filter_set_tranches"
String fq_filter_sites_destination_table = "~{project_id}.~{dataset_name}.filter_set_sites"

@@ -65,7 +65,7 @@

call Utils.GetBQTableLastModifiedDatetime as SamplesTableDatetimeCheck {
input:
query_project = project_id,
project_id = project_id,
fq_table = fq_sample_table
}

@@ -93,7 +93,7 @@

call Utils.GetBQTableLastModifiedDatetime as AltAlleleTableDatetimeCheck {
input:
query_project = project_id,
project_id = project_id,
fq_table = fq_alt_allele_table
}

@@ -111,7 +111,7 @@
alt_allele_table_timestamp = AltAlleleTableDatetimeCheck.last_modified_timestamp,
excess_alleles_threshold = 1000000,
output_file = "${filter_set_name}_${i}.vcf.gz",
query_project = project_id,
project_id = project_id,
dataset_id = dataset_name,
call_set_identifier = call_set_identifier
}
@@ -158,17 +158,40 @@
preemptible_tries = 3,
}

call PopulateFilterSetInfo {
# These calls to SelectVariants are being added for two reasons:
# 1) The snps_variant_scored_vcf and indels_variant_scored_vcf output by JointVcfFiltering contain ALL variants,
# but currently ONLY the SNPs and INDELs, respectively, are annotated.
# 2) Those output VCFs also contain filtered sites (sites at which the FILTER field is set to anything other than '.' or 'PASS'),
# which we don't want to put into the filter_set_info_vqsr_lite table.
call Utils.SelectVariants as CreateFilteredScoredSNPsVCF {
input:
input_vcf = MergeSNPScoredVCFs.output_vcf,
input_vcf_index = MergeSNPScoredVCFs.output_vcf_index,
type_to_include = "SNP",
exclude_filtered = true,
output_basename = "${filter_set_name}.filtered.scored.snps"
}

call Utils.SelectVariants as CreateFilteredScoredINDELsVCF {
input:
input_vcf = MergeINDELScoredVCFs.output_vcf,
input_vcf_index = MergeINDELScoredVCFs.output_vcf_index,
type_to_include = "INDEL",
exclude_filtered = true,
output_basename = "${filter_set_name}.filtered.scored.indels"
}

call PopulateFilterSetInfo {
input:
gatk_override = gatk_override,
filter_set_name = filter_set_name,
snp_recal_file = MergeSNPScoredVCFs.output_vcf,
snp_recal_file_index = MergeSNPScoredVCFs.output_vcf_index,
indel_recal_file = MergeINDELScoredVCFs.output_vcf,
indel_recal_file_index = MergeINDELScoredVCFs.output_vcf_index,
snp_recal_file = CreateFilteredScoredSNPsVCF.output_vcf,
snp_recal_file_index = CreateFilteredScoredSNPsVCF.output_vcf_index,
indel_recal_file = CreateFilteredScoredINDELsVCF.output_vcf,
indel_recal_file_index = CreateFilteredScoredINDELsVCF.output_vcf_index,
fq_info_destination_table = fq_info_destination_table_vqsr_lite,
filter_schema = fq_info_destination_table_vqsr_lite_schema,
query_project = project_id,
project_id = project_id,
useClassic = false
}
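Reviewer note: the internals of Utils.SelectVariants are not part of this diff, so the mapping of its inputs to GATK flags is an assumption, but the underlying command is presumably along these lines (file names illustrative):

    gatk SelectVariants \
      -V merged.scored.snps.vcf.gz \
      --select-type-to-include SNP \
      --exclude-filtered \
      -O my_filter_set.filtered.scored.snps.vcf.gz

Here --exclude-filtered drops every site whose FILTER field is set to anything other than '.' or PASS, which is the second reason given in the comment above.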

@@ -179,7 +202,7 @@
sites_only_variant_filtered_vcf = MergeVCFs.output_vcf,
sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index,
fq_filter_sites_destination_table = fq_filter_sites_destination_table,
query_project = project_id
project_id = project_id
}
}

@@ -309,7 +332,7 @@
indel_recal_file_index = IndelsVariantRecalibrator.recalibration_index,
fq_info_destination_table = fq_info_destination_table,
filter_schema = fq_info_destination_table_schema,
query_project = project_id,
project_id = project_id,
useClassic = true
}

@@ -320,7 +343,7 @@
sites_only_variant_filtered_vcf = MergeVCFs.output_vcf,
sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index,
fq_filter_sites_destination_table = fq_filter_sites_destination_table,
query_project = project_id
project_id = project_id
}

call PopulateFilterSetTranches as PopulateFilterSetTranchesClassic {
@@ -330,7 +353,7 @@
snp_recal_tranches = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches]),
indel_recal_tranches = IndelsVariantRecalibrator.tranches,
fq_tranches_destination_table = fq_tranches_destination_table,
query_project = project_id
project_id = project_id
}
}

@@ -346,7 +369,7 @@

task ExtractFilterTask {
input {
String query_project
String project_id
String dataset_id
String call_set_identifier

@@ -393,7 +416,7 @@ task ExtractFilterTask {
~{"--excess-alleles-threshold " + excess_alleles_threshold} \
-L ~{intervals} \
--dataset-id ~{dataset_id} \
--project-id ~{query_project} \
--project-id ~{project_id} \
--cost-observability-tablename ~{cost_observability_tablename} \
--call-set-identifier ~{call_set_identifier} \
--wdl-step GvsCreateFilterSet \
@@ -402,7 +425,7 @@
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2022_10_17_2a8c210ac35094997603259fa1cd784486b92e42"
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae"
memory: "7 GB"
disks: "local-disk 10 HDD"
bootDiskSizeGb: 15
@@ -429,7 +452,7 @@ task PopulateFilterSetInfo {
File indel_recal_file
File indel_recal_file_index

String query_project
String project_id

File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh"
File? gatk_override
@@ -445,7 +468,7 @@

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

echo "Creating SNPs reacalibration file"
echo "Creating SNPs recalibration file"
gatk --java-options "-Xmx1g" \
CreateFilteringFiles \
--ref-version 38 \
@@ -455,7 +478,7 @@
-V ~{snp_recal_file} \
-O ~{filter_set_name}.snps.recal.tsv

echo "Creating INDELs reacalibration file"
echo "Creating INDELs racalibration file"
gatk --java-options "-Xmx1g" \
CreateFilteringFiles \
--ref-version 38 \
@@ -473,7 +496,7 @@
bq_table=$(echo ~{fq_info_destination_table} | sed s/\\./:/)

echo "Loading combined TSV into ~{fq_info_destination_table}"
bq load --project_id=~{query_project} --skip_leading_rows 0 -F "tab" \
bq load --project_id=~{project_id} --skip_leading_rows 0 -F "tab" \
--range_partitioning=location,0,26000000000000,6500000000 \
--clustering_fields=location \
--schema "~{filter_schema}" \
@@ -482,7 +505,7 @@
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2022_10_17_2a8c210ac35094997603259fa1cd784486b92e42"
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae"
memory: "3500 MB"
disks: "local-disk 250 HDD"
bootDiskSizeGb: 15
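Reviewer note on the (pre-existing) bq load flags above: the range partitioning spec divides the location keyspace [0, 26,000,000,000,000) into steps of 6,500,000,000, giving 4,000 partitions clustered on location. A one-line shell sanity check of that arithmetic:

    echo $(( 26000000000000 / 6500000000 ))   # prints 4000, the partition count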
@@ -504,7 +527,7 @@ task PopulateFilterSetSites {
File sites_only_variant_filtered_vcf
File sites_only_variant_filtered_vcf_index

String query_project
String project_id

File? gatk_override
}
@@ -529,7 +552,7 @@
bq_table=$(echo ~{fq_filter_sites_destination_table} | sed s/\\./:/)

echo "Loading filter set sites TSV into ~{fq_filter_sites_destination_table}"
bq load --project_id=~{query_project} --skip_leading_rows 1 -F "tab" \
bq load --project_id=~{project_id} --skip_leading_rows 1 -F "tab" \
--range_partitioning=location,0,26000000000000,6500000000 \
--clustering_fields=location \
--schema "filter_set_name:string,location:integer,filters:string" \
@@ -538,7 +561,7 @@
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2022_10_17_2a8c210ac35094997603259fa1cd784486b92e42"
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae"
memory: "3500 MB"
disks: "local-disk 200 HDD"
bootDiskSizeGb: 15
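Reviewer note on the sed s/\\./:/ idiom shared by these tasks: the bq CLI expects a table spec of the form project:dataset.table, while the WDL builds project.dataset.table. Without a /g suffix, sed replaces only the first dot, which is exactly the conversion needed:

    fq="my-project.my_dataset.filter_set_sites"   # illustrative name
    echo "$fq" | sed s/\\./:/    # my-project:my_dataset.filter_set_sites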
@@ -561,7 +584,7 @@ task PopulateFilterSetTranches {
File snp_recal_tranches
File indel_recal_tranches

String query_project
String project_id
}
meta {
# Not `volatile: true` since there shouldn't be a need to re-run this if there has already been a successful execution.
@@ -578,14 +601,14 @@
bq_table=$(echo ~{fq_tranches_destination_table} | sed s/\\./:/)

echo "Loading combined tranches CSV into ~{fq_tranches_destination_table}"
bq load --project_id=~{query_project} --skip_leading_rows 0 -F "," \
bq load --project_id=~{project_id} --skip_leading_rows 0 -F "," \
--schema "filter_set_name:string,target_truth_sensitivity:float,num_known:integer,num_novel:integer,known_ti_tv:float,novel_ti_tv:float,min_vqslod:float,filter_name:string,model:string,accessible_truth_sites:integer,calls_at_truth_sites:integer,truth_sensitivity:float" \
${bq_table} \
~{filter_set_name}.tranches_load.csv > status_load_filter_set_tranches
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2022_10_17_2a8c210ac35094997603259fa1cd784486b92e42"
docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2023_02_15_15677b35536995f14da705ac23c5d1c8b30797ae"
memory: "3500 MB"
disks: "local-disk 200 HDD"
bootDiskSizeGb: 15
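Reviewer note: the three loaders pass different --skip_leading_rows values (0 for the filter set info TSV and the tranches CSV, 1 for the sites TSV), presumably because only the sites export carries a header row. A hypothetical sites file shows what the flag skips:

    printf 'filter_set_name\tlocation\tfilters\nmy_set\t1000000042\tPASS\n' > sites.tsv
    tail -n +2 sites.tsv    # the rows bq load would ingest with --skip_leading_rows 1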
6 changes: 3 additions & 3 deletions scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
@@ -14,9 +14,9 @@ workflow GvsExtractAvroFilesForHail {

call Utils.ValidateFilterSetName {
input:
data_project = project_id,
dataset_name = dataset_name,
filter_set_name = filter_set_name,
project_id = project_id,
fq_filter_set_info_table = "~{project_id}.~{dataset_name}.filter_set_info",
filter_set_name = filter_set_name
}

call OutputPath {