Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GvsUnified WDL to wrap the 6 core GVS WDLs [VS-382] #7789

Merged
merged 3 commits into from
Apr 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ workflows:
branches:
- master
- ah_var_store
- rc-remove-r-graphing-from-filter
- name: GvsCreateAltAllele
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateAltAllele.wdl
Expand Down Expand Up @@ -117,7 +116,6 @@ workflows:
branches:
- master
- ah_var_store
- rc-vs-317-remove-excess-headers
- name: GvsImportGenomes
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsImportGenomes.wdl
Expand Down Expand Up @@ -168,6 +166,13 @@ workflows:
branches:
- master
- ah_var_store
- name: GvsUnified
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsUnified.wdl
filters:
branches:
- master
- ah_var_store
- name: MitochondriaPipeline
subclass: WDL
primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
Expand Down
3 changes: 2 additions & 1 deletion scripts/variantstore/wdl/GvsAssignIds.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import "GvsCreateTables.wdl" as GvsCreateTables
workflow GvsAssignIds {

input {
Boolean go = true
String dataset_name
String project_id

Expand Down Expand Up @@ -65,7 +66,7 @@ workflow GvsAssignIds {
}

output {
Boolean gvs_ids_created = true
Boolean done = true
File gvs_ids_tsv = AssignIds.gvs_ids_tsv
}
}
Expand Down
2 changes: 2 additions & 0 deletions scripts/variantstore/wdl/GvsCreateAltAllele.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ version 1.0

workflow GvsCreateAltAllele {
input {
Boolean go = true
String dataset_name
String project_id

Expand Down Expand Up @@ -35,6 +36,7 @@ workflow GvsCreateAltAllele {

output {
Array[String] vet_tables_loaded = PopulateAltAlleleTable.done
Boolean done = true
}
}

Expand Down
2 changes: 2 additions & 0 deletions scripts/variantstore/wdl/GvsCreateFilterSet.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import "GvsUtils.wdl" as Utils

workflow GvsCreateFilterSet {
input {
Boolean go = true
String dataset_name
String project_id

Expand Down Expand Up @@ -252,6 +253,7 @@ workflow GvsCreateFilterSet {
output {
File output_vcf = MergeVCFs.output_vcf
File output_vcf_idx = MergeVCFs.output_vcf_index
Boolean done = true
}
}

Expand Down
2 changes: 2 additions & 0 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import "GvsUtils.wdl" as Utils

workflow GvsExtractCallset {
input {
Boolean go = true
String dataset_name
String project_id

Expand Down Expand Up @@ -144,6 +145,7 @@ workflow GvsExtractCallset {
Array[File] output_vcf_indexes = ExtractTask.output_vcf_index
Float total_vcfs_size_mb = SumBytes.total_mb
File manifest = CreateManifest.manifest
Boolean done = true
}
}

Expand Down
5 changes: 3 additions & 2 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ version 1.0
workflow GvsImportGenomes {

input {
Boolean go = true
String dataset_name
String project_id

Expand Down Expand Up @@ -81,7 +82,7 @@ workflow GvsImportGenomes {
}

output {
Boolean loaded_in_gvs = true
Boolean done = true
}
}

Expand Down Expand Up @@ -286,7 +287,7 @@ task LoadData {
cpu: 1
}
output {
String done = "true"
Boolean done = true
}
}

Expand Down

This file was deleted.

2 changes: 2 additions & 0 deletions scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ version 1.0

workflow GvsPrepareCallset {
input {
Boolean go = true
String project_id
String dataset_name

Expand Down Expand Up @@ -42,6 +43,7 @@ workflow GvsPrepareCallset {

output {
String fq_cohort_extract_table_prefix = PrepareRangesCallsetTask.fq_cohort_extract_table_prefix
Boolean done = true
}
}

Expand Down
159 changes: 159 additions & 0 deletions scripts/variantstore/wdl/GvsUnified.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
version 1.0

import "GvsAssignIds.wdl" as AssignIds
import "GvsImportGenomes.wdl" as ImportGenomes
import "GvsCreateAltAllele.wdl" as CreateAltAllele
import "GvsCreateFilterSet.wdl" as CreateFilterSet
import "GvsPrepareRangesCallset.wdl" as PrepareRangesCallset
import "GvsExtractCallset.wdl" as ExtractCallset

workflow GvsUnified {
input {
# Begin GvsAssignIds
String dataset_name
String project_id

Array[String] external_sample_names
Boolean samples_are_controls = false

File? gatk_override
String? service_account_json_path
# End GvsAssignIds

# Begin GvsImportGenomes
Array[File] input_vcfs
Array[File] input_vcf_indexes
File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"

Int? load_data_preemptible_override
Int? load_data_maxretries_override
# End GvsImportGenomes

# Begin GvsCreateFilterSet
String filter_set_name
Array[String] indel_recalibration_annotation_values = ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"]
Int create_filter_set_scatter_count
Array[String] snp_recalibration_annotation_values = ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"]

Int? INDEL_VQSR_max_gaussians_override = 4
Int? INDEL_VQSR_mem_gb_override
Int? SNP_VQSR_max_gaussians_override = 6
Int? SNP_VQSR_mem_gb_override
# End GvsCreateFilterSet

# Begin GvsPrepareRangesCallset
String extract_table_prefix

String query_project = project_id
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This duplicates the declaration on line 56.
And why is this even needed?

Copy link
Collaborator Author

@mcovarr mcovarr Apr 15, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! I'm surprised WDL allows completely duplicate declarations like that 😦

The query and destination versions of project and dataset are used in GvsPrepareCallset and GvsExtractCallset, though I do not know the circumstances under which they would be used.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are used with GvsExtractCohortFromSampleNames.wdl, where the GVS dataset and the delivered/billed datasets might be different.

String destination_project = project_id
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is destination_project needed?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are used with GvsExtractCohortFromSampleNames.wdl, where the GVS dataset and the delivered/billed datasets might be different.

String destination_dataset = dataset_name
String fq_temp_table_dataset = "~{destination_project}.~{destination_dataset}"

Array[String]? query_labels
File? sample_names_to_extract
# End GvsPrepareRangesCallset

# Begin GvsExtractCallset
Int extract_scatter_count

File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed"

String extract_output_file_base_name = filter_set_name

Int? extract_maxretries_override
Int? extract_preemptible_override
String? extract_output_gcs_dir
Int? split_intervals_disk_size_override
Int? split_intervals_mem_override
# End GvsExtractCallset
}

# First sub-workflow call: GvsAssignIds, aliased as `AssignIds`.
# No `go` value is passed here, so nothing upstream in this workflow gates it.
call AssignIds.GvsAssignIds as AssignIds {
  input:
    # Target project / dataset.
    project_id = project_id,
    dataset_name = dataset_name,

    # Samples to assign ids to, and whether they are control samples.
    external_sample_names = external_sample_names,
    samples_are_controls = samples_are_controls,

    # Optional overrides shared with the other calls in this workflow.
    service_account_json_path = service_account_json_path,
    assign_ids_gatk_override = gatk_override
}

# Second sub-workflow call: GvsImportGenomes.
# `go` is wired to AssignIds.done, which makes this call depend on the AssignIds call.
call ImportGenomes.GvsImportGenomes {
  input:
    go = AssignIds.done,

    # Target project / dataset.
    project_id = project_id,
    dataset_name = dataset_name,

    # Per-sample inputs.
    external_sample_names = external_sample_names,
    input_vcfs = input_vcfs,
    input_vcf_indexes = input_vcf_indexes,

    interval_list = interval_list,

    # Optional overrides.
    load_data_gatk_override = gatk_override,
    load_data_maxretries_override = load_data_maxretries_override,
    load_data_preemptible_override = load_data_preemptible_override,
    service_account_json_path = service_account_json_path
}

# Third sub-workflow call: GvsCreateAltAllele.
# `go` is wired to GvsImportGenomes.done, which makes this call depend on the import call.
call CreateAltAllele.GvsCreateAltAllele {
  input:
    go = GvsImportGenomes.done,

    # Target project / dataset.
    project_id = project_id,
    dataset_name = dataset_name,

    service_account_json_path = service_account_json_path
}

# Fourth sub-workflow call: GvsCreateFilterSet.
# `go` is wired to GvsCreateAltAllele.done, which makes this call depend on the alt-allele call.
call CreateFilterSet.GvsCreateFilterSet {
  input:
    go = GvsCreateAltAllele.done,

    # Target project / dataset and the filter set to create.
    project_id = project_id,
    dataset_name = dataset_name,
    filter_set_name = filter_set_name,

    # The sub-workflow's `scatter_count` is fed from this workflow's
    # `create_filter_set_scatter_count` (the extract step has its own count).
    scatter_count = create_filter_set_scatter_count,
    interval_list = interval_list,

    # Recalibration annotation lists.
    snp_recalibration_annotation_values = snp_recalibration_annotation_values,
    indel_recalibration_annotation_values = indel_recalibration_annotation_values,

    # Optional VQSR tuning overrides.
    SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
    SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override,
    INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
    INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,

    # Optional overrides shared with the other calls in this workflow.
    gatk_override = gatk_override,
    service_account_json_path = service_account_json_path
}

# Fifth sub-workflow call: GvsPrepareCallset (imported from GvsPrepareRangesCallset.wdl).
# `go` is wired to GvsCreateFilterSet.done, which makes this call depend on the filter-set call.
call PrepareRangesCallset.GvsPrepareCallset {
  input:
    go = GvsCreateFilterSet.done,

    # Source project / dataset.
    project_id = project_id,
    dataset_name = dataset_name,

    # Query / destination locations; in this workflow they default to the
    # source project and dataset unless the caller overrides them.
    query_project = query_project,
    destination_project = destination_project,
    destination_dataset = destination_dataset,
    fq_temp_table_dataset = fq_temp_table_dataset,

    extract_table_prefix = extract_table_prefix,

    # The sub-workflow's `control_samples` is fed from `samples_are_controls`.
    control_samples = samples_are_controls,

    # Optional inputs.
    sample_names_to_extract = sample_names_to_extract,
    query_labels = query_labels,
    service_account_json_path = service_account_json_path
}

# Sixth and last sub-workflow call: GvsExtractCallset.
# `go` is wired to GvsPrepareCallset.done, which makes this call depend on the prepare call.
call ExtractCallset.GvsExtractCallset {
  input:
    go = GvsPrepareCallset.done,

    # Source project / dataset and query project.
    project_id = project_id,
    dataset_name = dataset_name,
    query_project = query_project,

    # What to extract.
    extract_table_prefix = extract_table_prefix,
    filter_set_name = filter_set_name,
    # The sub-workflow's `control_samples` is fed from `samples_are_controls`.
    control_samples = samples_are_controls,

    # The sub-workflow's `scatter_count` is fed from this workflow's `extract_scatter_count`.
    scatter_count = extract_scatter_count,
    interval_list = interval_list,
    interval_weights_bed = interval_weights_bed,

    # Output naming / location.
    output_file_base_name = extract_output_file_base_name,
    output_gcs_dir = extract_output_gcs_dir,

    # Optional overrides.
    gatk_override = gatk_override,
    service_account_json_path = service_account_json_path,
    extract_preemptible_override = extract_preemptible_override,
    extract_maxretries_override = extract_maxretries_override,
    split_intervals_disk_size_override = split_intervals_disk_size_override,
    split_intervals_mem_override = split_intervals_mem_override
}
}