-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AoU GVS Cohort Extract wdl #7242
Changes from 27 commits
b72c5ed
6621160
84ce13a
ce8f26b
f528b12
87ab1db
ee321d6
43aeea8
06d52b8
2496017
dd1d54f
6051a6c
11ecec3
100744d
9eb49c5
3abbda9
a7593b6
fa4068d
48a93ea
33f5104
9ed26dc
d28d4fb
f700fd4
d3bfb28
46105cb
5683c30
e69432a
91ca1f1
6ca3071
b4776a8
e052cdc
13dd121
5601e2f
5ea46ac
aba3686
c5539f9
fbaa396
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,6 @@ workflow GvsExtractCallset { | |
input { | ||
String data_project | ||
String default_dataset | ||
String filter_set_name | ||
|
||
File wgs_intervals | ||
Int scatter_count | ||
|
@@ -16,10 +15,11 @@ workflow GvsExtractCallset { | |
String fq_cohort_extract_table_prefix | ||
String query_project = data_project | ||
|
||
String fq_filter_set_info_table = "~{data_project}.~{default_dataset}.filter_set_info" | ||
String fq_filter_set_site_table = "~{data_project}.~{default_dataset}.filter_set_sites" | ||
String fq_filter_set_tranches_table = "~{data_project}.~{default_dataset}.filter_set_tranches" | ||
Boolean do_not_filter_override = false | ||
String? filter_set_name | ||
String? fq_filter_set_info_table = "~{data_project}.~{default_dataset}.filter_set_info" | ||
String? fq_filter_set_site_table = "~{data_project}.~{default_dataset}.filter_set_sites" | ||
String? fq_filter_set_tranches_table = "~{data_project}.~{default_dataset}.filter_set_tranches" | ||
|
||
# if these are unset, default sensitivity levels will be used | ||
Float? snps_truth_sensitivity_filter_level_override | ||
|
@@ -31,6 +31,7 @@ workflow GvsExtractCallset { | |
File? service_account_json | ||
|
||
String output_file_base_name | ||
String? output_gcs_dir | ||
File? gatk_override | ||
} | ||
|
||
|
@@ -80,9 +81,15 @@ workflow GvsExtractCallset { | |
emit_pls = emit_pls, | ||
service_account_json = service_account_json, | ||
output_file = "${output_file_base_name}_${i}.vcf.gz", | ||
output_gcs_dir = output_gcs_dir, | ||
last_modified_timestamps = [fq_samples_to_extract_table_datetime.last_modified_timestamp, fq_cohort_extract_table_datetime.last_modified_timestamp] | ||
} | ||
} | ||
|
||
output { | ||
Array[File] output_vcfs = ExtractTask.output_vcf | ||
Array[File] output_vcf_indexes = ExtractTask.output_vcf_index | ||
} | ||
} | ||
|
||
################################################################################ | ||
|
@@ -101,14 +108,15 @@ task ExtractTask { | |
String fq_cohort_extract_table | ||
String read_project_id | ||
String output_file | ||
String fq_filter_set_info_table | ||
String fq_filter_set_site_table | ||
String fq_filter_set_tranches_table | ||
String filter_set_name | ||
Float? snps_truth_sensitivity_filter_level | ||
Float? indels_truth_sensitivity_filter_level | ||
String? output_gcs_dir | ||
|
||
Boolean do_not_filter_override | ||
String? fq_filter_set_info_table | ||
String? fq_filter_set_site_table | ||
String? fq_filter_set_tranches_table | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Anything that is assigned a default value in the outer workflow should not be optional here. |
||
String? filter_set_name | ||
Float? snps_truth_sensitivity_filter_level | ||
Float? indels_truth_sensitivity_filter_level | ||
|
||
File? excluded_intervals | ||
Boolean? emit_pls | ||
|
@@ -163,6 +171,14 @@ task ExtractTask { | |
--project-id ~{read_project_id} \ | ||
~{true='--emit-pls' false='' emit_pls} \ | ||
${FILTERING_ARGS} | ||
|
||
# Drop trailing slash if one exists | ||
OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//') | ||
ericsong marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if [ -n "${OUTPUT_GCS_DIR}" ]; then | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since these files are also task outputs, if someone runs with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By the cromwell delocalized results, do you mean the files in the "execution directory" (as Terra calls it)? In my mind, I had always thought of those files as "temporary" as in they're usually created within a TTL folder. I'm realizing now that this may be an incorrect assumption that I carried over from my time working with the long reads methods team. I feel like that might be OK though? Users of the WDL that do not want the duplication can drop the |
||
gsutil cp ~{output_file} ${OUTPUT_GCS_DIR}/ | ||
gsutil cp ~{output_file}.tbi ${OUTPUT_GCS_DIR}/ | ||
fi | ||
>>> | ||
|
||
# ------------------------------------------------ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
version 1.0 | ||
|
||
import "GvsPrepareCallset.wdl" as GvsPrepareCallset | ||
import "GvsExtractCallset.wdl" as GvsExtractCallset | ||
|
||
# Workflow used by AoU to extract variants for a given cohort of sample_names | ||
|
||
workflow GvsExtractCohortFromSampleNames { | ||
|
||
input { | ||
File cohort_sample_names | ||
String query_project | ||
String gvs_project | ||
String gvs_dataset | ||
String fq_gvs_extraction_cohorts_dataset | ||
String fq_gvs_extraction_destination_dataset | ||
String fq_gvs_extraction_temp_tables_dataset | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can make these 2 optional since they are defaulted in the called workflow. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good catch. removed. regarding the two optionals though, do you mean the arguments on lines 16 and 17? those are passed directly into Prepare There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes. if you make |
||
String extraction_uuid | ||
String? output_gcs_dir | ||
|
||
# Extract parameters | ||
File wgs_intervals | ||
Int scatter_count | ||
|
||
File reference | ||
File reference_index | ||
File reference_dict | ||
|
||
String output_file_base_name | ||
|
||
Boolean do_not_filter_override = false | ||
String? filter_set_name | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should not be optional. or if optional, should default to a valid filter set name |
||
String fq_filter_set_info_table = "~{gvs_project}.~{gvs_dataset}.filter_set_info" | ||
String fq_filter_set_site_table = "~{gvs_project}.~{gvs_dataset}.filter_set_sites" | ||
String fq_filter_set_tranches_table = "~{gvs_project}.~{gvs_dataset}.filter_set_tranches" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think you need to set these values or pass them to the workflow since they are defaulted in the workflow. If you want to allow the caller to set them, you should make them optional and then you would pass them into the workflow. |
||
|
||
# if these are unset, default sensitivity levels will be used | ||
Float? snps_truth_sensitivity_filter_level_override | ||
Float? indels_truth_sensitivity_filter_level_override | ||
|
||
File? gatk_override | ||
} | ||
|
||
call GvsPrepareCallset.GvsPrepareCallset { | ||
input: | ||
destination_cohort_table_prefix = extraction_uuid, | ||
sample_names_to_extract = cohort_sample_names, | ||
data_project = query_project, | ||
default_dataset = gvs_dataset, # unused if fq_* args are given | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. even though it's unused here, it still needs to be passed into the task |
||
fq_petvet_dataset = "~{gvs_project}.~{gvs_dataset}", | ||
fq_sample_mapping_table = "~{gvs_project}.~{gvs_dataset}.sample_info", | ||
fq_temp_table_dataset = fq_gvs_extraction_temp_tables_dataset, | ||
fq_destination_dataset = fq_gvs_extraction_destination_dataset | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe make these parameters in the definition of the task optional, since they are assigned defaults and then you won't have to pass the hard coded values There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so these do have defaults in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah, well then maybe add a comment that you are explicitly overriding the default values |
||
} | ||
|
||
call GvsExtractCallset.GvsExtractCallset { | ||
input: | ||
data_project = gvs_project, # unused if fq_filter_set_* args are given or filtering is off | ||
query_project = query_project, | ||
default_dataset = gvs_dataset, # unused if fq_filter_set_* args are given or filtering is off | ||
|
||
wgs_intervals = wgs_intervals, | ||
scatter_count = scatter_count, | ||
|
||
reference = reference, | ||
reference_index = reference_index, | ||
reference_dict = reference_dict, | ||
|
||
fq_cohort_extract_table_prefix = GvsPrepareCallset.fq_cohort_extract_table_prefix, | ||
|
||
do_not_filter_override = do_not_filter_override, | ||
filter_set_name = filter_set_name, | ||
fq_filter_set_info_table = fq_filter_set_info_table, | ||
fq_filter_set_site_table = fq_filter_set_site_table, | ||
fq_filter_set_tranches_table = fq_filter_set_tranches_table, | ||
snps_truth_sensitivity_filter_level_override = snps_truth_sensitivity_filter_level_override, | ||
indels_truth_sensitivity_filter_level_override = indels_truth_sensitivity_filter_level_override, | ||
|
||
output_file_base_name = output_file_base_name, | ||
output_gcs_dir = output_gcs_dir, | ||
gatk_override = gatk_override | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,6 +40,10 @@ workflow GvsPrepareCallset { | |
docker = docker_final | ||
} | ||
|
||
output { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good improvement here |
||
String fq_cohort_extract_table_prefix = PrepareCallsetTask.fq_cohort_extract_table_prefix | ||
} | ||
|
||
} | ||
|
||
task PrepareCallsetTask { | ||
|
@@ -82,6 +86,10 @@ task PrepareCallsetTask { | |
~{"--sa_key_path " + service_account_json} | ||
>>> | ||
|
||
output { | ||
String fq_cohort_extract_table_prefix = "~{fq_destination_dataset}.~{destination_cohort_table_prefix}" # implementation detail of create_cohort_extract_data_table.py | ||
} | ||
|
||
runtime { | ||
docker: docker | ||
memory: "3 GB" | ||
|
@@ -93,5 +101,3 @@ task PrepareCallsetTask { | |
|
||
} | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Filter set name should not be optional. This will be named something that indicates which release of the filtering model the extract should use.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should it be set even if filtering is not going to be used? for example, if do_not_filter_override is true, the argument will never be used
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh, i didn't catch that. It's fine to be optional.