Core storage model cost [VS-473] #7913

Merged
merged 15 commits on Jun 28, 2022
1 change: 1 addition & 0 deletions .dockstore.yml
@@ -218,6 +218,7 @@ workflows:
- master
- ah_var_store
- vs_472_workflow_compute_costs
- vs_473_core_storage_model_cost
- name: MitochondriaPipeline
subclass: WDL
primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
194 changes: 157 additions & 37 deletions scripts/variantstore/wdl/GvsCallsetCost.wdl
@@ -2,44 +2,151 @@ version 1.0

workflow GvsCallsetCost {
input {
# String project_id
# String dataset_name
String project_id
String dataset_name
String workspace_namespace
String workspace_name
# String callset_name
String call_set_identifier
Array[String] excluded_submission_ids = []
}

call ValidateInputs {
input:
project_id = project_id,
dataset_name = dataset_name,
workspace_namespace = workspace_namespace,
workspace_name = workspace_name,
call_set_identifier = call_set_identifier
}

call WorkflowComputeCosts {
input:
go = ValidateInputs.done,
workspace_namespace = workspace_namespace,
workspace_name = workspace_name,
excluded_submission_ids = excluded_submission_ids
}

# call BigQueryWriteAPICost {
# input:
# project_id = project_id,
# dataset_name = dataset_name
# }
#
call CoreStorageModelSizes {
input:
go = ValidateInputs.done,
project_id = project_id,
dataset_name = dataset_name
}

# call BigQueryScannedCost {
# input:
# project_id = project_id,
# dataset_name = dataset_name,
# callset_name = callset_name
# call_set_identifier = call_set_identifier
# }
#
# call BigQueryStorageAPIScannedCost {
# input:
# project_id = project_id,
# dataset_name = dataset_name,
# callset_name = callset_name
# call_set_identifier = call_set_identifier
# }

output {
File workflow_compute_costs = WorkflowComputeCosts.costs
File workflow_compute_costs_log = WorkflowComputeCosts.log
String vet_gib = CoreStorageModelSizes.vet_gib
String ref_ranges_gib = CoreStorageModelSizes.ref_ranges_gib
String alt_allele_gib = CoreStorageModelSizes.alt_allele_gib
}
}
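
The `go = ValidateInputs.done` wiring above makes both downstream tasks wait for input validation to pass before doing any work. As a minimal sketch, assuming placeholder project, dataset, workspace, and call set values (none of these come from this PR), the workflow could be launched locally with Cromwell like so:

cat > gvs_callset_cost_inputs.json <<'EOF'
{
  "GvsCallsetCost.project_id": "my-gcp-project",
  "GvsCallsetCost.dataset_name": "my_gvs_dataset",
  "GvsCallsetCost.workspace_namespace": "my-terra-namespace",
  "GvsCallsetCost.workspace_name": "my-terra-workspace",
  "GvsCallsetCost.call_set_identifier": "my_callset_2022"
}
EOF

java -jar cromwell.jar run scripts/variantstore/wdl/GvsCallsetCost.wdl --inputs gvs_callset_cost_inputs.json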

task ValidateInputs {
meta {
description: "Sanity check inputs before running anything"
# It's OK for this task to be call-cached, so it's not marked `volatile`.
}
input {
String project_id
String dataset_name
String workspace_namespace
String workspace_name
String call_set_identifier
}
command <<<

sanity_check_input() {
local -n outfail="fail"
local description="$1"
local input="$2"
local valid_characters="$3"
local minimum_length="$4"
local maximum_length="$5"

# Do not check for valid characters if the `valid_characters` variable is empty.
if [[ ${#valid_characters} -gt 0 ]]
then
if [[ "${input}" =~ [^${valid_characters}] ]]
then
echo "Invalid ${description} '${input}': contains invalid characters, valid characters in [${valid_characters}]."
outfail=1
fi
fi

local input_length=${#input}
if [[ ${input_length} -lt ${minimum_length} ]] || [[ ${input_length} -gt ${maximum_length} ]]
then
echo "Invalid ${description} '$input', length must be between ${minimum_length} and ${maximum_length} characters inclusive."
outfail=1
fi
}

fail=0

# Technically single quotes and exclamation points are allowed in project ids but none of that nonsense here.
# https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters.
sanity_check_input \
"project id" \
"~{project_id}" \
'-_0-9a-zA-Z' \
4 \
30

sanity_check_input \
"dataset name" \
"~{dataset_name}" \
"0-9A-Za-z_" \
1 \
1024

# The following non-Google restrictions are arbitrary but comforting and could be relaxed.
sanity_check_input \
"call set identifier" \
"~{call_set_identifier}" \
'-_0-9a-zA-Z' \
1 \
100

sanity_check_input \
"workspace namespace" \
"~{workspace_namespace}" \
'' \
1 \
100

sanity_check_input \
"workspace name" \
"~{workspace_name}" \
'' \
1 \
100

if [[ $fail -eq 1 ]]
then
exit 1
fi
>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:390.0.0"
}
output {
Boolean done = true
}
}
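
The helper above relies on a bash nameref (`local -n`) so it can set the caller's `fail` flag, and on a negated character class built from the `valid_characters` argument. A standalone sketch of that character-class test, with made-up example values:

#!/usr/bin/env bash
# Illustration only; the input and character set below are made-up examples.
input='my project!'             # contains a space and '!', neither of which is allowed
valid_characters='-_0-9a-zA-Z'  # leading '-' keeps the dash literal inside the class
if [[ "${input}" =~ [^${valid_characters}] ]]
then
    echo "Invalid: '${input}' contains characters outside [${valid_characters}]."
fi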

@@ -48,8 +155,8 @@ task WorkflowComputeCosts {
description: "Calculate workflow compute costs by calling Firecloud APIs for submissions in the specified workspace"
volatile: true
}

input {
Boolean go = true
String workspace_namespace
String workspace_name
Array[String] excluded_submission_ids
@@ -75,29 +182,42 @@ task WorkflowComputeCosts {
}
}

#task BigQueryWriteAPICost {
# meta {
# description: "Estimate GvsImportGenomes use of the BQ Write API via core storage costs from the sizes of vet_% and ref_ranges_% tables."
# volatile: true
# }
#
# input {
# String project_id
# String dataset_name
# }
# command <<<
# >>>
#
# runtime {
# docker: ""
# }
#
# output {
# Float vet_gib = read_float("")
# Float ref_ranges_gib = read_float("")
# Float import_genomes_cost = 3
# }
#}
task CoreStorageModelSizes {
meta {
description: "Read sizes of vet_%, ref_ranges_%, and alt_allele tables from `INFORMATION_SCHEMA.PARTITIONS`."
# Definitely don't call-cache this: the table sizes will change even though the inputs to this task will not!
volatile: true
}
input {
Boolean go = true
String project_id
String dataset_name
}
command <<<

get_billable_bytes_in_gib() {
local table_pattern="$1"
local output_file_name="$2"

bq query --location=US --project_id='~{project_id}' --format=csv --use_legacy_sql=false \
"SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) \
FROM \`~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \
WHERE table_name LIKE '${table_pattern}'" | tail -1 > ${output_file_name}
}

get_billable_bytes_in_gib "vet_%" vet_gib.txt
get_billable_bytes_in_gib "ref_ranges_%" ref_ranges_gib.txt
get_billable_bytes_in_gib "alt_allele" alt_allele_gib.txt
>>>
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:390.0.0"
}
output {
Float vet_gib = read_float("vet_gib.txt")
Float ref_ranges_gib = read_float("ref_ranges_gib.txt")
Float alt_allele_gib = read_float("alt_allele_gib.txt")
}
}
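
For reference, this is roughly what one of those calls expands to for the vet_% tables, using placeholder project and dataset names (the real values come from the task's ~{project_id} and ~{dataset_name} inputs):

# Placeholder project/dataset names below; the task substitutes its own inputs.
bq query --location=US --project_id='my-gcp-project' --format=csv --use_legacy_sql=false \
  "SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) \
  FROM \`my-gcp-project.my_gvs_dataset.INFORMATION_SCHEMA.PARTITIONS\` \
  WHERE table_name LIKE 'vet_%'" | tail -1 > vet_gib.txt
# vet_gib.txt now holds a single number: the total billable size of the vet_% partitions in GiB.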

#task BigQueryScannedCost {
# meta {
@@ -108,7 +228,7 @@ task WorkflowComputeCosts {
# input {
# String project_id
# String dataset_name
# String callset_name
# String call_set_identifier
# }
#
# command <<<
@@ -135,7 +255,7 @@ task WorkflowComputeCosts {
# input {
# String project_id
# String dataset_name
# String callset_name
# String call_set_identifier
# }
#
# command <<<