Core storage model cost [VS-473] #7913
Changes from 8 commits
@@ -2,11 +2,11 @@ version 1.0

workflow GvsCallsetCost {
    input {
        # String project_id
        # String dataset_name
        String project_id
        String dataset_name
        String workspace_namespace
        String workspace_name
        # String callset_name
        String callset_identifier
        Array[String] excluded_submission_ids = []
    }
@@ -17,29 +17,32 @@ workflow GvsCallsetCost {
        excluded_submission_ids = excluded_submission_ids
    }

    # call BigQueryWriteAPICost {
    #     input:
    #         project_id = project_id,
    #         dataset_name = dataset_name
    # }
    #
    call CoreStorageModelSizes {
        input:
            project_id = project_id,
            dataset_name = dataset_name
    }

    # call BigQueryScannedCost {
    #     input:
    #         project_id = project_id,
    #         dataset_name = dataset_name,
    #         callset_name = callset_name
    #         callset_identifier = callset_identifier
    # }
    #
    # call BigQueryStorageAPIScannedCost {
    #     input:
    #         project_id = project_id,
    #         dataset_name = dataset_name,
    #         callset_name = callset_name
    #         callset_identifier = callset_identifier
    # }

    output {
        File workflow_compute_costs = WorkflowComputeCosts.costs
        File workflow_compute_costs_log = WorkflowComputeCosts.log
        String vet_gib = CoreStorageModelSizes.vet_gib
        String ref_ranges_gib = CoreStorageModelSizes.ref_ranges_gib
        String alt_allele_gib = CoreStorageModelSizes.alt_allele_gib
    }
}
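A note on the outputs just above: the workflow declares vet_gib, ref_ranges_gib, and alt_allele_gib as String, while the CoreStorageModelSizes task further down declares them as Float. As written, the diff relies on WDL's Float-to-String coercion, which Cromwell accepts; aligning the types would also work. A minimal sketch of the alternative (the Float variant is an assumption, not part of the PR):

    output {
        # As in the PR: the task's Float output bound to a workflow String output.
        String vet_gib = CoreStorageModelSizes.vet_gib
        # Type-aligned alternative:
        # Float vet_gib = CoreStorageModelSizes.vet_gib
    }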
@@ -75,29 +78,93 @@ task WorkflowComputeCosts {
    }
}
#task BigQueryWriteAPICost {
#    meta {
#        description: "Estimate GvsImportGenomes use of the BQ Write API via core storage costs from the sizes of vet_% and ref_ranges_% tables."
#        volatile: true
#    }
#
#    input {
#        String project_id
#        String dataset_name
#    }
#    command <<<
#    >>>
#
#    runtime {
#        docker: ""
#    }
#
#    output {
#        Float vet_gib = read_float("")
#        Float ref_ranges_gib = read_float("")
#        Float import_genomes_cost = 3
#    }
#}

task CoreStorageModelSizes {
    input {
        String project_id
        String dataset_name
    }
    meta {
        description: "Read sizes of vet_%, ref_ranges_%, and alt_allele tables from `INFORMATION_SCHEMA.PARTITIONS`."
        # Definitely don't cache this, the values will change while the inputs to this task will not!
        volatile: true
    }
    command <<<
        sanity_check_project() {

Review thread:
  Reviewer: Why validate the project id? And if there are good reasons to validate it, why just here?
  Author: This gets interpolated into various BQ queries and, given all the recent Hack EDU training, I thought a little validation might be in order. 🙂 I was working on centralizing all input validation in one task in my next PR, with downstream tasks only running if that succeeds.
  Author: Actually, since I have reviewers' attention now, I'll just mosey that code over from my in-progress branch to this one... 🙂
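To make the interpolation concern concrete: ~{project_id} is spliced directly into the SQL string built in get_billable_bytes_in_gib below, so an unvalidated value containing a backtick could rewrite the query. A minimal shell sketch, using a purely hypothetical malicious value:

    # Hypothetical unvalidated input containing a backtick:
    project_id='evil` UNION ALL SELECT 1 -- '
    echo "SELECT sum(total_billable_bytes) FROM \`${project_id}.ds.INFORMATION_SCHEMA.PARTITIONS\`"
    # Prints a query whose FROM clause has been broken out of:
    #   SELECT sum(total_billable_bytes) FROM `evil` UNION ALL SELECT 1 -- .ds.INFORMATION_SCHEMA.PARTITIONS`

The character allowlist below rejects such values before they ever reach bq.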
            local -n outfail="fail"

            # Technically single quotes and exclamation points are allowed but none of that nonsense here.
            # https://cloud.google.com/resource-manager/docs/creating-managing-projects#:~:text=A%20project%20name%20can%20contain,between%204%20and%2030%20characters.
            valid='-_0-9a-zA-Z'

Review thread:
  Reviewer: thank you!
||
if [[ "~{project_id}" =~ [^$valid] ]] | ||
then | ||
echo "Invalid project name '~{project_id}': contains invalid characters, valid characters in [$valid]." | ||
outfail=1 | ||
fi | ||
|
||
project_id='~{project_id}' | ||
project_id_length=${#project_id} | ||
if [[ $project_id_length -lt 4 ]] || [[ $project_id_length -gt 30 ]] | ||
then | ||
echo "Invalid project name '~{project_id}', length must be between 4 and 30 characters inclusive." | ||
outfail=1 | ||
fi | ||
} | ||
|
||
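A side note on the local -n lines in these helper functions: local -n declares a bash nameref (bash 4.3+), so assigning to outfail inside the function sets the caller's global fail variable without echoing and capturing a value. A standalone sketch of the pattern (mark_failure is a made-up name for illustration):

    fail=0
    mark_failure() {
        # outref becomes an alias for the global variable named "fail".
        local -n outref="fail"
        outref=1
    }
    mark_failure
    echo "fail=$fail"   # prints: fail=1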
        sanity_check_dataset_name() {

Review thread:
  Reviewer: see questions about ...
            local -n outfail="fail"

            valid="0-9A-Za-z_"

            if [[ "~{dataset_name}" =~ [^$valid] ]]
            then
                echo "Invalid dataset name '~{dataset_name}': contains invalid characters, valid characters in [$valid]."
                outfail=1
            fi

            dataset_name='~{dataset_name}'
            dataset_name_length=${#dataset_name}
            if [[ $dataset_name_length -lt 1 ]] || [[ $dataset_name_length -gt 1024 ]]
            then
                echo "Invalid dataset name '~{dataset_name}': length must be between 1 and 1024 characters inclusive."
                outfail=1
            fi
        }
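One subtlety in these =~ checks: $valid expands inside the bracket expression, so the project test is effectively [[ "$value" =~ [^-_0-9a-zA-Z] ]], and the leading dash in sanity_check_project's allowlist is literal because a dash placed first in a bracket expression is not a range operator. A quick check of the behavior (the sample values are hypothetical):

    valid='-_0-9a-zA-Z'
    [[ "good-name_123" =~ [^$valid] ]] && echo rejected || echo ok   # -> ok
    [[ "bad name!" =~ [^$valid] ]] && echo rejected || echo ok       # -> rejected (space and !)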
        get_billable_bytes_in_gib() {
            local table_pattern="$1"
            local output_file_name="$2"

            bq query --location=US --project_id='~{project_id}' --format=csv --use_legacy_sql=false \
                "SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) \
                    FROM \`~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \
                    WHERE table_name LIKE '${table_pattern}'" | tail -1 > ${output_file_name}
        }
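For a sense of what tail -1 trims: with --format=csv, bq query prints a CSV header row before the result row, so keeping only the last line leaves the bare number for read_float() in the output section to parse. A hedged illustration with placeholder project and dataset names:

    # Hypothetical invocation with project_id=my-project, dataset_name=my_dataset:
    bq query --location=US --project_id='my-project' --format=csv --use_legacy_sql=false \
        "SELECT round(sum(total_billable_bytes) / (1024*1024*1024),2) \
            FROM \`my-project.my_dataset.INFORMATION_SCHEMA.PARTITIONS\` \
            WHERE table_name LIKE 'vet_%'"
    # Output shape (header name and value are illustrative):
    #   f0_
    #   123.45
    # tail -1 keeps only "123.45", which lands in vet_gib.txt.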
        fail=0

        sanity_check_project
        sanity_check_dataset_name

        if [[ $fail -eq 1 ]]
        then
            exit 1
        fi

        get_billable_bytes_in_gib "vet_%" vet_gib.txt
        get_billable_bytes_in_gib "ref_ranges_%" ref_ranges_gib.txt
        get_billable_bytes_in_gib "alt_allele" alt_allele_gib.txt
    >>>
    runtime {
        docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:390.0.0"
    }
    output {
        Float vet_gib = read_float("vet_gib.txt")
        Float ref_ranges_gib = read_float("ref_ranges_gib.txt")
        Float alt_allele_gib = read_float("alt_allele_gib.txt")
    }
}

#task BigQueryScannedCost {
#    meta {
@@ -108,7 +175,7 @@ task WorkflowComputeCosts {
#    input {
#        String project_id
#        String dataset_name
#        String callset_name
#        String callset_identifier
#    }
#
#    command <<<
@@ -135,7 +202,7 @@ task WorkflowComputeCosts {
#    input {
#        String project_id
#        String dataset_name
#        String callset_name
#        String callset_identifier
#    }
#
#    command <<<
Review thread:
  Reviewer: nit: change to call_set_identifier to be consistent with the table field?
  Author: Yeah, this was bugging me too... I was wondering whether to standardize on call_set or callset; sounds like you prefer the former.