Skip to content

Commit

Permalink
Add machine type- availability zone precondition. Validate infra is p…
Browse files Browse the repository at this point in the history
…rovisioned only if the machine exists in one of the zones specified

Update tools/cloud-build/daily-tests/tests/hcls.yml

Co-authored-by: Tom Downes <[email protected]>

Update tools/cloud-build/daily-tests/tests/hcls.yml

Co-authored-by: Tom Downes <[email protected]>

Use europe-west1-d in reservations.tf
  • Loading branch information
annuay-google committed Sep 11, 2024
1 parent b827380 commit 62acb6a
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,15 @@ modules. For support with the underlying modules, see the instructions in the

| Name | Version |
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3 |
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.4 |
| <a name="requirement_google"></a> [google](#requirement\_google) | >= 5.11 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | >= 5.11 |
| <a name="provider_terraform"></a> [terraform](#provider\_terraform) | n/a |

## Modules

Expand All @@ -147,7 +148,9 @@ No modules.

| Name | Type |
|------|------|
| [terraform_data.machine_type_zone_validation](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
| [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source |
| [google_compute_machine_types.machine_types_by_zone](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_machine_types) | data source |
| [google_compute_reservation.reservation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source |
| [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source |

Expand Down
27 changes: 26 additions & 1 deletion community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ locals {
gpu = one(local.guest_accelerator)

labels = local.labels
machine_type = var.machine_type
machine_type = terraform_data.machine_type_zone_validation.output
metadata = local.metadata
min_cpu_platform = var.min_cpu_platform

Expand Down Expand Up @@ -171,3 +171,28 @@ data "google_compute_reservation" "reservation" {
# Add a validation that if reservation.project != var.project_id it should be a shared reservation
}
}

# Look up, once per entry of local.zones, the machine types in that zone whose
# name matches var.machine_type.
# NOTE(review): local.zones is defined outside this excerpt; presumably a set
# of zone names — confirm.
data "google_compute_machine_types" "machine_types_by_zone" {
  for_each = local.zones

  zone   = each.value
  filter = "name = \"${var.machine_type}\""
}

locals {
  # Shorthand for the per-zone lookup results (one entry per for_each key).
  machine_types_by_zone = data.google_compute_machine_types.machine_types_by_zone

  # Keys of the lookups that returned at least one matching machine type.
  zones_with_machine_type = [
    for zone_key, lookup in local.machine_types_by_zone :
    zone_key
    if length(lookup.machine_types) > 0
  ]
}

# Passes var.machine_type through as `.output`, guarded by a precondition that
# fails unless at least one zone in local.zones_with_machine_type offers the
# requested machine type.
resource "terraform_data" "machine_type_zone_validation" {
  input = var.machine_type

  lifecycle {
    precondition {
      # zones_with_machine_type is empty when no queried zone returned a match.
      condition     = length(local.zones_with_machine_type) > 0
      error_message = <<-EOT
        machine type ${var.machine_type} is not available in any of the zones ${jsonencode(local.zones)}. To list zones in which it is available, run:
        gcloud compute machine-types list --filter="name=${var.machine_type}"
        EOT
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/

terraform {
required_version = ">= 1.3"
required_version = ">= 1.4"

required_providers {
google = {
Expand Down
2 changes: 1 addition & 1 deletion tools/cloud-build/daily-tests/tests/hcls.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ deployment_name: "hcls-v6-{{ build }}"
# No non-alphanumerical characters in the slurm cluster name - they will be
# removed by Cluster Toolkit slurm wrappers, which will break the playbook
slurm_cluster_name: "hclsv6{{ build[0:4] }}"
zone: europe-west1-d
zone: europe-west1-c
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/hcls-blueprint.yaml"
network: "{{ test_name }}-net"
Expand Down
6 changes: 3 additions & 3 deletions tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ deployment_name: "enter-{{ build }}"
# Manually adding the slurm_cluster_name for use in node names, which filters
# non-alphanumeric chars and is capped at 10 chars.
slurm_cluster_name: "enter{{ build[0:5] }}"
zone: europe-west1-d
zone: europe-west4-c
cli_deployment_vars:
network_name: "{{ network }}"
region: europe-west1
region: europe-west4
zone: "{{ zone }}"
gpu_zones: "[europe-west1-b,europe-west1-c,europe-west1-d]"
gpu_zones: "[europe-west4-a,europe-west4-b,europe-west4-c]"
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/hpc-enterprise-slurm.yaml"
network: "{{ test_name }}-net"
Expand Down
6 changes: 3 additions & 3 deletions tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ slurm_cluster_name: "rock8{{ build[0:5] }}"

cli_deployment_vars:
network_name: "{{ network }}"
region: us-west4
zone: us-west4-c
region: us-central1
zone: us-central1-a

zone: us-west4-c
zone: us-central1-a
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/hpc-slurm.yaml"
network: "{{ test_name }}-net"
Expand Down
4 changes: 2 additions & 2 deletions tools/cloud-build/provision/reservations.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ resource "google_compute_reservation" "c2standard60_us_west4_c" {
}

resource "google_compute_reservation" "n1standard8_with_tesla_t4_europe_west1_d" {
name = "n1standard8-with-tesla-t4-europe-west1-d"
zone = "europe-west1-d"
name = "n1standard8-with-tesla-t4-europe-west1-c"
zone = "europe-west1-c"
description = local.reservation_description

specific_reservation {
Expand Down

0 comments on commit 62acb6a

Please sign in to comment.