forked from GoogleCloudPlatform/ai-on-gke
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.tf
219 lines (196 loc) · 9.35 KB
/
main.tf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#######################################################
#### APPLICATIONS
#######################################################
# Google provider, scoped to the target project.
provider "google" {
  project = var.project_id
}

# Time provider; no settings are configured here.
provider "time" {}

# Client configuration of the active Google credentials. Its access token is
# read by the aliased Kubernetes and Helm providers declared in this file.
data "google_client_config" "default" {}

# Project lookup; the project number is interpolated into the Connect Gateway
# host URL in the locals block.
data "google_project" "project" {
  project_id = var.project_id
}
## Activate the Google Cloud APIs this deployment depends on.
module "project-services" {
  source  = "terraform-google-modules/project-factory/google//modules/project_services"
  version = "~> 14.5"

  project_id = var.project_id

  # Both "disable" switches are off.
  disable_dependent_services  = false
  disable_services_on_destroy = false

  activate_apis = flatten([
    "autoscaling.googleapis.com",
    "cloudbuild.googleapis.com",
    "cloudresourcemanager.googleapis.com",
    "compute.googleapis.com",
    "config.googleapis.com",
    "connectgateway.googleapis.com",
    "container.googleapis.com",
    "containerfilesystem.googleapis.com",
    "dns.googleapis.com",
    "gkehub.googleapis.com",
    "iamcredentials.googleapis.com",
    "logging.googleapis.com",
    "monitoring.googleapis.com",
    "pubsub.googleapis.com",
    "servicenetworking.googleapis.com",
    "serviceusage.googleapis.com",
    "sourcerepo.googleapis.com",
    "iap.googleapis.com",
  ])
}
# Cluster infrastructure; instantiated only when var.create_cluster is true.
module "infra" {
  source = "../../infrastructure"
  count  = var.create_cluster ? 1 : 0

  # Cluster identity and mode.
  project_id        = var.project_id
  cluster_name      = local.cluster_name
  cluster_location  = var.cluster_location
  autopilot_cluster = var.autopilot_cluster
  private_cluster   = var.private_cluster

  # Networking: no new network is created; the names below are both "default".
  create_network  = false
  network_name    = "default"
  subnetwork_name = "default"

  # Node pools.
  cpu_pools  = var.cpu_pools
  enable_gpu = var.enable_gpu
  gpu_pools  = var.gpu_pools

  ray_addon_enabled = true

  # Wait for the required APIs to be enabled first.
  depends_on = [module.project-services]
}
# Lookup of a pre-existing cluster; read only when var.create_cluster is false
# (the inverse of the count on module.infra).
data "google_container_cluster" "default" {
  count = var.create_cluster ? 0 : 1

  name     = var.cluster_name
  location = var.cluster_location

  depends_on = [module.project-services]
}
locals {
  # Names: each is prefixed with "<goog_cm_deployment_name>-" when that
  # variable is non-empty, otherwise the plain variable value is used.
  cluster_name                      = var.goog_cm_deployment_name != "" ? "${var.goog_cm_deployment_name}-${var.cluster_name}" : var.cluster_name
  kubernetes_namespace              = var.goog_cm_deployment_name != "" ? "${var.goog_cm_deployment_name}-${var.kubernetes_namespace}" : var.kubernetes_namespace
  workload_identity_service_account = var.goog_cm_deployment_name != "" ? "${var.goog_cm_deployment_name}-${var.workload_identity_service_account}" : var.workload_identity_service_account

  # Membership id falls back to the (possibly prefixed) cluster name when unset.
  cluster_membership_id = var.cluster_membership_id == "" ? local.cluster_name : var.cluster_membership_id

  # Cluster facts: taken from module.infra / input variables when this
  # configuration creates the cluster, otherwise read from the existing cluster.
  endpoint         = var.create_cluster ? "https://${module.infra[0].endpoint}" : "https://${data.google_container_cluster.default[0].endpoint}"
  ca_certificate   = var.create_cluster ? base64decode(module.infra[0].ca_certificate) : base64decode(data.google_container_cluster.default[0].master_auth[0].cluster_ca_certificate)
  private_cluster  = var.create_cluster ? var.private_cluster : data.google_container_cluster.default[0].private_cluster_config[0].enable_private_endpoint
  enable_autopilot = var.create_cluster ? var.autopilot_cluster : data.google_container_cluster.default[0].enable_autopilot
  enable_tpu       = var.create_cluster ? var.enable_tpu : data.google_container_cluster.default[0].enable_tpu

  # API host for the Kubernetes/Helm providers: the Connect Gateway URL for
  # private clusters, the cluster endpoint otherwise.
  host = local.private_cluster ? "https://connectgateway.googleapis.com/v1/projects/${data.google_project.project.number}/locations/${var.cluster_location}/gkeMemberships/${local.cluster_membership_id}" : local.endpoint

  # Cloud Console link to the Ray head service of this deployment.
  ray_cluster_default_uri = "https://console.cloud.google.com/kubernetes/service/${var.cluster_location}/${local.cluster_name}/${local.kubernetes_namespace}/${var.ray_cluster_name}-kuberay-head-svc/overview?project=${var.project_id}"
}
# Kubernetes provider (alias "ray") pointed at the target cluster.
provider "kubernetes" {
  alias = "ray"

  host  = local.host
  token = data.google_client_config.default.access_token

  # No CA certificate is supplied for private clusters, where local.host is
  # the Connect Gateway URL rather than the cluster endpoint.
  cluster_ca_certificate = local.private_cluster ? "" : local.ca_certificate

  # The exec credential plugin is configured for private clusters only.
  dynamic "exec" {
    for_each = local.private_cluster ? [1] : []

    content {
      command     = "gke-gcloud-auth-plugin"
      api_version = "client.authentication.k8s.io/v1beta1"
    }
  }
}
# Helm provider (alias "ray"); its kubernetes settings mirror those of the
# aliased Kubernetes provider declared in this file.
provider "helm" {
  alias = "ray"

  kubernetes {
    host  = local.host
    token = data.google_client_config.default.access_token

    # No CA certificate is supplied for private clusters, where local.host is
    # the Connect Gateway URL rather than the cluster endpoint.
    cluster_ca_certificate = local.private_cluster ? "" : local.ca_certificate

    # The exec credential plugin is configured for private clusters only.
    dynamic "exec" {
      for_each = local.private_cluster ? [1] : []

      content {
        command     = "gke-gcloud-auth-plugin"
        api_version = "client.authentication.k8s.io/v1beta1"
      }
    }
  }
}
# Kubernetes namespace for this deployment, managed through the aliased Helm
# provider.
module "namespace" {
  source = "../../modules/kubernetes-namespace"

  providers = {
    helm = helm.ray
  }

  namespace        = local.kubernetes_namespace
  create_namespace = true
}
# Workload Identity service account for the Ray workloads in the namespace.
module "kuberay-workload-identity" {
  source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
  # Pinned to 30.0.0: the newer release (30.1.0) showed inconsistent behaviour
  # with workload identity service accounts.
  version = "30.0.0"

  providers = {
    kubernetes = kubernetes.ray
  }

  project_id = var.project_id
  namespace  = local.kubernetes_namespace
  name       = local.workload_identity_service_account

  # Reuse an existing GCP service account unless one should be created.
  use_existing_gcp_sa = !var.create_service_account

  roles                           = ["roles/cloudsql.client", "roles/monitoring.viewer"]
  automount_service_account_token = true

  # The namespace must exist before the service account is placed in it.
  depends_on = [module.namespace]
}
# Monitoring stack for the Ray cluster; instantiated only when
# var.create_ray_cluster is true.
module "kuberay-monitoring" {
  count  = var.create_ray_cluster ? 1 : 0
  source = "../../modules/kuberay-monitoring"

  providers = {
    helm       = helm.ray
    kubernetes = kubernetes.ray
  }

  project_id = var.project_id

  # Use the autopilot flag resolved in locals rather than var.autopilot_cluster:
  # when an existing cluster is reused (create_cluster = false) the local reads
  # the real setting from the cluster, and it is the same value that
  # module.kuberay-cluster receives. When the cluster is created here, the
  # local is exactly var.autopilot_cluster, so behaviour is unchanged.
  autopilot_cluster = local.enable_autopilot

  namespace           = local.kubernetes_namespace
  create_namespace    = true
  k8s_service_account = local.workload_identity_service_account

  enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard

  # The service account referenced above must exist first.
  depends_on = [module.kuberay-workload-identity]
}
# GCS bucket module; instantiated only when var.create_gcs_bucket is true.
module "gcs" {
  count  = var.create_gcs_bucket ? 1 : 0
  source = "../../modules/gcs"

  project_id  = var.project_id
  bucket_name = var.gcs_bucket
}
# The Ray cluster itself; instantiated only when var.create_ray_cluster is true.
module "kuberay-cluster" {
  source = "../../modules/kuberay-cluster"
  count  = var.create_ray_cluster == true ? 1 : 0

  providers = {
    helm       = helm.ray
    kubernetes = kubernetes.ray
  }

  # Identity and placement.
  project_id             = var.project_id
  name                   = var.ray_cluster_name
  namespace              = local.kubernetes_namespace
  google_service_account = local.workload_identity_service_account
  additional_labels      = var.additional_labels

  # Cluster capabilities.
  autopilot_cluster = local.enable_autopilot
  enable_gpu        = var.enable_gpu
  enable_tpu        = local.enable_tpu
  gcs_bucket        = var.gcs_bucket

  # Grafana URI from the monitoring module, or an empty string when Grafana on
  # the Ray dashboard is disabled.
  grafana_host = var.enable_grafana_on_ray_dashboard ? module.kuberay-monitoring[0].grafana_uri : ""

  # Network policy.
  disable_network_policy    = var.disable_ray_cluster_network_policy
  network_policy_allow_cidr = var.kuberay_network_policy_allow_cidr

  # IAP Auth parameters
  add_auth                 = var.ray_dashboard_add_auth
  create_brand             = var.create_brand
  support_email            = var.support_email
  client_id                = var.ray_dashboard_client_id
  client_secret            = var.ray_dashboard_client_secret
  domain                   = var.ray_dashboard_domain
  k8s_ingress_name         = var.ray_dashboard_k8s_ingress_name
  k8s_iap_secret_name      = var.ray_dashboard_k8s_iap_secret_name
  k8s_managed_cert_name    = var.ray_dashboard_k8s_managed_cert_name
  k8s_backend_config_name  = var.ray_dashboard_k8s_backend_config_name
  k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port

  # Comma-separated allowlist string split into a list; empty string -> empty list.
  members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : []

  depends_on = [module.gcs, module.kuberay-workload-identity]
}
# Resource quota for the Ray namespace so that its workloads cannot
# over-utilize resources. Skipped when var.disable_resource_quotas is true.
resource "kubernetes_resource_quota" "ray_namespace_resource_quota" {
  count    = var.disable_resource_quotas ? 0 : 1
  provider = kubernetes.ray

  metadata {
    namespace = local.kubernetes_namespace
    name      = "ray-resource-quota"
  }

  spec {
    # Hard limits are supplied entirely by the caller.
    hard = var.resource_quotas
  }

  # The namespace must exist before a quota can be attached to it.
  depends_on = [module.namespace]
}