QSS default to AP cluster and Mistral fix cherry pick #619

Closed · wants to merge 3 commits
7 changes: 4 additions & 3 deletions applications/jupyter/README.md
@@ -20,7 +20,8 @@ This module deploys the following resources, once per user:
- iap.googleapis.com (required when using authentication with Identity Aware Proxy)

2. A functional GKE cluster.
- To create a new standard or autopilot cluster, follow the instructions under `infrastructure/README.md`
- To create a new standard or autopilot cluster, follow the instructions in [`infrastructure/README.md`](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/infrastructure/README.md)
- Alternatively, you can set the `create_cluster` variable to true in `workloads.tfvars` to provision a new GKE cluster. This defaults to creating a GKE Autopilot cluster; to provision a standard cluster instead, also set `autopilot_cluster` to false (see the sketch after this list).

3. This module is configured to use Identity Aware Proxy (IAP) as default authentication method for JupyterHub. It expects the brand & the OAuth consent configured in your org. You can check the details here: [OAuth consent screen](https://console.cloud.google.com/apis/credentials/consent)

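To make the alternative in step 2 concrete, here is a minimal sketch of the relevant `workloads.tfvars` settings for provisioning a standard (non-Autopilot) cluster; the variable names come from this module's `workloads.tfvars`, while the project and cluster values are placeholders:

```
project_id       = "my-project"      # placeholder; use your GCP project ID
cluster_name     = "ml-cluster"
cluster_location = "us-central1"

create_cluster    = true   # let this module provision the cluster
autopilot_cluster = false  # default is true (Autopilot); false gives a standard cluster
```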
@@ -116,7 +117,7 @@ gcloud auth application-default login
1. Extract the randomly generated password for JupyterHub login

```
terraform output password
terraform output jupyterhub_password
```

2. Setup port forwarding for the frontend: `kubectl port-forward service/proxy-public -n <namespace> 8081:80 &`, and open `localhost:8081` in a browser.
@@ -192,4 +193,4 @@ This module uses `<ip>.nip.io` as the domain name with a global static ipv4 addr

## Additional Information

For more information about JupyterHub profiles and the preset profiles visit [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/jupyter/profiles.md)
For more information about JupyterHub profiles and the preset profiles visit [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/jupyter/profiles.md)
2 changes: 1 addition & 1 deletion applications/jupyter/variables.tf
@@ -150,7 +150,7 @@ variable "private_cluster" {

variable "autopilot_cluster" {
type = bool
default = false
default = true
}

variable "cpu_pools" {
15 changes: 10 additions & 5 deletions applications/jupyter/workloads.tfvars
@@ -13,13 +13,18 @@
# limitations under the License.

##common variables
## Need to pull these variables from tf output from previous platform stage
## Need to pull these variables from tf output from previous infrastructure stage
project_id = "<your project ID>"

## this is required for terraform to connect to GKE master and deploy workloads
create_cluster = false # this flag will create a new standard public gke cluster in default network
cluster_name = "ml-cluster"
cluster_location = "us-central1"
## This is required for terraform to connect to GKE cluster and deploy workloads.
cluster_name = "ml-cluster"
cluster_location = "us-central1"

## If terraform should create a new GKE cluster, fill in this section as well.
## By default, a public autopilot GKE cluster will be created in the default network.
## Set the autopilot_cluster variable to false to create a standard cluster instead.
create_cluster = false
autopilot_cluster = true
cluster_membership_id = "" # required for private cluster, defaults to `cluster_name`

#######################################################
4 changes: 2 additions & 2 deletions applications/rag/variables.tf
@@ -378,7 +378,7 @@ variable "gpu_pools" {
max_count = optional(number, 3)
local_ssd_count = optional(number, 0)
spot = optional(bool, false)
disk_size_gb = optional(number, 100)
disk_size_gb = optional(number, 200)
disk_type = optional(string, "pd-standard")
image_type = optional(string, "COS_CONTAINERD")
enable_gcfs = optional(bool, false)
@@ -399,7 +399,7 @@
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_size_gb = 200
disk_type = "pd-balanced"
enable_gcfs = true
accelerator_count = 2
58 changes: 47 additions & 11 deletions applications/ray/README.md
@@ -3,22 +3,58 @@
This repository contains a Terraform template for running [Ray](https://www.ray.io/) on Google Kubernetes Engine.
See the [Ray on GKE](/ray-on-gke/) directory for additional guides and references.

## Prerequisites

1. GCP Project with following APIs enabled
- container.googleapis.com
- iap.googleapis.com (required when using authentication with Identity Aware Proxy)

2. A functional GKE cluster.
- To create a new standard or autopilot cluster, follow the instructions in [`infrastructure/README.md`](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/infrastructure/README.md)
- Alternatively, you can set the `create_cluster` variable to true in `workloads.tfvars` to provision a new GKE cluster. This will default to creating a GKE Autopilot cluster; if you want to provision a standard cluster you must also set `autopilot_cluster` to false.

3. This module is configured to optionally use Identity Aware Proxy (IAP) to protect access to the Ray dashboard. It expects the brand & the OAuth consent configured in your org. You can check the details here: [OAuth consent screen](https://console.cloud.google.com/apis/credentials/consent)

4. Preinstall the following on your computer:
* Terraform
* Gcloud CLI

## Installation

Preinstall the following on your computer:
* Terraform
* Gcloud
### Configure Inputs

> **_NOTE:_** Terraform keeps state metadata in a local file called `terraform.tfstate`. Deleting the file may cause some resources to not be cleaned up correctly even if you delete the cluster. We suggest running `terraform destroy` before reapplying/reinstalling.
1. If needed, clone the repo
```
git clone https://github.com/GoogleCloudPlatform/ai-on-gke
cd ai-on-gke/applications/ray
```

1. If needed, git clone https://github.com/GoogleCloudPlatform/ai-on-gke
2. Edit `workloads.tfvars` with your GCP settings.

**Important Note:**
If using this with the Jupyter module (`applications/jupyter/`), it is recommended to use the same k8s namespace
for both, i.e., set this to the same namespace as in `applications/jupyter/workloads.tfvars`.

| Variable | Description | Required |
|-----------------------------|----------------------------------------------------------------------------------------------------------------|:--------:|
| project_id | GCP Project Id | Yes |
| cluster_name | GKE Cluster Name | Yes |
| cluster_location | GCP Region | Yes |
| kubernetes_namespace        | The namespace that Ray and the rest of the resources will be installed in.                                      | Yes      |
| gcs_bucket | GCS bucket to be used for Ray storage | Yes |
| create_service_account | Create service accounts used for Workload Identity mapping | Yes |

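Putting the table together, a filled-in `workloads.tfvars` might look like the following sketch (all values are placeholders; `gcs_bucket` must be globally unique):

```
project_id             = "my-project"
cluster_name           = "ml-cluster"
cluster_location       = "us-central1"
# Match the namespace in applications/jupyter/workloads.tfvars if combining modules.
kubernetes_namespace   = "ml"
gcs_bucket             = "my-project-ray-bucket"
create_service_account = true
```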

### Install

> **_NOTE:_** Terraform keeps state metadata in a local file called `terraform.tfstate`. Deleting the file may cause some resources to not be cleaned up correctly even if you delete the cluster. We suggest running `terraform destroy` before reapplying/reinstalling.

2. `cd applications/ray`
3. Ensure your gcloud application default credentials are in place.
```
gcloud auth application-default login
```

3. Find the name and location of the GKE cluster you want to use.
Run `gcloud container clusters list --project=<your GCP project>` to see all the available clusters.
_Note: If you created the GKE cluster via the infrastructure repo, you can get the cluster info from `platform.tfvars`_
4. Run `terraform init`

4. Edit `workloads.tfvars` with your environment specific variables and configurations.
5. Run `terraform apply --var-file=./workloads.tfvars`.

5. Run `terraform init && terraform apply --var-file workloads.tfvars`
4 changes: 2 additions & 2 deletions applications/ray/variables.tf
@@ -39,7 +39,7 @@ variable "ray_version" {
variable "kubernetes_namespace" {
type = string
description = "Kubernetes namespace where resources are deployed"
default = "myray"
default = "ml"
}

variable "enable_grafana_on_ray_dashboard" {
@@ -105,7 +105,7 @@ variable "private_cluster" {

variable "autopilot_cluster" {
type = bool
default = false
default = true
}

variable "cpu_pools" {
9 changes: 7 additions & 2 deletions applications/ray/workloads.tfvars
@@ -17,11 +17,16 @@
## Need to pull these variables from tf output from previous platform stage
project_id = "<your project ID>"

## this is required for terraform to connect to GKE master and deploy workloads
create_cluster = false # this flag will create a new standard public gke cluster in default network
## This is required for terraform to connect to GKE cluster and deploy workloads.
cluster_name = "<cluster name>"
cluster_location = "us-central1"

## If terraform should create a new GKE cluster, fill in this section as well.
## By default, a public autopilot GKE cluster will be created in the default network.
## Set the autopilot_cluster variable to false to create a standard cluster instead.
create_cluster = false
autopilot_cluster = true

#######################################################
#### APPLICATIONS
#######################################################
7 changes: 4 additions & 3 deletions infrastructure/README.md
@@ -1,6 +1,6 @@
# Setup Infra

Platform module (to be renamed to Infra), creates the GKE cluster & other related resources for the AI applications / workloads to be deployed on them.
The infrastructure module creates the GKE cluster and other related resources for the AI applications / workloads to be deployed on them.

1) Update the ```platform.tfvars``` file with the required configuration. Refer to ```tfvars_examples``` for sample configurations; a rough sketch follows below.

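As a rough sketch only (the authoritative variable set lives in this module's `variables.tf` and in `tfvars_examples`; the names below are assumed from the application modules and the values are placeholders), a minimal `platform.tfvars` might look like:

```
project_id       = "my-project"
cluster_name     = "ml-cluster"
cluster_location = "us-central1"
# The default configuration creates a private GKE cluster with internal endpoints.
private_cluster   = true
autopilot_cluster = true
```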
@@ -21,12 +21,13 @@ Following service APIs are enabled,

if not already enabled, use the following command:
```
gcloud services enable container.googleapis.com gkehub.googleapis.com
gcloud services enable container.googleapis.com gkehub.googleapis.com \
servicenetworking.googleapis.com cloudresourcemanager.googleapis.com
```
## Network Connectivity

### Private GKE Cluster with internal endpoint
Default config in ```platform.tfvars``` creates a private GKE cluster with internal endpoints & cluster is added to project-scoped Anthos fleet.
The default configuration in ```platform.tfvars``` creates a private GKE cluster with internal endpoints and adds the cluster to a project-scoped Anthos fleet.
For admin access to the cluster, Anthos Connect Gateway is used.

### Private GKE Cluster with external endpoint
@@ -58,7 +58,7 @@ gpu_pools = [{
min_count = 2
max_count = 3
accelerator_count = 2
disk_size_gb = 100
disk_size_gb = 200
enable_gcfs = true
logging_variant = "DEFAULT"
disk_type = "pd-balanced"
33 changes: 31 additions & 2 deletions tutorials-and-examples/hf-tgi/main.tf
@@ -47,6 +47,9 @@ resource "kubernetes_service" "inference_service" {
}

resource "kubernetes_deployment" "inference_deployment" {
timeouts {
create = "30m"
}
metadata {
name = "mistral-7b-instruct"
namespace = var.namespace
@@ -56,7 +59,11 @@
}

spec {
replicas = 1
# It takes more than 10m for the deployment to become ready on an Autopilot cluster.
# Set the progress deadline to 30m so the deployment controller does not
# consider the deployment failed.
progress_deadline_seconds = 1800
replicas = 1

selector {
match_labels = merge({
@@ -72,6 +79,15 @@
}

spec {
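# The init container below (added in this PR) copies the published
# Mistral-7B-Instruct weights from a public GCS bucket into the shared
# "model-storage" emptyDir volume, so the TGI container can load the model
# from local disk instead of downloading it from the Hugging Face Hub.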
init_container {
name = "download-model"
image = "google/cloud-sdk:473.0.0-alpine"
command = ["gsutil", "cp", "-r", "gs://vertex-model-garden-public-us/mistralai/Mistral-7B-Instruct-v0.1/", "/model-data/"]
volume_mount {
mount_path = "/model-data"
name = "model-storage"
}
}
container {
image = "ghcr.io/huggingface/text-generation-inference:1.1.0"
name = "mistral-7b-instruct"
@@ -82,9 +98,11 @@
protocol = "TCP"
}

args = ["--model-id", "$(MODEL_ID)"]

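# MODEL_ID now points at the local path populated by the download-model
# init container, rather than at a Hugging Face Hub model ID.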
env {
name = "MODEL_ID"
value = "mistralai/Mistral-7B-Instruct-v0.1"
value = "/model/Mistral-7B-Instruct-v0.1"
}

env {
@@ -118,6 +136,12 @@ resource "kubernetes_deployment" "inference_deployment" {
name = "data"
}

volume_mount {
mount_path = "/model"
name = "model-storage"
read_only = true
}

#liveness_probe {
#http_get {
#path = "/"
@@ -146,6 +170,11 @@
empty_dir {}
}

volume {
name = "model-storage"
empty_dir {}
}

node_selector = merge({
"cloud.google.com/gke-accelerator" = "nvidia-l4"
}, var.autopilot_cluster ? {