chore: More scalability for our metrics (#11732)

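This PR configures more resources for the metrics stack: it sets resource requests and replica counts for the OpenTelemetry collector, Prometheus and Grafana, schedules the collector and Prometheus onto spot nodes, and adds two high-memory spot node pools for them.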
AztecProtocol · Feb 4, 2025 · 52bbf14 · 52bbf14
1 parent 29ee9aa
commit 52bbf14
Show file tree

Hide file tree

Showing 2 changed files with 115 additions and 3 deletions.
diff --git a/spartan/metrics/values/prod.yaml b/spartan/metrics/values/prod.yaml
@@ -1,6 +1,17 @@
 opentelemetry-collector:
+  replicaCount: 3
+  resources:
+    requests:
+      memory: 12Gi
+      cpu: "1.5"
   nodeSelector:
     node-type: infra
+    pool: spot
+  tolerations:
+    - key: "cloud.google.com/gke-spot"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
   ports:
     jaeger-compact:
       enabled: false
@@ -29,14 +40,21 @@ prometheus:
   server:
     resources:
       requests:
-        memory: 7Gi
-        cpu: 1.5
+        memory: 26Gi
+        cpu: "3.5"
     nodeSelector:
       node-type: infra
+      pool: spot
+    tolerations:
+      - key: "cloud.google.com/gke-spot"
+        operator: "Equal"
+        value: "true"
+        effect: "NoSchedule"
+
     persistentVolume:
       enabled: true
       size: 100Gi
-    replicaCount: 10
+    replicaCount: 3
     statefulSet:
       enabled: true
   alertmanager:
@@ -57,6 +75,10 @@ tempo:
 
 # https://artifacthub.io/packages/helm/grafana/grafana
 grafana:
+  resources:
+    requests:
+      memory: 5Gi
+      cpu: "1.5"
   nodeSelector:
     node-type: infra
   service:

diff --git a/spartan/terraform/gke-cluster/cluster/main.tf b/spartan/terraform/gke-cluster/cluster/main.tf
@@ -273,3 +273,93 @@ resource "google_container_node_pool" "spot_nodes_2core" {
     auto_upgrade = false
   }
 }
+
+# Autoscaled spot node pool of 2-core high-memory (n2-highmem-2) machines, labelled node-type=infra / pool=spot for the metrics workloads
+resource "google_container_node_pool" "spot_nodes_2core-highmem" {
+  name     = "${var.cluster_name}-2core-highmem-spot"
+  location = var.zone
+  cluster  = var.cluster_name
+  version  = var.node_version
+  # Autoscale between 0 and 8 nodes; the minimum of 0 permits an empty pool
+  autoscaling {
+    min_node_count = 0
+    max_node_count = 8
+  }
+
+  # Per-node settings: machine type, spot provisioning, service account, labels, network tags and taint
+  node_config {
+    machine_type = "n2-highmem-2"
+    spot         = true
+
+    service_account = var.service_account
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/cloud-platform"
+    ]
+
+    labels = {
+      env       = "production"
+      pool      = "spot"
+      local-ssd = "false"
+      node-type = "infra"
+    }
+    tags = ["aztec-gke-node", "spot"]
+
+    # Taint the nodes so that only pods tolerating cloud.google.com/gke-spot=true (NoSchedule) are placed here
+    taint {
+      key    = "cloud.google.com/gke-spot"
+      value  = "true"
+      effect = "NO_SCHEDULE"
+    }
+  }
+
+  # Node management: auto-repair on, auto-upgrade off (the node version is set explicitly from var.node_version)
+  management {
+    auto_repair  = true
+    auto_upgrade = false
+  }
+}
+
+# Autoscaled spot node pool of 4-core high-memory (n2-highmem-4) machines, labelled node-type=infra / pool=spot for the metrics workloads
+resource "google_container_node_pool" "spot_nodes_4core-highmem" {
+  name     = "${var.cluster_name}-4core-highmem-spot"
+  location = var.zone
+  cluster  = var.cluster_name
+  version  = var.node_version
+  # Autoscale between 0 and 8 nodes; the minimum of 0 permits an empty pool
+  autoscaling {
+    min_node_count = 0
+    max_node_count = 8
+  }
+
+  # Per-node settings: machine type, spot provisioning, service account, labels, network tags and taint
+  node_config {
+    machine_type = "n2-highmem-4"
+    spot         = true
+
+    service_account = var.service_account
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/cloud-platform"
+    ]
+
+    labels = {
+      env       = "production"
+      pool      = "spot"
+      local-ssd = "false"
+      node-type = "infra"
+    }
+    tags = ["aztec-gke-node", "spot"]
+
+    # Taint the nodes so that only pods tolerating cloud.google.com/gke-spot=true (NoSchedule) are placed here
+    taint {
+      key    = "cloud.google.com/gke-spot"
+      value  = "true"
+      effect = "NO_SCHEDULE"
+    }
+  }
+
+  # Node management: auto-repair on, auto-upgrade off (the node version is set explicitly from var.node_version)
+  management {
+    auto_repair  = true
+    auto_upgrade = false
+  }
+}