Skip to content

Commit

Permalink
chore: More scalability for our metrics (#11732)
Browse files Browse the repository at this point in the history
This PR configures more resources for metrics
  • Loading branch information
PhilWindle authored Feb 4, 2025
1 parent 29ee9aa commit 52bbf14
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 3 deletions.
28 changes: 25 additions & 3 deletions spartan/metrics/values/prod.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
opentelemetry-collector:
replicaCount: 3
resources:
requests:
memory: 12Gi
cpu: "1.5"
nodeSelector:
node-type: infra
pool: spot
tolerations:
- key: "cloud.google.com/gke-spot"
operator: "Equal"
value: "true"
effect: "NoSchedule"
ports:
jaeger-compact:
enabled: false
Expand Down Expand Up @@ -29,14 +40,21 @@ prometheus:
server:
resources:
requests:
memory: 7Gi
cpu: 1.5
memory: 26Gi
cpu: "3.5"
nodeSelector:
node-type: infra
pool: spot
tolerations:
- key: "cloud.google.com/gke-spot"
operator: "Equal"
value: "true"
effect: "NoSchedule"

persistentVolume:
enabled: true
size: 100Gi
replicaCount: 10
replicaCount: 3
statefulSet:
enabled: true
alertmanager:
Expand All @@ -57,6 +75,10 @@ tempo:

# https://artifacthub.io/packages/helm/grafana/grafana
grafana:
resources:
requests:
memory: 5Gi
cpu: "1.5"
nodeSelector:
node-type: infra
service:
Expand Down
90 changes: 90 additions & 0 deletions spartan/terraform/gke-cluster/cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,93 @@ resource "google_container_node_pool" "spot_nodes_2core" {
auto_upgrade = false
}
}

# Autoscaling spot node pool built from 2 core high memory machines, used for metrics.
resource "google_container_node_pool" "spot_nodes_2core-highmem" {
  name     = "${var.cluster_name}-2core-highmem-spot"
  cluster  = var.cluster_name
  location = var.zone
  version  = var.node_version

  # Nodes are repaired automatically but are never upgraded automatically.
  management {
    auto_upgrade = false
    auto_repair  = true
  }

  # The pool may shrink to zero nodes and grow to at most eight.
  autoscaling {
    max_node_count = 8
    min_node_count = 0
  }

  # Per-node settings
  node_config {
    spot         = true
    machine_type = "n2-highmem-2"

    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]
    service_account = var.service_account

    tags = ["aztec-gke-node", "spot"]

    # Labels that workloads can target through a nodeSelector.
    labels = {
      node-type = "infra"
      pool      = "spot"
      local-ssd = "false"
      env       = "production"
    }

    # Taint the spot nodes; with NO_SCHEDULE, pods need a matching
    # toleration for this key/value before they can be placed here.
    taint {
      effect = "NO_SCHEDULE"
      key    = "cloud.google.com/gke-spot"
      value  = "true"
    }
  }
}

# Autoscaling spot node pool built from 4 core high memory machines, used for metrics.
resource "google_container_node_pool" "spot_nodes_4core-highmem" {
  name     = "${var.cluster_name}-4core-highmem-spot"
  cluster  = var.cluster_name
  location = var.zone
  version  = var.node_version

  # Nodes are repaired automatically but are never upgraded automatically.
  management {
    auto_upgrade = false
    auto_repair  = true
  }

  # The pool may shrink to zero nodes and grow to at most eight.
  autoscaling {
    max_node_count = 8
    min_node_count = 0
  }

  # Per-node settings
  node_config {
    spot         = true
    machine_type = "n2-highmem-4"

    oauth_scopes    = ["https://www.googleapis.com/auth/cloud-platform"]
    service_account = var.service_account

    tags = ["aztec-gke-node", "spot"]

    # Labels that workloads can target through a nodeSelector.
    labels = {
      node-type = "infra"
      pool      = "spot"
      local-ssd = "false"
      env       = "production"
    }

    # Taint the spot nodes; with NO_SCHEDULE, pods need a matching
    # toleration for this key/value before they can be placed here.
    taint {
      effect = "NO_SCHEDULE"
      key    = "cloud.google.com/gke-spot"
      value  = "true"
    }
  }
}

0 comments on commit 52bbf14

Please sign in to comment.