Skip to content

Commit

Permalink
Merge pull request #345 from DFE-Digital/889-monitor-node-pool-cpu-me…
Browse files Browse the repository at this point in the history
…mory-pressure2

Adding Azure monitor metrics to TSC repo for node availability
  • Loading branch information
shaheislamdfe authored Jan 10, 2025
2 parents 8c39bb5 + d63a505 commit 52bb85b
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 0 deletions.
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Resource group tags as a JSON object.
# NOTE(review): the consumer of RG_TAGS is not visible in this excerpt - confirm where it is used.
RG_TAGS={"Product" : "Teacher services cloud"}
# NOTE(review): presumably the version tag of a shared ARM template - confirm against its use.
ARM_TEMPLATE_TAG=1.1.0
# Full service name; used as the value of the Product tag by the action-group target.
SERVICE_NAME=teacher-services-cloud
# Short service name; used to build the resource group and action group names in the action-group target.
SERVICE_SHORT=tsc

ci:
$(eval AUTO_APPROVE=-auto-approve)
Expand Down Expand Up @@ -173,3 +175,8 @@ import-aks-resources: get-cluster-credentials
# Run the new-service template script (templates/new_service.sh) with bash.
.PHONY: new_service
new_service:
bash templates/new_service.sh

# Create the monitoring resource group and the Azure Monitor action group that
# alert notifications are emailed to.
#
# Usage: make production action-group ACTION_GROUP_EMAIL=<notification-email>
#
# Must be run before setting enable_monitoring=true.
# Use any non-prod environment to create in the test subscription.
# RESOURCE_PREFIX is not set in this target; presumably it comes from the
# environment target given on the command line (e.g. `production`) - confirm.
.PHONY: action-group
action-group: set-azure-account
	$(if $(ACTION_GROUP_EMAIL), , $(error Please specify a notification email for the action group))
	az group create -l uksouth -g "${RESOURCE_PREFIX}-${SERVICE_SHORT}-mn-rg" --tags "Product=${SERVICE_NAME}"
	az monitor action-group create -n "${RESOURCE_PREFIX}-${SERVICE_SHORT}" -g "${RESOURCE_PREFIX}-${SERVICE_SHORT}-mn-rg" --action email "${RESOURCE_PREFIX}-${SERVICE_SHORT}-email" "${ACTION_GROUP_EMAIL}"
33 changes: 33 additions & 0 deletions cluster/terraform_aks_cluster/azure_metric_alerts.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Looks up an existing Azure Monitor action group; alerts in this file send their notifications to it.
# The group is not created by this configuration. The Makefile `action-group` target creates a
# resource group and action group following the same "<prefix>-tsc-mn-rg" / "<prefix>-tsc" naming.
# NOTE(review): this lookup is unconditional, so planning will fail if the action group does not
# exist yet - confirm whether it should be gated on the enable_monitoring setting mentioned in the Makefile.
data "azurerm_monitor_action_group" "main" {
resource_group_name = local.monitoring_resource_group
name = local.monitor_action_group_name
}

# Metric alert scoped to the AKS cluster. It fires when the kube_node_status_condition metric,
# filtered to the "Ready" status, is greater than local.node_threshold, and notifies the
# action group looked up in this file.
# local.node_threshold is: apps1 max_count + default node pool node_count - local.required_available_nodes.
# NOTE(review): given that formula, "available nodes" in the description presumably means remaining
# scale-out headroom (max node count minus Ready nodes), not the number of Ready nodes - confirm wording.
resource "azurerm_monitor_metric_alert" "node_availability" {
name = "${var.resource_prefix}-tsc-${var.environment}-nodes-capacity"
resource_group_name = local.monitoring_resource_group
scopes = [azurerm_kubernetes_cluster.main.id]
description = "Action will be triggered when number of available nodes is less than ${local.required_available_nodes}"
# Evaluated every minute (frequency) over a 5-minute look-back window (window_size).
window_size = "PT5M"
frequency = "PT1M"

criteria {
metric_namespace = "microsoft.containerservice/managedclusters"
metric_name = "kube_node_status_condition"
# NOTE(review): confirm that the "Average" aggregation of this metric yields a node count
# comparable to local.node_threshold rather than a per-node value.
aggregation = "Average"
operator = "GreaterThan"
threshold = local.node_threshold

# Only count metric series whose status2 dimension is "Ready".
dimension {
name = "status2" # References the second status dimension available in the monitor metric
operator = "Include"
values = ["Ready"]
}
}

action {
action_group_id = data.azurerm_monitor_action_group.main.id
}

# Ignore drift on tags; presumably they are applied outside this configuration - confirm.
lifecycle { ignore_changes = [tags] }
}
4 changes: 4 additions & 0 deletions cluster/terraform_aks_cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,8 @@ locals {
"privatelink.redis.cache.windows.net"
]
uk_south_availability_zones = ["1", "2", "3"]
monitor_action_group_name = "${var.resource_prefix}-tsc"
monitoring_resource_group = "${var.resource_prefix}-tsc-mn-rg"
required_available_nodes = 2
node_threshold = var.node_pools["apps1"].max_count + var.default_node_pool.node_count - local.required_available_nodes
}
22 changes: 22 additions & 0 deletions documentation/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,25 @@ Following auth keys need to be stored on azure vault as a secret.
1. PROMETHEUS-AUTH
2. ALERTMANAGER-AUTH
3. THANOS-AUTH

### Azure Monitor Alerting

Azure Monitor is used to track the health and performance of the AKS clusters. The monitoring is configured through Terraform in the `azure_metric_alerts.tf` file.

#### Node Availability Monitoring

A metric alert is configured to monitor the availability of nodes in the AKS cluster:

- Alert Name: `[resource-prefix]-tsc-[environment]-nodes-capacity`
- Metric: `kube_node_status_condition`
- Evaluation: Every 1 minute over a 5-minute window
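- Threshold: Triggers when the average number of nodes in `Ready` status over the evaluation window exceeds the configured threshold. The threshold is the maximum node count (`apps1` pool `max_count` plus the default node pool `node_count`) minus the required number of spare nodes (2), so the alert fires when fewer than 2 spare nodes remain before the cluster reaches its maximum size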
- Action: Notifications are sent to the configured Azure Monitor Action Group

The alert helps ensure the cluster maintains sufficient node capacity for workloads. The action group is configured to notify the appropriate team members when node availability issues are detected.

Configuration is managed through Terraform variables:
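- The monitoring resource group and action group names are defined as locals in the cluster configuration (`variables.tf`). Terraform only looks the action group up, so it must be created beforehand with `make <environment> action-group ACTION_GROUP_EMAIL=<notification-email>`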
- The action group name follows the format `[resource-prefix]-tsc`
- Alert thresholds can be customized per environment
- The metric namespace used is `microsoft.containerservice/managedclusters`

0 comments on commit 52bb85b

Please sign in to comment.