diff --git a/docs/eks/eks-apiserver.md b/docs/eks/eks-apiserver.md new file mode 100644 index 00000000..ec1bd52a --- /dev/null +++ b/docs/eks/eks-apiserver.md @@ -0,0 +1,23 @@ +# Monitoring EKS API server + +AWS Distro for OpenTelemetry enables EKS API server monitoring by default and provides three Grafana dashboards: + +## Kube-apiserver (basic) + +The basic dashboard shows metrics recommended in [EKS Best Practices Guides - Monitor Control Plane Metrics](https://aws.github.io/aws-eks-best-practices/reliability/docs/controlplane/#monitor-control-plane-metrics) and provides request rate and latency for API server, latency for ETCD server and overall workqueue service time and latency. It allows a drill-down per API server. + +![image](https://github.com/youwalther65/terraform-aws-observability-accelerator/assets/29410195/9dcf2583-6630-4d3c-911d-8ca48ae2d26f) + +## Kube-apiserver (advanced) + +The advanced dashboard is derived from kube-prometheus-stack "Kubernetes / API server" dashboard and provides a detailed metrics drill-down for example per READ and WRITE operations per component (like deployments, configmaps etc.). + +![image](https://github.com/youwalther65/terraform-aws-observability-accelerator/assets/29410195/e76a6357-461f-416d-8bf0-5b7777848bea) + +## Kube-apiserver (troubleshooting) + +This dashboard can be used to troubleshoot API server problems like latency, errors etc. + +A detailed description for usage and background information regarding the dashboard can be found in AWS Containers blog post [Troubleshooting Amazon EKS API servers with Prometheus](https://aws.amazon.com/blogs/containers/troubleshooting-amazon-eks-api-servers-with-prometheus/). 
+ +![image](https://github.com/youwalther65/terraform-aws-observability-accelerator/assets/29410195/921d3453-dcda-4d8a-8223-7c02f1f08ee2) diff --git a/examples/existing-cluster-with-base-and-infra/main.tf b/examples/existing-cluster-with-base-and-infra/main.tf index a4024364..eccd94a1 100644 --- a/examples/existing-cluster-with-base-and-infra/main.tf +++ b/examples/existing-cluster-with-base-and-infra/main.tf @@ -67,6 +67,9 @@ module "eks_monitoring" { # reusing existing certificate manager? defaults to true enable_cert_manager = true + # enable EKS API server monitoring + enable_apiserver_monitoring = true + # deploys external-secrets in to the cluster enable_external_secrets = true grafana_api_key = var.grafana_api_key diff --git a/mkdocs.yml b/mkdocs.yml index ddbcb4ee..bb5314e3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,7 @@ nav: - Concepts: concepts.md - Amazon EKS: - Infrastructure monitoring: eks/index.md + - EKS API server monitoring: eks/eks-apiserver.md - Multicluster monitoring: eks/multicluster.md - Java/JMX: eks/java.md - Nginx: eks/nginx.md diff --git a/modules/eks-monitoring/README.md b/modules/eks-monitoring/README.md index 5c0a8687..c502b521 100644 --- a/modules/eks-monitoring/README.md +++ b/modules/eks-monitoring/README.md @@ -71,6 +71,7 @@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [eks\_cluster\_id](#input\_eks\_cluster\_id) | EKS Cluster Id | `string` | n/a | yes | | [enable\_alerting\_rules](#input\_enable\_alerting\_rules) | Enables or disables Managed Prometheus alerting rules | `bool` | `true` | no | | [enable\_amazon\_eks\_adot](#input\_enable\_amazon\_eks\_adot) | Enables the ADOT Operator on the EKS Cluster | `bool` | `true` | no | +| [enable\_apiserver\_monitoring](#input\_enable\_apiserver\_monitoring) | Enable EKS kube-apiserver monitoring, alerting and dashboards | `bool` | `true` | no | | [enable\_cert\_manager](#input\_enable\_cert\_manager) | Allow reusing an existing 
installation of cert-manager | `bool` | `true` | no | | [enable\_custom\_metrics](#input\_enable\_custom\_metrics) | Allows additional metrics collection for config elements in the `custom_metrics_config` config object. Automatic dashboards are not included | `bool` | `false` | no | | [enable\_dashboards](#input\_enable\_dashboards) | Enables or disables curated dashboards | `bool` | `true` | no | @@ -93,6 +94,9 @@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [flux\_kustomization\_path](#input\_flux\_kustomization\_path) | Flux Kustomization Path | `string` | `"./artifacts/grafana-operator-manifests/eks/infrastructure"` | no | | [go\_config](#input\_go\_config) | Grafana Operator configuration |
object({
create_namespace = bool
helm_chart = string
helm_name = string
k8s_namespace = string
helm_release_name = string
helm_chart_version = string
})
|
{
"create_namespace": true,
"helm_chart": "oci://ghcr.io/grafana-operator/helm-charts/grafana-operator",
"helm_chart_version": "v5.0.0-rc3",
"helm_name": "grafana-operator",
"helm_release_name": "grafana-operator",
"k8s_namespace": "grafana-operator"
}
| no | | [grafana\_api\_key](#input\_grafana\_api\_key) | Grafana API key for the Amazon Managed Grafana workspace. Required if `enable_external_secrets = true` | `string` | `""` | no | +| [grafana\_apiserver\_advanced\_dashboard\_url](#input\_grafana\_apiserver\_advanced\_dashboard\_url) | Dashboard URL for Kube-apiserver (advanced) Grafana Dashboard JSON | `string` | `"https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/apiserver-advanced.json"` | no | +| [grafana\_apiserver\_basic\_dashboard\_url](#input\_grafana\_apiserver\_basic\_dashboard\_url) | Dashboard URL for Kube-apiserver (basic) Grafana Dashboard JSON | `string` | `"https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/apiserver-basic.json"` | no | +| [grafana\_apiserver\_troubleshooting\_dashboard\_url](#input\_grafana\_apiserver\_troubleshooting\_dashboard\_url) | Dashboard URL for Kube-apiserver (troubleshooting) Grafana Dashboard JSON | `string` | `"https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/apiserver-troubleshooting.json"` | no | | [grafana\_cluster\_dashboard\_url](#input\_grafana\_cluster\_dashboard\_url) | Dashboard URL for Cluster Grafana Dashboard JSON | `string` | `"https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/cluster.json"` | no | | [grafana\_kubelet\_dashboard\_url](#input\_grafana\_kubelet\_dashboard\_url) | Dashboard URL for Kubelet Grafana Dashboard JSON | `string` | `"https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/kubelet.json"` | no | | [grafana\_namespace\_workloads\_dashboard\_url](#input\_grafana\_namespace\_workloads\_dashboard\_url) | Dashboard URL for 
Namespace Workloads Grafana Dashboard JSON | `string` | `"https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/namespace-workloads.json"` | no | diff --git a/modules/eks-monitoring/dashboards.tf b/modules/eks-monitoring/dashboards.tf index f8260e3a..ad106833 100644 --- a/modules/eks-monitoring/dashboards.tf +++ b/modules/eks-monitoring/dashboards.tf @@ -37,6 +37,9 @@ spec: AMP_ENDPOINT_URL: ${var.managed_prometheus_workspace_endpoint} AMG_ENDPOINT_URL: ${var.grafana_url} GRAFANA_CLUSTER_DASH_URL: ${var.grafana_cluster_dashboard_url} + GRAFANA_APISERVER_BASIC_DASH_URL: ${var.grafana_apiserver_basic_dashboard_url} + GRAFANA_APISERVER_ADVANCED_DASH_URL: ${var.grafana_apiserver_advanced_dashboard_url} + GRAFANA_APISERVER_TROUBLESHOOTING_DASH_URL: ${var.grafana_apiserver_troubleshooting_dashboard_url} GRAFANA_KUBELET_DASH_URL: ${var.grafana_kubelet_dashboard_url} GRAFANA_NSWRKLDS_DASH_URL: ${var.grafana_namespace_workloads_dashboard_url} GRAFANA_NODEEXP_DASH_URL: ${var.grafana_node_exporter_dashboard_url} diff --git a/modules/eks-monitoring/main.tf b/modules/eks-monitoring/main.tf index 5f5dcd61..f0bb92f3 100644 --- a/modules/eks-monitoring/main.tf +++ b/modules/eks-monitoring/main.tf @@ -153,6 +153,10 @@ module "helm_addon" { name = "javaPrometheusMetricsEndpoint" value = try(var.java_config.prometheus_metrics_endpoint, local.java_pattern_config.prometheus_metrics_endpoint) }, + { + name = "enableAPIserver" + value = var.enable_apiserver_monitoring + }, { name = "enableNginx" value = var.enable_nginx diff --git a/modules/eks-monitoring/otel-config/templates/opentelemetrycollector.yaml b/modules/eks-monitoring/otel-config/templates/opentelemetrycollector.yaml index 53a2dd97..753d2639 100644 --- a/modules/eks-monitoring/otel-config/templates/opentelemetrycollector.yaml +++ b/modules/eks-monitoring/otel-config/templates/opentelemetrycollector.yaml @@ -68,24 +68,35 @@ spec: regex: (.+) 
target_label: __metrics_path__ replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor - - job_name: 'kube-admin' + + {{ if .Values.enableAPIserver }} + - job_name: 'apiserver' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - - role: node + - role: endpoints relabel_configs: - - target_label: __address__ - replacement: kubernetes.default.svc.cluster.local:443 - - action: keep - regex: $K8S_NODE_NAME - source_labels: [__meta_kubernetes_node_name] + - source_labels: + [ + __meta_kubernetes_namespace, + __meta_kubernetes_service_name, + __meta_kubernetes_endpoint_port_name, + ] + action: keep + regex: default;kubernetes;https metric_relabel_configs: - action: keep source_labels: [__name__] - regex: 'apiserver_(request_duration_seconds|storage_list_duration_seconds|admission_controller_admission_duration_seconds|flowcontrol_request_wait_duration_seconds).*|apiserver_(admission_webhook_fail_open_count|tls_handshake_errors_total|request_total)|rest_client_request_duration_seconds.*|rest_client_requests_total|etcd_(request_duration_seconds|db_total_size_in_bytes).*' + - source_labels: [__name__, le] + separator: ; + regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50) + replacement: $1 + action: drop + {{ end }} + - job_name: serviceMonitor/default/kube-prometheus-stack-prometheus-node-exporter/0 honor_timestamps: true scrape_interval: {{ .Values.globalScrapeInterval }} diff --git a/modules/eks-monitoring/otel-config/values.yaml b/modules/eks-monitoring/otel-config/values.yaml index d0f91fb6..2e489543 100644 --- a/modules/eks-monitoring/otel-config/values.yaml +++ b/modules/eks-monitoring/otel-config/values.yaml @@ -6,6 +6,8 @@ accountId: ${account_id} globalScrapeTimeout: ${global_scrape_timeout} globalScrapeSampleLimit: 
${global_scrape_sample_limit} +enableAPIserver: ${enable_apiserver_monitoring} + enableTracing: ${enable_tracing} otlpGrpcEndpoint: ${otlp_grpc_endpoint} otlpHttpEndpoint: ${otlp_http_endpoint} diff --git a/modules/eks-monitoring/rules.tf b/modules/eks-monitoring/rules.tf index 9bf3dfa3..6969aafa 100644 --- a/modules/eks-monitoring/rules.tf +++ b/modules/eks-monitoring/rules.tf @@ -238,5 +238,119 @@ groups: expr: max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="Job"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: job + - name: infra-rules-05 + rules: + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h])) + record: 
cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h + - expr: sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[30d]) + * 24 * 30) + record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d + - expr: |- + 1 - ( + ( + # write too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + ( + # read too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + ) + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d) + labels: + verb: all + record: apiserver_request:availability30d + - expr: |- + 1 - ( + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + # too slow + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) 
(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) + labels: + verb: read + record: apiserver_request:availability30d + - expr: |- + 1 - ( + ( + # too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: apiserver_request:availability30d + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) + > 0 + labels: + quantile: "0.99" + verb: read + record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) + > 0 + labels: + quantile: "0.99" + verb: write + record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile EOF } 
diff --git a/modules/eks-monitoring/variables.tf b/modules/eks-monitoring/variables.tf index d51b24a6..1f24eb0a 100644 --- a/modules/eks-monitoring/variables.tf +++ b/modules/eks-monitoring/variables.tf @@ -201,6 +201,12 @@ variable "prometheus_config" { nullable = false } +variable "enable_apiserver_monitoring" { + description = "Enable EKS kube-apiserver monitoring, alerting and dashboards" + type = bool + default = true +} + variable "enable_tracing" { description = "Enables tracing with OTLP traces receiver to X-Ray" type = bool @@ -440,6 +446,24 @@ variable "grafana_cluster_dashboard_url" { default = "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/cluster.json" } +variable "grafana_apiserver_basic_dashboard_url" { + description = "Dashboard URL for Kube-apiserver (basic) Grafana Dashboard JSON" + type = string + default = "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/apiserver-basic.json" +} + +variable "grafana_apiserver_advanced_dashboard_url" { + description = "Dashboard URL for Kube-apiserver (advanced) Grafana Dashboard JSON" + type = string + default = "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/apiserver-advanced.json" +} + +variable "grafana_apiserver_troubleshooting_dashboard_url" { + description = "Dashboard URL for Kube-apiserver (troubleshooting) Grafana Dashboard JSON" + type = string + default = "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/apiserver-troubleshooting.json" +} + variable "grafana_kubelet_dashboard_url" { description = "Dashboard URL for Kubelet Grafana Dashboard JSON" type = string