diff --git a/docs/admin/observability/collecting-metrics/collecting-metrics.md b/docs/admin/observability/collecting-metrics/collecting-metrics.md index 5f64c1aef28..62713827baa 100644 --- a/docs/admin/observability/collecting-metrics/collecting-metrics.md +++ b/docs/admin/observability/collecting-metrics/collecting-metrics.md @@ -1,6 +1,68 @@ -# Collecting Metrics with OpenTelemetry +# Collecting Metrics in Knative -You can set up the [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) to receive metrics from Knative components and distribute them to Prometheus. +Knative supports different popular tools for collecting metrics: +- [Prometheus](https://prometheus.io/) +- [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) + +[Grafana](https://grafana.com/oss/) dashboards are available for metrics collected directly with Prometheus. + +You can also set up the OpenTelemetry Collector to receive metrics from Knative components and distribute them to other metrics providers that support OpenTelemetry. + +## About Prometheus + +[Prometheus](https://prometheus.io/) is an open-source tool for collecting and +aggregating time series metrics, and for alerting. When Prometheus is used, it can also scrape the OpenTelemetry Collector demonstrated below. + +## Setting up Prometheus + +1. Install the [Prometheus Operator](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) by using [Helm](https://helm.sh/docs/intro/using_helm/): + + ```bash + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo update + helm install prometheus prometheus-community/kube-prometheus-stack -n default -f values.yaml + # values.yaml contains at minimum the configuration below + ``` + + !!! caution + You will need to ensure that the Helm chart has the following values configured, otherwise the ServiceMonitors/PodMonitors will not work.
+ ```yaml + kube-state-metrics: + metricLabelsAllowlist: + - pods=[*] + - deployments=[app.kubernetes.io/name,app.kubernetes.io/component,app.kubernetes.io/instance] + prometheus: + prometheusSpec: + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false + ``` + +1. Apply the ServiceMonitors/PodMonitors to collect metrics from Knative. + + ```bash + kubectl apply -f https://raw.githubusercontent.com/knative-sandbox/monitoring/main/servicemonitor.yaml + ``` +1. Grafana dashboards can be imported from https://github.com/knative-sandbox/monitoring/tree/main/grafana. + +1. If you are using the Grafana Helm chart with the dashboard sidecar configured, you can load the dashboards by applying the following ConfigMap: + + ```bash + kubectl apply -f https://raw.githubusercontent.com/knative-sandbox/monitoring/main/grafana/dashboards.yaml + ``` + +### Access the Prometheus instance locally + +By default, the Prometheus instance is only exposed on a private service named `prometheus-operated`. + +To access the console in your web browser: + +1. Enter the command: + + ```bash + kubectl port-forward -n default svc/prometheus-operated 9090 + ``` + +1. Access the console in your browser via `http://localhost:9090`. ## About OpenTelemetry @@ -19,6 +81,9 @@ In the following example, you can configure a single collector instance using a !!! tip For more complex deployments, you can automate some of these steps by using the [OpenTelemetry Operator](https://github.com/open-telemetry/opentelemetry-operator). +!!! caution + The Grafana dashboards at https://github.com/knative-sandbox/monitoring/tree/main/grafana don't work with metrics scraped from the OpenTelemetry Collector. + ![Diagram of components reporting to collector, which is scraped by Prometheus](system-diagram.svg) - -### Make the Prometheus instance public - -By default, the Prometheus instance is only exposed on a private service named `prometheus-operated`. 
- -To access the console in your web browser: - -1. Enter the command: - - ```bash - kubectl port-forward --namespace metrics service/prometheus-operated 9090 - ``` - -1. Access the console in your browser via `http://localhost:9090`. diff --git a/docs/admin/observability/collecting-metrics/prometheus.yaml b/docs/admin/observability/collecting-metrics/prometheus.yaml deleted file mode 100644 index 3b6695daac7..00000000000 --- a/docs/admin/observability/collecting-metrics/prometheus.yaml +++ /dev/null @@ -1,144 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prometheus - namespace: metrics ---- -# Note: For general cluster use, you may want to use a ClusteRole and -# ClusterRoleBinding to grant Prometheus the ability to list all services and -# pods in the cluster. For this use case, we only need to grant access to the -# same namespace, and can use a Role and RoleBinding. -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: watch-services-and-pods - namespace: metrics -rules: -- apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: prom-watch-services-and-pods - namespace: metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: watch-services-and-pods -subjects: - - kind: ServiceAccount - name: prometheus ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prom-config - namespace: metrics -data: - prometheus.yaml: | - global: - scrape_interval: 30s - scrape_timeout: 10s - evaluation_interval: 30s - - rule_files: - - /etc/prometheus/config/prometheus-rules-*.yaml - - scrape_configs: - - job_name: otel-collector - honor_labels: true - honor_timestamps: true - metrics_path: /metrics - # Note that we *don't want* to use relabel to collect labels here, - # because these are the labels of the opentelemetry collector. 
- relabel_configs: - - action: keep - source_labels: [__meta_kubernetes_service_label_app] - regex: otel-export - - action: keep - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: prom-export - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - metrics - prometheus-rules-example.yaml: | - groups: - - name: example - rules: - - record: pod:http_requests:irate5m - expr: label_replace(rate(knative_dev_internal_serving_revision_app_request_latencies_count[5m]), "service", "$1", "pod_name", "(.*)-deployment-.+-.+") - - record: service:http_requests:irate5m - expr: sum(pod:http_requests:irate5m) by (service) - - record: pod:http_latency:buckets5m - expr: sum(label_replace(rate(knative_dev_internal_serving_revision_app_request_latencies_bucket[5m]), "service", "$1", "pod_name", "(.*)-deployment-.+-.+")) by (pod_name,service,le) - - record: service:http_latency:buckets5m - expr: sum by (service,le)(pod:http_latency:buckets5m) / ignoring(le) group_left service:http_requests:irate5m ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus - namespace: metrics -spec: - selector: - matchLabels: - app: prometheus - replicas: 1 # Each replica will hold all data in memory. - template: - metadata: - labels: - app: prometheus - spec: - containers: - - name: prometheus - image: quay.io/prometheus/prometheus - args: - - --config.file=/etc/prometheus/config/prometheus.yaml - - --storage.tsdb.path=/prometheus - - --storage.tsdb.retention.time=24h - - --storage.tsdb.no-lockfile - - --web.console.templates=/etc/prometheus/consoles - - --web.console.libraries=/etc/prometheus/console_libraries - - --web.enable-admin-api - - --web.enable-lifecycle - - --web.route-prefix=/ - resources: - # This is a small sizing; adjust as needed for your environment. 
- requests: - memory: 200Mi - cpu: 50m - ports: - - name: ui - containerPort: 9090 - volumeMounts: - - name: config - mountPath: etc/prometheus/config - - name: prometheus-emptydir - mountPath: /prometheus - volumes: - - name: config - configMap: - name: prom-config - - name: prometheus-emptydir - emptyDir: {} ---- -apiVersion: v1 -kind: Service -metadata: - name: prometheus - namespace: metrics -spec: - selector: - app: prometheus - ports: - - name: ui - port: 9090 - targetPort: 9090