Merge pull request #54 from stakater/alerting

Add alerting and monitoring
stakater · May 24, 2023 · da18fe8 · da18fe8
2 parents e39b6c3 + 2b8fb30
commit da18fe8
Show file tree

Hide file tree

Showing 8 changed files with 407 additions and 1 deletion.
diff --git a/content/for-developers/add-grafana-dashboard-for-your-application.md b/content/for-developers/add-grafana-dashboard-for-your-application.md
@@ -1 +1,176 @@
-# Add Grafana dashboard for your Application
+# Create Grafana Dashboard Guide
+
+This document explains how to create a Grafana dashboard via a GrafanaDashboard CR (Custom Resource), powered by the Grafana operator. This way, there is no need to configure or import the dashboard via the web UI. Developers can ship new dashboards in any namespace and deploy them via GitOps. If the dashboard definition (JSON) is invalid, the dashboard will not appear in the Grafana web UI.
+
+## Prerequisite
+
+- Grafana should be up and running. You should be able to find it under the Workload Monitoring category in your Forecastle link.
+
+## Instructions
+
+1. Create GrafanaDashboard definition
+
+   You can choose any existing namespace. The label `grafanaDashboard: grafana-operator` is required for the Grafana Operator to discover the dashboard. The JSON string with the dashboard contents is placed in the `json` section. Check the [official documentation](https://grafana.com/docs/reference/dashboard/#dashboard-json) for details on the JSON model. You can create a dashboard via the web UI first, and then export the dashboard JSON to define a GrafanaDashboard CR (Custom Resource). If the JSON is invalid, the dashboard will not appear in the Grafana web UI. Below is a sample GrafanaDashboard CR in YAML.
+
+    ```yaml
+    apiVersion: integreatly.org/v1alpha1
+    kind: GrafanaDashboard
+    metadata:
+      name: grafana-dashboard-example
+      namespace: test-ns
+      labels:
+        grafanaDashboard: grafana-operator
+    spec:
+      json: |-
+        {
+          "annotations": {
+            "list": [
+              {
+                "builtIn": 1,
+                "datasource": "-- Grafana --",
+                "enable": true,
+                "hide": true,
+                "iconColor": "rgba(0, 211, 255, 1)",
+                "name": "Annotations & Alerts",
+                "type": "dashboard"
+              }
+            ]
+          },
+          "editable": true,
+          "gnetId": null,
+          "graphTooltip": 0,
+          "id": null,
+          "links": [],
+          "panels": [
+            {
+              "aliasColors": {},
+              "bars": false,
+              "dashLength": 10,
+              "dashes": false,
+              "datasource": null,
+              "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+              },
+              "fill": 1,
+              "fillGradient": 0,
+              "gridPos": {
+                "h": 9,
+                "w": 12,
+                "x": 0,
+                "y": 0
+              },
+              "hiddenSeries": false,
+              "id": 2,
+              "legend": {
+                "avg": false,
+                "current": false,
+                "max": false,
+                "min": false,
+                "show": true,
+                "total": false,
+                "values": false
+              },
+              "lines": true,
+              "linewidth": 1,
+              "nullPointMode": "null",
+              "options": {
+                "alertThreshold": true
+              },
+              "percentage": false,
+              "pluginVersion": "7.5.11",
+              "pointradius": 2,
+              "points": false,
+              "renderer": "flot",
+              "seriesOverrides": [],
+              "spaceLength": 10,
+              "stack": false,
+              "steppedLine": false,
+              "thresholds": [],
+              "timeFrom": null,
+              "timeRegions": [],
+              "timeShift": null,
+              "title": "Panel Title",
+              "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+              },
+              "type": "graph",
+              "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+              },
+              "yaxes": [
+                {
+                  "format": "short",
+                  "label": null,
+                  "logBase": 1,
+                  "max": null,
+                  "min": null,
+                  "show": true
+                },
+                {
+                  "format": "short",
+                  "label": null,
+                  "logBase": 1,
+                  "max": null,
+                  "min": null,
+                  "show": true
+                }
+              ],
+              "yaxis": {
+                "align": false,
+                "alignLevel": null
+              }
+            }
+          ],
+          "schemaVersion": 27,
+          "style": "dark",
+          "tags": [],
+          "templating": {
+            "list": []
+          },
+          "time": {
+            "from": "now-6h",
+            "to": "now"
+          },
+          "timepicker": {},
+          "timezone": "",
+          "title": "New dashboard",
+          "uid": null,
+          "version": 0
+        }
+    ```
+
+1. Apply the GrafanaDashboard CR to your cluster, either manually or via GitOps
+
+1. View the dashboard via Grafana web UI
+
+    1. Under `stakater-workload-monitoring` project, find the URL to Grafana through Cluster menu [Networking]->[Routes]
+    1. Open Grafana web UI, and go to [Dashboards]->[Manage] to view Dashboards management page
+       ![Grafana-menu](./images/grafana-menu.png)
+
+    1. Your dashboard will be put in a folder named after the namespace that you specified in GrafanaDashboard definition. The following is an example. ![Grafana-dashboards-management](./images/grafana-dashboards-management.png)
+
+## Application chart
+
+A template for installing a GrafanaDashboard is provided by the [application chart](https://github.com/stakater-charts/application). You can specify the configuration via `values.yaml` and install dashboards on the cluster. The following is an example.
+
+```yaml
+grafanaDashboard:
+  enabled: true
+  additionalLabels:
+    test-label: chart
+  annotations: 
+    test-annoation: chart
+  contents:
+    dashboard-test-name-1: 
+      json: |-
+        {
+          ...
+        }
+```
diff --git a/content/for-developers/enable-alerts-for-your-application.md b/content/for-developers/enable-alerts-for-your-application.md
@@ -1 +1,120 @@
 # Enable alerts for your Application
+
+Now that we have enabled metrics for our application in the previous section, let's create alerts for it.
+
+Prometheus scrapes metrics endpoints through ServiceMonitor resources.
+
+## Metrics endpoints are scraped via ServiceMonitor by Prometheus
+
+The Prometheus Operator includes a Custom Resource Definition that allows the definition of a ServiceMonitor. The ServiceMonitor is used to define an application you wish to scrape metrics from. The controller acts on the ServiceMonitors we define and automatically builds the required Prometheus configuration.
+
+Example ServiceMonitor:
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: example-svc-monitor
+  namespace: example-namespace
+spec:
+  endpoints:
+  - interval: 30s
+    path: /metrics
+    port: metrics
+  selector:
+    matchLabels:
+      app: example-svc-label
+```
+
+### Defining PrometheusRule CustomResource
+
+A PrometheusRule CustomResource defines rules that generate an alert when a metric value falls below or rises above a certain threshold (depending on the use case).
+
+The template for the file is as follows:
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    prometheus: stakater-workload-monitoring
+    role: alert-rules
+  name: <NAME_OF_PROMETHEUSRULE>
+  namespace: stakater-workload-monitoring
+spec:
+  groups:
+  - name: <GROUP_NAME> 
+    rules:
+    - alert: <ALERT_NAME>
+      annotations:
+        message: >-
+          <MESSAGE_TO_BE_DISPLAYED>
+      expr: | 
+          <EXPRESSION_TO_BE_EVALUATED_FOR_ALERT>
+      labels:
+        severity: <SEVERITY>
+        namespace: < NAME_OF_NAMESPACE >
+```
+
+The following example shows alerts for PersistentVolumes based on the metrics scraped from kubelets:
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ labels:
+   prometheus: stakater-workload-monitoring
+   role: alert-rules
+ name: prometheus-workload-rules
+ namespace: stakater-workload-monitoring
+spec:
+ groups:
+   - name: kubernetes-storage
+     rules:
+       - alert: KubePersistentVolumeUsageCritical
+         annotations:
+           message: >-
+             The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
+             }} in Namespace {{ $labels.namespace }} is only {{ $value |
+             humanizePercentage }} free.
+         expr: >-
+           kubelet_volume_stats_available_bytes{namespace!~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}
+             /
+           kubelet_volume_stats_capacity_bytes{namespace!~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}
+             < 0.03
+         for: 1m
+         labels:
+           severity: critical
+       - alert: KubePersistentVolumeFullInFourDays
+         annotations:
+           message: >-
+             Based on recent sampling, the PersistentVolume claimed by {{
+             $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace
+             }} is expected to fill up within four days. Currently {{ $value |
+             humanizePercentage }} is available.
+         expr: >-
+           (
+             kubelet_volume_stats_available_bytes{namespace!~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}
+               /
+             kubelet_volume_stats_capacity_bytes{namespace!~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}
+           ) < 0.15
+ 
+           and
+
+           predict_linear(kubelet_volume_stats_available_bytes{namespace!~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}[6h],
+           4 * 24 * 3600) < 0
+         for: 1h
+         labels:
+           severity: critical
+       - alert: KubePersistentVolumeErrors
+         annotations:
+           message: >-
+             The persistent volume {{ $labels.persistentvolume }} has status {{
+             $labels.phase }}.
+         expr: >-
+          kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace!~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
+           > 0
+         for: 5m
+         labels:
+           severity: critical
+```
diff --git a/content/for-developers/enable-logging-for-your-application.md b/content/for-developers/enable-logging-for-your-application.md
@@ -1 +1,17 @@
 # Enable logging for your Application
+
+Logging is enabled by default for all applications running on the cluster. To see these logs, head over to Kibana through Forecastle.
+
+## How to access logs on Kibana?
+
+Below are the steps to access logs on Kibana on your first login.
+
+Go to `Management` > `Index-Patterns` and you will see the below screen, enter `*` in the `Index Pattern` text box and click `Next step`:
+
+![Kibana_Page_1](./images/kibana_index_page1.png)
+
+From the `Time filter` field name drop-down, select `@timestamp` and click `Create index pattern`:
+
+![Kibana_Page_2](./images/kibana_index_page2.png)
+
+Now you can go to the `Discover` tab to view the logs.