Skip to content

Commit 0dd0dbd

Browse files
committed
fix cluster_id label override in mixins
Signed-off-by: QuentinBisson <[email protected]>
1 parent 3530c61 commit 0dd0dbd

15 files changed

+535
-541
lines changed
+40-40
Original file line number | Diff line number | Diff line change
@@ -1,41 +1,41 @@
11
groups:
2-
- name: loki_alerts
3-
rules:
4-
- alert: LokiRequestErrors
5-
annotations:
6-
message: |
7-
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
8-
expr: |
9-
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
10-
/
11-
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
12-
> 10
13-
for: 15m
14-
labels:
15-
severity: critical
16-
- alert: LokiRequestPanics
17-
annotations:
18-
message: |
19-
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
20-
expr: |
21-
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
22-
labels:
23-
severity: critical
24-
- alert: LokiRequestLatency
25-
annotations:
26-
message: |
27-
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
28-
expr: |
29-
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
30-
for: 15m
31-
labels:
32-
severity: critical
33-
- alert: LokiTooManyCompactorsRunning
34-
annotations:
35-
message: |
36-
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
37-
expr: |
38-
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
39-
for: 5m
40-
labels:
41-
severity: warning
2+
- name: loki_alerts
3+
rules:
4+
- alert: LokiRequestErrors
5+
annotations:
6+
message: |
7+
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
8+
expr: |
9+
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
10+
/
11+
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
12+
> 10
13+
for: 15m
14+
labels:
15+
severity: critical
16+
- alert: LokiRequestPanics
17+
annotations:
18+
message: |
19+
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
20+
expr: |
21+
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
22+
labels:
23+
severity: critical
24+
- alert: LokiRequestLatency
25+
annotations:
26+
message: |
27+
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
28+
expr: |
29+
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
30+
for: 15m
31+
labels:
32+
severity: critical
33+
- alert: LokiTooManyCompactorsRunning
34+
annotations:
35+
message: |
36+
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
37+
expr: |
38+
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
39+
for: 5m
40+
labels:
41+
severity: warning

production/loki-mixin-compiled-ssd/dashboards/loki-logs.json

+10-10
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,7 @@
6363
"steppedLine": false,
6464
"targets": [
6565
{
66-
"expr": "sum(go_goroutines{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"})",
66+
"expr": "sum(go_goroutines{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"})",
6767
"refId": "A"
6868
}
6969
],
@@ -149,7 +149,7 @@
149149
"steppedLine": false,
150150
"targets": [
151151
{
152-
"expr": "sum(go_gc_duration_seconds{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}) by (quantile)",
152+
"expr": "sum(go_gc_duration_seconds{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}) by (quantile)",
153153
"legendFormat": "{{quantile}}",
154154
"refId": "A"
155155
}
@@ -236,7 +236,7 @@
236236
"steppedLine": false,
237237
"targets": [
238238
{
239-
"expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))",
239+
"expr": "sum(rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))",
240240
"refId": "A"
241241
}
242242
],
@@ -322,7 +322,7 @@
322322
"steppedLine": false,
323323
"targets": [
324324
{
325-
"expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"})",
325+
"expr": "sum(container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"})",
326326
"refId": "A"
327327
}
328328
],
@@ -408,7 +408,7 @@
408408
"steppedLine": false,
409409
"targets": [
410410
{
411-
"expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))",
411+
"expr": "sum(rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))",
412412
"refId": "A"
413413
}
414414
],
@@ -494,7 +494,7 @@
494494
"steppedLine": false,
495495
"targets": [
496496
{
497-
"expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))",
497+
"expr": "sum(rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))",
498498
"refId": "A"
499499
}
500500
],
@@ -580,7 +580,7 @@
580580
"steppedLine": false,
581581
"targets": [
582582
{
583-
"expr": "increase(kube_pod_container_status_last_terminated_reason{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[30m]) > 0",
583+
"expr": "increase(kube_pod_container_status_last_terminated_reason{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[30m]) > 0",
584584
"legendFormat": "{{reason}}",
585585
"refId": "A"
586586
}
@@ -667,7 +667,7 @@
667667
"steppedLine": false,
668668
"targets": [
669669
{
670-
"expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)",
670+
"expr": "sum(rate(promtail_custom_bad_words_total{cluster=~\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)",
671671
"legendFormat": "{{level}}",
672672
"refId": "A"
673673
}
@@ -771,7 +771,7 @@
771771
"steppedLine": false,
772772
"targets": [
773773
{
774-
"expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)",
774+
"expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)",
775775
"intervalFactor": 3,
776776
"legendFormat": "{{level}}",
777777
"refId": "A"
@@ -836,7 +836,7 @@
836836
},
837837
"targets": [
838838
{
839-
"expr": "{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"} | logfmt | level=\"$level\" |= \"$filter\"",
839+
"expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"} | logfmt | level=\"$level\" |= \"$filter\"",
840840
"refId": "A"
841841
}
842842
],

0 commit comments

Comments
 (0)