diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml index 5653fd4a2e0..e5c49cca2dd 100644 --- a/.github/workflows/test-build-deploy.yml +++ b/.github/workflows/test-build-deploy.yml @@ -62,6 +62,8 @@ jobs: ln -s $GITHUB_WORKSPACE/* /go/src/github.com/grafana/mimir - name: Check Mixin run: make BUILD_IN_CONTAINER=false check-mixin + - name: Check Mixin Tests + run: make BUILD_IN_CONTAINER=false check-mixin-tests - name: Check Jsonnet Manifests run: make BUILD_IN_CONTAINER=false check-jsonnet-manifests - name: Check Jsonnet Getting Started diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e3f954b8db..60d966136c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,7 @@ - `MimirContinuousTestFailed` * [ENHANCEMENT] Added `per_cluster_label` support to allow to change the label name used to differentiate between Kubernetes clusters. #1651 * [BUGFIX] Dashboards: Fix "Failed evaluation rate" panel on Tenants dashboard. #1629 +* [BUGFIX] Honor the configured `per_instance_label` in all dashboards and alerts. #1697 ### Jsonnet diff --git a/Makefile b/Makefile index fad6ee58f0a..c61aebd0b03 100644 --- a/Makefile +++ b/Makefile @@ -207,7 +207,7 @@ GOVOLUMES= -v $(shell pwd)/.cache:/go/cache:delegated,z \ # Mount local ssh credentials to be able to clone private repos when doing `mod-check` SSHVOLUME= -v ~/.ssh/:/root/.ssh:delegated,z -exes $(EXES) protos $(PROTO_GOS) lint test test-with-race cover shell mod-check check-protos doc format dist build-mixin format-mixin: fetch-build-image +exes $(EXES) protos $(PROTO_GOS) lint test test-with-race cover shell mod-check check-protos doc format dist build-mixin format-mixin check-mixin-tests license check-license: fetch-build-image @mkdir -p $(shell pwd)/.pkg @mkdir -p $(shell pwd)/.cache @echo @@ -331,7 +331,7 @@ doc: clean-doc $(DOC_TEMPLATES:.template=.md) $(DOC_EMBED:.md=.md.embedmd) # Add license header to files. 
license: - go run ./tools/add-license ./cmd ./integration ./pkg ./tools ./development ./mimir-build-image + go run ./tools/add-license ./cmd ./integration ./pkg ./tools ./development ./mimir-build-image ./operations check-license: license @git diff --exit-code || (echo "Please add the license header running 'make BUILD_IN_CONTAINER=false license'" && false) @@ -375,6 +375,9 @@ build-mixin: check-mixin-jb @cd $(MIXIN_OUT_PATH)/.. && zip -q -r mimir-mixin.zip $$(basename "$(MIXIN_OUT_PATH)") @echo "The mixin has been compiled to $(MIXIN_OUT_PATH) and archived to $$(realpath --relative-to=$$(pwd) $(MIXIN_OUT_PATH)/../mimir-mixin.zip)" +check-mixin-tests: + @./operations/mimir-mixin-tests/run.sh || (echo "Mixin tests are failing. Please fix the reported issues. You can run mixin tests with 'make check-mixin-tests'" && false) + format-mixin: @find $(MIXIN_PATH) -type f -name '*.libsonnet' | xargs jsonnetfmt -i diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 5fc14406cf4..d685790365d 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -95,7 +95,7 @@ groups: severity: warning - alert: MimirIngesterRestarts annotations: - message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" + message: '{{ $labels.job }}/{{ $labels.pod }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.' expr: | changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) >= 2 @@ -118,7 +118,7 @@ groups: severity: critical - alert: MimirMemoryMapAreasTooHigh annotations: - message: '{{ $labels.job }}/{{ $labels.instance }} has a number of mmap-ed areas + message: '{{ $labels.job }}/{{ $labels.pod }} has a number of mmap-ed areas close to the limit.' 
expr: | process_memory_map_areas{job=~".+(cortex|ingester.*|store-gateway.*)"} / process_memory_map_areas_limit{job=~".+(cortex|ingester.*|store-gateway.*)"} > 0.8 @@ -130,7 +130,7 @@ groups: - alert: MimirIngesterReachingSeriesLimit annotations: message: | - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. expr: | ( (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) @@ -143,7 +143,7 @@ groups: - alert: MimirIngesterReachingSeriesLimit annotations: message: | - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. expr: | ( (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) @@ -156,7 +156,7 @@ groups: - alert: MimirIngesterReachingTenantsLimit annotations: message: | - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. expr: | ( (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) @@ -169,7 +169,7 @@ groups: - alert: MimirIngesterReachingTenantsLimit annotations: message: | - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. 
expr: | ( (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) @@ -182,7 +182,7 @@ groups: - alert: MimirReachingTCPConnectionsLimit annotations: message: | - Mimir instance {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. + Mimir instance {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. expr: | cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and cortex_tcp_connections_limit > 0 @@ -192,7 +192,7 @@ groups: - alert: MimirDistributorReachingInflightPushRequestLimit annotations: message: | - Distributor {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. + Distributor {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. expr: | ( (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) @@ -299,12 +299,12 @@ groups: - alert: MimirRulerTooManyFailedPushes annotations: message: | - Mimir Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. 
expr: | 100 * ( - sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_failed_total[1m])) + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_failed_total[1m])) / - sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_total[1m])) + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_total[1m])) ) > 1 for: 5m labels: @@ -312,12 +312,12 @@ groups: - alert: MimirRulerTooManyFailedQueries annotations: message: | - Mimir Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. expr: | 100 * ( - sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_failed_total[1m])) + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m])) / - sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_total[1m])) + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_total[1m])) ) > 1 for: 5m labels: @@ -325,11 +325,11 @@ groups: - alert: MimirRulerMissedEvaluations annotations: message: | - Mimir Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. 
expr: | - sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) / - sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) > 0.01 for: 5m labels: @@ -348,7 +348,7 @@ groups: rules: - alert: MimirGossipMembersMismatch annotations: - message: Mimir instance {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} sees incorrect number of gossip members. expr: | memberlist_client_cluster_members_count @@ -390,7 +390,7 @@ groups: - alert: MimirAlertmanagerSyncConfigsFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage. + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to read tenant configurations from storage. expr: | rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 for: 30m @@ -399,7 +399,7 @@ groups: - alert: MimirAlertmanagerRingCheckFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring. + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to check tenants ownership via the ring. expr: | rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 for: 10m @@ -408,7 +408,7 @@ groups: - alert: MimirAlertmanagerPartialStateMergeFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica. + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to merge partial state changes received from a replica. 
expr: | rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 for: 10m @@ -417,7 +417,7 @@ groups: - alert: MimirAlertmanagerReplicationFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicating partial state to its replicas. + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to replicating partial state to its replicas. expr: | rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 for: 10m @@ -426,7 +426,7 @@ groups: - alert: MimirAlertmanagerPersistStateFailing annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snaphots to remote storage. + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to persist full state snaphots to remote storage. expr: | rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 for: 1h @@ -435,7 +435,7 @@ groups: - alert: MimirAlertmanagerInitialSyncFailed annotations: message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up. + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} was unable to obtain some initial state when starting up. expr: | increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 labels: @@ -476,7 +476,7 @@ groups: # Only if the ingester has ingested samples over the last 4h. (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) and - # Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance + # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica # had ingested samples in the past, then no traffic was received for a long period and then it starts # receiving samples again. 
Without this check, the alert would fire as soon as it gets back receiving # samples, while the a block shipping is expected within the next 4h. @@ -497,7 +497,7 @@ groups: severity: critical - alert: MimirIngesterHasUnshippedBlocks annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet. expr: | @@ -509,7 +509,7 @@ groups: severity: critical - alert: MimirIngesterTSDBHeadCompactionFailed annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to compact TSDB head. expr: | rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 @@ -518,7 +518,7 @@ groups: severity: critical - alert: MimirIngesterTSDBHeadTruncationFailed annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB head. expr: | rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 @@ -526,7 +526,7 @@ groups: severity: critical - alert: MimirIngesterTSDBCheckpointCreationFailed annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to create TSDB checkpoint. 
expr: | rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 @@ -534,7 +534,7 @@ groups: severity: critical - alert: MimirIngesterTSDBCheckpointDeletionFailed annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to delete TSDB checkpoint. expr: | rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 @@ -542,7 +542,7 @@ groups: severity: critical - alert: MimirIngesterTSDBWALTruncationFailed annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB WAL. expr: | rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 @@ -550,7 +550,7 @@ groups: severity: warning - alert: MimirIngesterTSDBWALCorrupted annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. expr: | rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0 @@ -558,7 +558,7 @@ groups: severity: critical - alert: MimirIngesterTSDBWALWritesFailed annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to write to TSDB WAL. 
expr: | rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 @@ -567,7 +567,7 @@ groups: severity: critical - alert: MimirQuerierHasNotScanTheBucket annotations: - message: Mimir Querier {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace + message: Mimir Querier {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}. expr: | @@ -598,9 +598,9 @@ groups: severity: warning - alert: MimirStoreGatewayHasNotSyncTheBucket annotations: - message: Mimir Store Gateway {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not successfully synched the bucket since {{ $value - | humanizeDuration }}. + message: Mimir Store Gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not successfully synched the bucket since {{ $value | humanizeDuration + }}. expr: | (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) and @@ -630,9 +630,8 @@ groups: rules: - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not successfully cleaned up blocks in the last 6 - hours. + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not successfully cleaned up blocks in the last 6 hours. expr: | (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) for: 1h @@ -640,8 +639,8 @@ groups: severity: critical - alert: MimirCompactorHasNotSuccessfullyRunCompaction annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not run compaction in the last 24 hours. + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not run compaction in the last 24 hours. 
expr: | (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) and @@ -651,8 +650,8 @@ groups: severity: critical - alert: MimirCompactorHasNotSuccessfullyRunCompaction annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not run compaction in the last 24 hours. + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not run compaction in the last 24 hours. expr: | cortex_compactor_last_successful_run_timestamp_seconds == 0 for: 24h @@ -660,16 +659,16 @@ groups: severity: critical - alert: MimirCompactorHasNotSuccessfullyRunCompaction annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} failed to run 2 consecutive compactions. + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} failed to run 2 consecutive compactions. expr: | increase(cortex_compactor_runs_failed_total[2h]) >= 2 labels: severity: critical - alert: MimirCompactorHasNotUploadedBlocks annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not uploaded any block in the last 24 hours. + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not uploaded any block in the last 24 hours. expr: | (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/(compactor.*|cortex|mimir)"} > 60 * 60 * 24) and @@ -679,8 +678,8 @@ groups: severity: critical - alert: MimirCompactorHasNotUploadedBlocks annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not uploaded any block in the last 24 hours. + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has not uploaded any block in the last 24 hours. 
expr: | thanos_objstore_bucket_last_successful_upload_time{job=~".+/(compactor.*|cortex|mimir)"} == 0 for: 24h @@ -688,8 +687,8 @@ groups: severity: critical - alert: MimirCompactorSkippedBlocksWithOutOfOrderChunks annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has found and ignored blocks with out of order chunks. + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace + }} has found and ignored blocks with out of order chunks. expr: | increase(cortex_compactor_blocks_marked_for_no_compaction_total{job=~".+/(compactor.*|cortex|mimir)", reason="block-index-out-of-order-chunk"}[5m]) > 0 for: 1m diff --git a/operations/mimir-mixin-tests/.gitignore b/operations/mimir-mixin-tests/.gitignore new file mode 100644 index 00000000000..6aa072ba314 --- /dev/null +++ b/operations/mimir-mixin-tests/.gitignore @@ -0,0 +1,2 @@ +# Any test output directory. +test-*/ diff --git a/operations/mimir-mixin-tests/run.sh b/operations/mimir-mixin-tests/run.sh new file mode 100755 index 00000000000..3fb846d06a8 --- /dev/null +++ b/operations/mimir-mixin-tests/run.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: AGPL-3.0-only + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +MIXIN_DIR="${SCRIPT_DIR}/../mimir-mixin" +MIXIN_TEST_FILEPATH="${MIXIN_DIR}/mixin-test.libsonnet" + +# Cleanup temporary copy of test file when this script exits. +trap "rm -f ${MIXIN_TEST_FILEPATH}" EXIT + +TESTS=$(ls -1 "${SCRIPT_DIR}"/test-*.libsonnet) + +for FILEPATH in $TESTS; do + # Extract the filename (without extension). + TEST_NAME=$(basename -s '.libsonnet' "$FILEPATH") + TEST_DIR="${SCRIPT_DIR}/${TEST_NAME}" + + # Begin with a clean output dir. + rm -rf "${TEST_DIR}" && mkdir "${TEST_DIR}" + + # Temporarily copy the test file to the mixin directory. This file is deleted once + # this script exits. 
+ cp "${SCRIPT_DIR}/${TEST_NAME}.libsonnet" "${MIXIN_TEST_FILEPATH}" + + mixtool generate all \ + --output-alerts "${TEST_DIR}/alerts.yaml" \ + --output-rules "${TEST_DIR}/rules.yaml" \ + --directory "${TEST_DIR}/dashboards" \ + "${MIXIN_TEST_FILEPATH}" + + # Run assertions. + "${SCRIPT_DIR}"/"${TEST_NAME}-asserts.sh" +done diff --git a/operations/mimir-mixin-tests/test-custom-labels-asserts.sh b/operations/mimir-mixin-tests/test-custom-labels-asserts.sh new file mode 100755 index 00000000000..ff2fce24895 --- /dev/null +++ b/operations/mimir-mixin-tests/test-custom-labels-asserts.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: AGPL-3.0-only + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +DASHBOARDS_DIR="${SCRIPT_DIR}"/test-custom-labels/dashboards +DASHBOARDS_FILES=$(ls -1 "${DASHBOARDS_DIR}"/*.json) +ALERTS_FILE="${SCRIPT_DIR}"/test-custom-labels/alerts.yaml +RULES_FILE="${SCRIPT_DIR}"/test-custom-labels/rules.yaml +FAILED=0 + +assert_failed() { + MSG=$1 + + echo "" + echo -e "$MSG" + echo "" + FAILED=1 +} + +# In this test we customize some labels. We expect to not find any reference to default labels. +FORBIDDEN_LABELS="cluster instance pod" + +for LABEL in ${FORBIDDEN_LABELS}; do + QUERY_REGEX="[^\$a-z_]${LABEL}[^a-z_]" + ALERT_VARIABLE_REGEX="\\\$labels.${LABEL}" + RULE_PREFIX_REGEX="(^|[^a-z])${LABEL}(\$|[^a-z])" + + # Check dashboards. + for FILEPATH in ${DASHBOARDS_FILES}; do + echo "Checking ${FILEPATH}" + + MATCHES=$(jq '.. | select(.expr?).expr' "${FILEPATH}" | grep -E "${QUERY_REGEX}" || true) + if [ -n "$MATCHES" ]; then + assert_failed "The dashboard at ${FILEPATH} contains unexpected references to '${LABEL}' label in some queries:\n$MATCHES" + fi + done + + # Check alerts. + echo "Checking ${ALERTS_FILE}" + MATCHES=$(yq eval '.. 
| select(.expr?).expr | sub("\n", " ")' "${ALERTS_FILE}" | grep -E "${QUERY_REGEX}" || true) + if [ -n "$MATCHES" ]; then + assert_failed "The alerts at ${ALERTS_FILE} contains unexpected references to '${LABEL}' label in some queries:\n$MATCHES" + fi + + MATCHES=$(yq eval '.. | select(.message?).message | sub("\n", " ")' "${ALERTS_FILE}" | grep -E "${ALERT_VARIABLE_REGEX}" || true) + if [ -n "$MATCHES" ]; then + assert_failed "The alerts at ${ALERTS_FILE} contains unexpected references to '${LABEL}' label in some messages:\n$MATCHES" + fi + + # Check rules. + echo "Checking ${RULES_FILE}" + MATCHES=$(yq eval '.. | select(.expr?).expr | sub("\n", " ")' "${RULES_FILE}" | grep -E "${QUERY_REGEX}" || true) + if [ -n "$MATCHES" ]; then + assert_failed "The rules at ${RULES_FILE} contains unexpected references to '${LABEL}' label in some queries:\n$MATCHES" + fi + + MATCHES=$(yq eval '.. | select(.record?).record | sub("\n", " ")' "${RULES_FILE}" | grep -Eo '^[^:]+' | grep -E "${RULE_PREFIX_REGEX}" || true) + if [ -n "$MATCHES" ]; then + assert_failed "The rules at ${RULES_FILE} contains unexpected references to '${LABEL}' label in some rule name prefix:\n$MATCHES" + fi +done + +if [ $FAILED -ne 0 ]; then + exit 1 +fi diff --git a/operations/mimir-mixin-tests/test-custom-labels.libsonnet b/operations/mimir-mixin-tests/test-custom-labels.libsonnet new file mode 100644 index 00000000000..19aa2e0214a --- /dev/null +++ b/operations/mimir-mixin-tests/test-custom-labels.libsonnet @@ -0,0 +1,7 @@ +(import 'mixin-compiled.libsonnet') + { + _config+:: { + per_cluster_label: 'mycluster', + per_instance_label: 'myinstance', + per_node_label: 'mynode', + }, +} diff --git a/operations/mimir-mixin-tools/screenshots/Dockerfile b/operations/mimir-mixin-tools/screenshots/Dockerfile index edd4640408f..331d5892bfa 100644 --- a/operations/mimir-mixin-tools/screenshots/Dockerfile +++ b/operations/mimir-mixin-tools/screenshots/Dockerfile @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
AGPL-3.0-only + FROM node:17-bullseye # Install Chrome dependencies. diff --git a/operations/mimir-mixin-tools/screenshots/run.sh b/operations/mimir-mixin-tools/screenshots/run.sh index 7fca30420ca..e2808637dc3 100755 --- a/operations/mimir-mixin-tools/screenshots/run.sh +++ b/operations/mimir-mixin-tools/screenshots/run.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# SPDX-License-Identifier: AGPL-3.0-only + set -e SCRIPT_DIR=$(cd `dirname $0` && pwd) diff --git a/operations/mimir-mixin-tools/serve/run.sh b/operations/mimir-mixin-tools/serve/run.sh index 40c1ef68383..1bbe01143cc 100755 --- a/operations/mimir-mixin-tools/serve/run.sh +++ b/operations/mimir-mixin-tools/serve/run.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# SPDX-License-Identifier: AGPL-3.0-only + set -e SCRIPT_DIR=$(cd `dirname $0` && pwd) diff --git a/operations/mimir-mixin/.gitignore b/operations/mimir-mixin/.gitignore index 57872d0f1e5..0a51c616cb6 100644 --- a/operations/mimir-mixin/.gitignore +++ b/operations/mimir-mixin/.gitignore @@ -1 +1,2 @@ /vendor/ +mixin-test.libsonnet diff --git a/operations/mimir-mixin/alerts.libsonnet b/operations/mimir-mixin/alerts.libsonnet index 10ab68dcb5a..10f936e8d8b 100644 --- a/operations/mimir-mixin/alerts.libsonnet +++ b/operations/mimir-mixin/alerts.libsonnet @@ -1,10 +1,10 @@ { prometheusAlerts+:: + { _config:: $._config + $._group_config } + (import 'alerts/alerts.libsonnet') + (import 'alerts/alertmanager.libsonnet') + (import 'alerts/blocks.libsonnet') + (import 'alerts/compactor.libsonnet') + (import 'alerts/autoscaling.libsonnet') + - (import 'alerts/continuous-test.libsonnet') + - { _config:: $._config + $._group_config }, + (import 'alerts/continuous-test.libsonnet'), } diff --git a/operations/mimir-mixin/alerts/alertmanager.libsonnet b/operations/mimir-mixin/alerts/alertmanager.libsonnet index 262ecfeec58..7a8cda14691 100644 --- a/operations/mimir-mixin/alerts/alertmanager.libsonnet +++ b/operations/mimir-mixin/alerts/alertmanager.libsonnet @@ -14,7 +14,7 @@ 
}, annotations: { message: ||| - %(product)s Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage. + %(product)s Alertmanager {{ $labels.job }}/%(alert_instance_variable)s is failing to read tenant configurations from storage. ||| % $._config, }, }, @@ -29,7 +29,7 @@ }, annotations: { message: ||| - %(product)s Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring. + %(product)s Alertmanager {{ $labels.job }}/%(alert_instance_variable)s is unable to check tenants ownership via the ring. ||| % $._config, }, }, @@ -44,7 +44,7 @@ }, annotations: { message: ||| - %(product)s Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica. + %(product)s Alertmanager {{ $labels.job }}/%(alert_instance_variable)s is failing to merge partial state changes received from a replica. ||| % $._config, }, }, @@ -59,7 +59,7 @@ }, annotations: { message: ||| - %(product)s Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicating partial state to its replicas. + %(product)s Alertmanager {{ $labels.job }}/%(alert_instance_variable)s is failing to replicating partial state to its replicas. ||| % $._config, }, }, @@ -74,7 +74,7 @@ }, annotations: { message: ||| - %(product)s Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snaphots to remote storage. + %(product)s Alertmanager {{ $labels.job }}/%(alert_instance_variable)s is unable to persist full state snaphots to remote storage. ||| % $._config, }, }, @@ -88,7 +88,7 @@ }, annotations: { message: ||| - %(product)s Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up. + %(product)s Alertmanager {{ $labels.job }}/%(alert_instance_variable)s was unable to obtain some initial state when starting up. 
||| % $._config, }, }, @@ -105,7 +105,7 @@ }, annotations: { message: ||| - Alertmanager {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory. + Alertmanager %(alert_instance_variable)s in %(alert_aggregation_variables)s is using too much memory. ||| % $._config, }, }, @@ -122,7 +122,7 @@ }, annotations: { message: ||| - Alertmanager {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory. + Alertmanager %(alert_instance_variable)s in %(alert_aggregation_variables)s is using too much memory. ||| % $._config, }, }, diff --git a/operations/mimir-mixin/alerts/alerts-utils.libsonnet b/operations/mimir-mixin/alerts/alerts-utils.libsonnet index 7005d20121d..8c748c0c607 100644 --- a/operations/mimir-mixin/alerts/alerts-utils.libsonnet +++ b/operations/mimir-mixin/alerts/alerts-utils.libsonnet @@ -1,4 +1,10 @@ { + _config+:: { + alert_cluster_variable: '{{ $labels.%s }}' % $._config.per_cluster_label, + alert_instance_variable: '{{ $labels.%s }}' % $._config.per_instance_label, + alert_node_variable: '{{ $labels.%s }}' % $._config.per_node_label, + }, + // The alert name is prefixed with the product name (eg. AlertName -> MimirAlertName). alertName(name):: $._config.product + name, diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index db5fe341f61..c6df49aafad 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -184,27 +184,27 @@ severity: 'warning', }, annotations: { - message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.', + message: '{{ $labels.job }}/%(alert_instance_variable)s has restarted {{ printf "%%.2f" $value }} times in the last 30 mins.' 
% $._config, }, }, { alert: $.alertName('KVStoreFailure'), expr: ||| ( - sum by(%s, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) / - sum by(%s, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + sum by(%(alert_aggregation_labels)s, %(per_instance_label)s, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) ) # We want to get alerted only in case there's a constant failure. == 1 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + ||| % $._config, 'for': '5m', labels: { severity: 'critical', }, annotations: { message: ||| - %(product)s {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to the KV store {{ $labels.kv_name }}. + %(product)s %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to talk to the KV store {{ $labels.kv_name }}. ||| % $._config, }, }, @@ -218,7 +218,7 @@ severity: 'critical', }, annotations: { - message: '{{ $labels.job }}/{{ $labels.instance }} has a number of mmap-ed areas close to the limit.', + message: '{{ $labels.job }}/%(alert_instance_variable)s has a number of mmap-ed areas close to the limit.' % $._config, }, }, ], @@ -241,8 +241,8 @@ }, annotations: { message: ||| - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. - |||, + Ingester {{ $labels.job }}/%(alert_instance_variable)s has reached {{ $value | humanizePercentage }} of its series limit. + ||| % $._config, }, }, { @@ -260,8 +260,8 @@ }, annotations: { message: ||| - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. 
- |||, + Ingester {{ $labels.job }}/%(alert_instance_variable)s has reached {{ $value | humanizePercentage }} of its series limit. + ||| % $._config, }, }, { @@ -279,8 +279,8 @@ }, annotations: { message: ||| - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. - |||, + Ingester {{ $labels.job }}/%(alert_instance_variable)s has reached {{ $value | humanizePercentage }} of its tenant limit. + ||| % $._config, }, }, { @@ -298,8 +298,8 @@ }, annotations: { message: ||| - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. - |||, + Ingester {{ $labels.job }}/%(alert_instance_variable)s has reached {{ $value | humanizePercentage }} of its tenant limit. + ||| % $._config, }, }, { @@ -314,7 +314,7 @@ }, annotations: { message: ||| - %(product)s instance {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. + %(product)s instance {{ $labels.job }}/%(alert_instance_variable)s has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. ||| % $._config, }, }, @@ -333,8 +333,8 @@ }, annotations: { message: ||| - Distributor {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. - |||, + Distributor {{ $labels.job }}/%(alert_instance_variable)s has reached {{ $value | humanizePercentage }} of its inflight push request limit. + ||| % $._config, }, }, ], @@ -462,7 +462,7 @@ }, annotations: { message: ||| - Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory. + Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is using too much memory. 
||| % $._config, }, }, @@ -481,7 +481,7 @@ }, annotations: { message: ||| - Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory. + Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is using too much memory. ||| % $._config, }, }, @@ -494,18 +494,18 @@ alert: $.alertName('RulerTooManyFailedPushes'), expr: ||| 100 * ( - sum by (%s, instance) (rate(cortex_ruler_write_requests_failed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_failed_total[1m])) / - sum by (%s, instance) (rate(cortex_ruler_write_requests_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_write_requests_total[1m])) ) > 1 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + ||| % $._config, 'for': '5m', labels: { severity: 'critical', }, annotations: { message: ||| - %(product)s Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% write (push) errors. + %(product)s Ruler %(alert_instance_variable)s in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% write (push) errors. 
||| % $._config, }, }, @@ -513,36 +513,36 @@ alert: $.alertName('RulerTooManyFailedQueries'), expr: ||| 100 * ( - sum by (%s, instance) (rate(cortex_ruler_queries_failed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_failed_total[1m])) / - sum by (%s, instance) (rate(cortex_ruler_queries_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ruler_queries_total[1m])) ) > 1 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + ||| % $._config, 'for': '5m', labels: { severity: 'warning', }, annotations: { message: ||| - %(product)s Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors while evaluating rules. + %(product)s Ruler %(alert_instance_variable)s in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors while evaluating rules. ||| % $._config, }, }, { alert: $.alertName('RulerMissedEvaluations'), expr: ||| - sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) / - sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) > 0.01 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + ||| % $._config, 'for': '5m', labels: { severity: 'warning', }, annotations: { message: ||| - %(product)s Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% missed iterations for the rule group {{ $labels.rule_group }}. 
+ %(product)s Ruler %(alert_instance_variable)s in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% missed iterations for the rule group {{ $labels.rule_group }}. ||| % $._config, }, }, @@ -580,7 +580,7 @@ severity: 'warning', }, annotations: { - message: '%(product)s instance {{ $labels.instance }} in %(alert_aggregation_variables)s sees incorrect number of gossip members.' % $._config, + message: '%(product)s instance %(alert_instance_variable)s in %(alert_aggregation_variables)s sees incorrect number of gossip members.' % $._config, }, }, ], @@ -603,8 +603,8 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. - |||, + Too much memory being used by {{ $labels.namespace }}/%(alert_instance_variable)s - bump memory limit. + ||| % $._config, }, }, { @@ -622,8 +622,8 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. - |||, + Too much memory being used by {{ $labels.namespace }}/%(alert_instance_variable)s - bump memory limit. + ||| % $._config, }, }, ], diff --git a/operations/mimir-mixin/alerts/blocks.libsonnet b/operations/mimir-mixin/alerts/blocks.libsonnet index 1feba86fdae..d6b45e7a063 100644 --- a/operations/mimir-mixin/alerts/blocks.libsonnet +++ b/operations/mimir-mixin/alerts/blocks.libsonnet @@ -16,7 +16,7 @@ # Only if the ingester has ingested samples over the last 4h. (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) and - # Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance + # Only if the ingester was ingesting samples 4h ago. 
This protects against the case where the ingester replica # had ingested samples in the past, then no traffic was received for a long period and then it starts # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving # samples, while the a block shipping is expected within the next 4h. @@ -61,7 +61,7 @@ severity: 'critical', }, annotations: { - message: "%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet." % $._config, + message: "%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet." % $._config, }, }, { @@ -77,7 +77,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to compact TSDB head.' % $._config, + message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to compact TSDB head.' % $._config, }, }, { @@ -89,7 +89,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB head.' % $._config, + message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to truncate TSDB head.' % $._config, }, }, { @@ -101,7 +101,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to create TSDB checkpoint.' % $._config, + message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to create TSDB checkpoint.' 
% $._config, }, }, { @@ -113,7 +113,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to delete TSDB checkpoint.' % $._config, + message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to delete TSDB checkpoint.' % $._config, }, }, { @@ -125,7 +125,7 @@ severity: 'warning', }, annotations: { - message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB WAL.' % $._config, + message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to truncate TSDB WAL.' % $._config, }, }, { @@ -137,7 +137,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config, + message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config, }, }, { @@ -150,7 +150,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to write to TSDB WAL.' % $._config, + message: '%(product)s Ingester %(alert_instance_variable)s in %(alert_aggregation_variables)s is failing to write to TSDB WAL.' % $._config, }, }, { @@ -166,7 +166,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Querier {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully scanned the bucket since {{ $value | humanizeDuration }}.' % $._config, + message: '%(product)s Querier %(alert_instance_variable)s in %(alert_aggregation_variables)s has not successfully scanned the bucket since {{ $value | humanizeDuration }}.' 
% $._config, }, }, { @@ -206,7 +206,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Store Gateway {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully synched the bucket since {{ $value | humanizeDuration }}.' % $._config, + message: '%(product)s Store Gateway %(alert_instance_variable)s in %(alert_aggregation_variables)s has not successfully synched the bucket since {{ $value | humanizeDuration }}.' % $._config, }, }, { diff --git a/operations/mimir-mixin/alerts/compactor.libsonnet b/operations/mimir-mixin/alerts/compactor.libsonnet index 7eda7984e1b..dbf0147269a 100644 --- a/operations/mimir-mixin/alerts/compactor.libsonnet +++ b/operations/mimir-mixin/alerts/compactor.libsonnet @@ -14,7 +14,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully cleaned up blocks in the last 6 hours.' % $._config, + message: '%(product)s Compactor %(alert_instance_variable)s in %(alert_aggregation_variables)s has not successfully cleaned up blocks in the last 6 hours.' % $._config, }, }, { @@ -30,7 +30,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not run compaction in the last 24 hours.' % $._config, + message: '%(product)s Compactor %(alert_instance_variable)s in %(alert_aggregation_variables)s has not run compaction in the last 24 hours.' % $._config, }, }, { @@ -44,7 +44,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not run compaction in the last 24 hours.' % $._config, + message: '%(product)s Compactor %(alert_instance_variable)s in %(alert_aggregation_variables)s has not run compaction in the last 24 hours.' 
% $._config, }, }, { @@ -57,7 +57,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s failed to run 2 consecutive compactions.' % $._config, + message: '%(product)s Compactor %(alert_instance_variable)s in %(alert_aggregation_variables)s failed to run 2 consecutive compactions.' % $._config, }, }, { @@ -73,7 +73,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not uploaded any block in the last 24 hours.' % $._config, + message: '%(product)s Compactor %(alert_instance_variable)s in %(alert_aggregation_variables)s has not uploaded any block in the last 24 hours.' % $._config, }, }, { @@ -87,7 +87,7 @@ severity: 'critical', }, annotations: { - message: '%(product)s Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not uploaded any block in the last 24 hours.' % $._config, + message: '%(product)s Compactor %(alert_instance_variable)s in %(alert_aggregation_variables)s has not uploaded any block in the last 24 hours.' % $._config, }, }, { @@ -101,7 +101,7 @@ severity: 'warning', }, annotations: { - message: '%(product)s Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has found and ignored blocks with out of order chunks.' % $._config, + message: '%(product)s Compactor %(alert_instance_variable)s in %(alert_aggregation_variables)s has found and ignored blocks with out of order chunks.' 
% $._config, }, }, ], diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 680e0eb5fee..abe6987c4ed 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -297,23 +297,27 @@ local utils = import 'mixin-utils/utils.libsonnet'; else 'label_name="%s"' % containerName, jobNetworkingRow(title, name):: + local vars = $._config { + job_matcher: $.jobMatcher($._config.job_names[name]), + }; + super.row(title) .addPanel($.containerNetworkReceiveBytesPanel($._config.instance_names[name])) .addPanel($.containerNetworkTransmitBytesPanel($._config.instance_names[name])) .addPanel( $.panel('Inflight requests (per pod)') + $.queryPanel([ - 'avg(cortex_inflight_requests{%s})' % $.jobMatcher($._config.job_names[name]), - 'max(cortex_inflight_requests{%s})' % $.jobMatcher($._config.job_names[name]), + 'avg(cortex_inflight_requests{%(job_matcher)s})' % vars, + 'max(cortex_inflight_requests{%(job_matcher)s})' % vars, ], ['avg', 'highest']) + { fill: 0 } ) .addPanel( $.panel('TCP connections (per pod)') + $.queryPanel([ - 'avg(sum by(pod) (cortex_tcp_connections{%s}))' % $.jobMatcher($._config.job_names[name]), - 'max(sum by(pod) (cortex_tcp_connections{%s}))' % $.jobMatcher($._config.job_names[name]), - 'min(cortex_tcp_connections_limit{%s})' % $.jobMatcher($._config.job_names[name]), + 'avg(sum by(%(per_instance_label)s) (cortex_tcp_connections{%(job_matcher)s}))' % vars, + 'max(sum by(%(per_instance_label)s) (cortex_tcp_connections{%(job_matcher)s}))' % vars, + 'min(cortex_tcp_connections_limit{%(job_matcher)s})' % vars, ], ['avg', 'highest', 'limit']) + { fill: 0 } ), diff --git a/operations/mimir-mixin/recording_rules.libsonnet b/operations/mimir-mixin/recording_rules.libsonnet index b2878681447..c76be953d7c 100644 --- a/operations/mimir-mixin/recording_rules.libsonnet +++ 
b/operations/mimir-mixin/recording_rules.libsonnet @@ -257,7 +257,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; label_replace( label_replace( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + "deployment", "$1", "%(per_instance_label)s", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. @@ -283,7 +283,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; label_replace( label_replace( kube_pod_container_resource_requests_cpu_cores, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + "deployment", "$1", "%(per_instance_label)s", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. @@ -299,7 +299,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; label_replace( label_replace( kube_pod_container_resource_requests{resource="cpu"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + "deployment", "$1", "%(per_instance_label)s", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. @@ -336,7 +336,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; label_replace( label_replace( container_memory_usage_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + "deployment", "$1", "%(per_instance_label)s", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. 
@@ -362,7 +362,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; label_replace( label_replace( kube_pod_container_resource_requests_memory_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + "deployment", "$1", "%(per_instance_label)s", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. @@ -378,7 +378,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; label_replace( label_replace( kube_pod_container_resource_requests{resource="memory"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + "deployment", "$1", "%(per_instance_label)s", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), # The question mark in "(.*?)" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. diff --git a/operations/mimir-mixin/scripts/lint-playbooks.sh b/operations/mimir-mixin/scripts/lint-playbooks.sh index 4673bfcbe51..915c83c826b 100755 --- a/operations/mimir-mixin/scripts/lint-playbooks.sh +++ b/operations/mimir-mixin/scripts/lint-playbooks.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# SPDX-License-Identifier: AGPL-3.0-only set -eu -o pipefail diff --git a/operations/mimir-tests/build.sh b/operations/mimir-tests/build.sh index 8fcc24cd4c2..25f0296e40f 100755 --- a/operations/mimir-tests/build.sh +++ b/operations/mimir-tests/build.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# SPDX-License-Identifier: AGPL-3.0-only + set -euo pipefail # Start from a clean setup. 
diff --git a/operations/mimir/README.md b/operations/mimir/README.md index 68315b39c7f..f9971cd5431 100644 --- a/operations/mimir/README.md +++ b/operations/mimir/README.md @@ -22,6 +22,8 @@ Initialise the Tanka, install the Mimir and Kubernetes Jsonnet libraries, and se [embedmd]:# (./getting-started.sh) ```sh #!/usr/bin/env bash +# SPDX-License-Identifier: AGPL-3.0-only + set -e # Initialise the Tanka. diff --git a/operations/mimir/getting-started.sh b/operations/mimir/getting-started.sh index abe8532ee85..b5c31dfe517 100755 --- a/operations/mimir/getting-started.sh +++ b/operations/mimir/getting-started.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# SPDX-License-Identifier: AGPL-3.0-only + set -e # Initialise the Tanka.