Skip to content

Commit

Permalink
Use alert_aggregation at the obvious locations
Browse files Browse the repository at this point in the history
Signed-off-by: Whyeasy <[email protected]>
  • Loading branch information
Whyeasy committed Apr 8, 2022
1 parent ca1b6d6 commit 7b32d0a
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 43 deletions.
2 changes: 1 addition & 1 deletion operations/mimir-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@
alert: $.alertName('ProvisioningTooManyWrites'),
// 80k writes / s per ingester max.
expr: |||
avg by (%(alert_aggregation_labels)s) (%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
avg by (%(alert_aggregation_labels)s) (%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
||| % $._config,
'for': '15m',
labels: {
Expand Down
6 changes: 3 additions & 3 deletions operations/mimir-mixin/alerts/blocks.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance
# had ingested samples in the past, then no traffic was received for a long period and then it starts
# receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
# samples, while a block shipment is only expected within the next 4h.
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
||| % $._config,
labels: {
severity: 'critical',
Expand All @@ -37,7 +37,7 @@
expr: |||
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
and
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
(max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
||| % $._config,
labels: {
severity: 'critical',
Expand Down
4 changes: 2 additions & 2 deletions operations/mimir-mixin/dashboards/scaling.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.tablePanel([
|||
sort_desc(
%(clusterLabel)s_namespace_deployment_reason:required_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
> ignoring(reason) group_left
%(clusterLabel)s_namespace_deployment:actual_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
%(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
)
||| % $._config,
], {
Expand Down
9 changes: 9 additions & 0 deletions operations/mimir-mixin/groups.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,14 @@
),
),
),
// Prefix used when building recording rule names. It is derived from the
// configured alert_aggregation_labels by joining the individual labels with '_'
// (e.g. 'cluster, namespace' becomes 'cluster_namespace').
alert_aggregation_rule_prefix:
std.join(
'_',
// Split the configured labels on commas and strip space characters from each label.
std.map(
function(l) std.strReplace(l, ' ', ''),
std.split($._config.alert_aggregation_labels, ',')
),
),
},
}
74 changes: 37 additions & 37 deletions operations/mimir-mixin/recording_rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
// Convenience rule to get the number of replicas for both a deployment and a statefulset.
// Multi-zone deployments are grouped together removing the "zone-X" suffix.
record: '%(clusterLabel)s_namespace_deployment:actual_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count' % _config,
expr: |||
sum by (%(clusterLabel)s, namespace, deployment) (
sum by (%(alert_aggregation_labels)s, deployment) (
label_replace(
kube_deployment_spec_replicas,
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
Expand All @@ -124,22 +124,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
or
sum by (%(clusterLabel)s, namespace, deployment) (
sum by (%(alert_aggregation_labels)s, deployment) (
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
)
||| % _config,
},
{
// Distributors should be able to deal with 240k samples/s.
record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
labels: {
deployment: 'distributor',
reason: 'sample_rate',
},
expr: |||
ceil(
quantile_over_time(0.99,
sum by (%(clusterLabel)s, namespace) (
sum by (%(alert_aggregation_labels)s) (
%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
)[24h:]
)
Expand All @@ -150,30 +150,30 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
// We should be able to cover 80% of our limits,
// and ingester can have 80k samples/s.
record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
labels: {
deployment: 'distributor',
reason: 'sample_rate_limits',
},
expr: |||
ceil(
sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"})
* %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s
)
||| % _config,
},
{
// We want each ingester to deal with 80k samples/s.
// NB we measure this at the distributors and multiply by RF (3).
record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
labels: {
deployment: 'ingester',
reason: 'sample_rate',
},
expr: |||
ceil(
quantile_over_time(0.99,
sum by (%(clusterLabel)s, namespace) (
sum by (%(alert_aggregation_labels)s) (
%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
)[24h:]
)
Expand All @@ -183,15 +183,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
},
{
// Ingester should have 1.5M series in memory
record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
labels: {
deployment: 'ingester',
reason: 'active_series',
},
expr: |||
ceil(
quantile_over_time(0.99,
sum by(%(clusterLabel)s, namespace) (
sum by(%(alert_aggregation_labels)s) (
cortex_ingester_memory_series
)[24h:]
)
Expand All @@ -202,47 +202,47 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
// We should be able to cover 60% of our limits,
// and ingester can have 1.5M series in memory
record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
labels: {
deployment: 'ingester',
reason: 'active_series_limits',
},
expr: |||
ceil(
sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
* 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s
)
||| % _config,
},
{
// We should be able to cover 60% of our limits,
// and ingester can have 80k samples/s.
record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
labels: {
deployment: 'ingester',
reason: 'sample_rate_limits',
},
expr: |||
ceil(
sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"})
* %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s
)
||| % _config,
},
{
// Ingesters store 96h of data on disk - we want memcached to store 1/4 of that.
record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
labels: {
deployment: 'memcached',
reason: 'active_series',
},
expr: |||
ceil(
(sum by (%(clusterLabel)s, namespace) (
(sum by (%(alert_aggregation_labels)s) (
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
) / 4)
/
avg by (%(clusterLabel)s, namespace) (
avg by (%(alert_aggregation_labels)s) (
memcached_limit_bytes{job=~".+/memcached"}
)
)
Expand All @@ -251,9 +251,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
// Convenience rule to get the CPU utilization for both a deployment and a statefulset.
// Multi-zone deployments are grouped together removing the "zone-X" suffix.
record: '%(clusterLabel)s_namespace_deployment:container_cpu_usage_seconds_total:sum_rate' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate' % _config,
expr: |||
sum by (%(clusterLabel)s, namespace, deployment) (
sum by (%(alert_aggregation_labels)s, deployment) (
label_replace(
label_replace(
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate,
Expand All @@ -269,7 +269,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
// Convenience rule to get the CPU request for both a deployment and a statefulset.
// Multi-zone deployments are grouped together removing the "zone-X" suffix.
record: '%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum' % _config,
expr: |||
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref:
Expand All @@ -279,7 +279,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
# where kube_pod_container_resource_requests_cpu_cores was removed:
(
sum by (%(clusterLabel)s, namespace, deployment) (
sum by (%(alert_aggregation_labels)s, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests_cpu_cores,
Expand All @@ -295,7 +295,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
# This expression is compatible with kube-state-metrics >= v1.4.0,
# where kube_pod_container_resource_requests was introduced.
(
sum by (%(clusterLabel)s, namespace, deployment) (
sum by (%(alert_aggregation_labels)s, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests{resource="cpu"},
Expand All @@ -313,26 +313,26 @@ local utils = import 'mixin-utils/utils.libsonnet';
// Jobs should be sized to their CPU usage.
// We do this by comparing 99th percentile usage over the last 24hrs to
// their current provisioned #replicas and resource requests.
record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
labels: {
reason: 'cpu_usage',
},
expr: |||
ceil(
%(clusterLabel)s_namespace_deployment:actual_replicas:count
%(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count
*
quantile_over_time(0.99, %(clusterLabel)s_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
/
%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum
)
||| % _config,
},
{
// Convenience rule to get the Memory utilization for both a deployment and a statefulset.
// Multi-zone deployments are grouped together removing the "zone-X" suffix.
record: '%(clusterLabel)s_namespace_deployment:container_memory_usage_bytes:sum' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum' % _config,
expr: |||
sum by (%(clusterLabel)s, namespace, deployment) (
sum by (%(alert_aggregation_labels)s, deployment) (
label_replace(
label_replace(
container_memory_usage_bytes,
Expand All @@ -348,7 +348,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
// Convenience rule to get the Memory request for both a deployment and a statefulset.
// Multi-zone deployments are grouped together removing the "zone-X" suffix.
record: '%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum' % _config,
expr: |||
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref:
Expand All @@ -358,7 +358,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
# where kube_pod_container_resource_requests_memory_bytes was removed:
(
sum by (%(clusterLabel)s, namespace, deployment) (
sum by (%(alert_aggregation_labels)s, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests_memory_bytes,
Expand All @@ -374,7 +374,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
# This expression is compatible with kube-state-metrics >= v1.4.0,
# where kube_pod_container_resource_requests was introduced.
(
sum by (%(clusterLabel)s, namespace, deployment) (
sum by (%(alert_aggregation_labels)s, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests{resource="memory"},
Expand All @@ -392,17 +392,17 @@ local utils = import 'mixin-utils/utils.libsonnet';
// Jobs should be sized to their Memory usage.
// We do this by comparing 99th percentile usage over the last 24hrs to
// their current provisioned #replicas and resource requests.
record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
labels: {
reason: 'memory_usage',
},
expr: |||
ceil(
%(clusterLabel)s_namespace_deployment:actual_replicas:count
%(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count
*
quantile_over_time(0.99, %(clusterLabel)s_namespace_deployment:container_memory_usage_bytes:sum[24h])
quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum[24h])
/
%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum
)
||| % _config,
},
Expand Down Expand Up @@ -479,7 +479,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
rules: [
{
// cortex_ingester_ingested_samples_total is per user, in this rule we want to see the sum per cluster/namespace/instance
record: '%s_namespace_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.clusterLabel, $._config.per_instance_label],
record: '%s_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.alert_aggregation_rule_prefix, $._config.per_instance_label],
expr: |||
sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m]))
||| % $._config,
Expand Down

0 comments on commit 7b32d0a

Please sign in to comment.