Skip to content
This repository has been archived by the owner on Mar 27, 2024. It is now read-only.

feat(consul): add additional autopilot health metrics #1033

Merged
merged 1 commit into from
Dec 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 35 additions & 30 deletions modules/consul/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,36 +36,41 @@ Labels per scope:
- node check: datacenter, node_name, check_name.
- service check: datacenter, node_name, check_name, service_name.

| Metric | Scope | Dimensions | Units | Server Leader | Server Follower | Client |
|-----------------------------------|:-------------:|:-----------------------------------------:|:-------------:|:-------------:|:---------------:|:------:|
| node_health_check_status | node check | passing, maintenance, warning, critical | status | yes | yes | yes |
| service_health_check_status | service check | passing, maintenance, warning, critical | status | yes | yes | yes |
| client_rpc_requests_rate | global | rpc | requests/s | yes | yes | yes |
| client_rpc_requests_exceeded_rate | global | exceeded | requests/s | yes | yes | yes |
| client_rpc_requests_failed_rate | global | failed | requests/s | yes | yes | yes |
| memory_allocated | global | allocated | bytes | yes | yes | yes |
| memory_sys | global | sys | bytes | yes | yes | yes |
| gc_pause_time | global | gc_pause | seconds | yes | yes | yes |
| kvs_apply_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | yes | no |
| kvs_apply_operations_rate | global | kvs_apply | ops/s | yes | yes | no |
| txn_apply_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | yes | no |
| txn_apply_operations_rate | global | txn_apply | ops/s | yes | yes | no |
| raft_commit_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | no | no |
| raft_commits_rate | global | commits | commits/s | yes | no | no |
| autopilot_health_status | global | healthy, unhealthy | status | yes | yes | no |
| autopilot_failure_tolerance | global | failure_tolerance | servers | yes | yes | no |
| raft_leader_last_contact_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | no | no |
| raft_leader_elections_rate | global | leader | elections/s | yes | yes | no |
| raft_leadership_transitions_rate | global | leadership | transitions/s | yes | yes | no |
| server_leadership_status | global | leader, not_leader | status | yes | yes | no |
| raft_thread_main_saturation_perc | global | quantile_0.5, quantile_0.9, quantile_0.99 | percentage | yes | yes | no |
| raft_thread_fsm_saturation_perc | global | quantile_0.5, quantile_0.9, quantile_0.99 | percentage | yes | yes | no |
| raft_fsm_last_restore_duration | global | last_restore_duration | ms | yes | yes | no |
| raft_leader_oldest_log_age | global | oldest_log_age | seconds | yes | no | no |
| raft_rpc_install_snapshot_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | no | yes | no |
| raft_boltdb_freelist_bytes | global | freelist | bytes | yes | yes | no |
| raft_boltdb_logs_per_batch_rate | global | written | logs/s | yes | yes | no |
| raft_boltdb_store_logs_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | yes | no |
| Metric | Scope | Dimensions | Units | Server Leader | Server Follower | Client |
|----------------------------------------|:-------------:|:-----------------------------------------:|:-------------:|:-------------:|:---------------:|:------:|
| node_health_check_status | node check | passing, maintenance, warning, critical | status | yes | yes | yes |
| service_health_check_status | service check | passing, maintenance, warning, critical | status | yes | yes | yes |
| client_rpc_requests_rate | global | rpc | requests/s | yes | yes | yes |
| client_rpc_requests_exceeded_rate | global | exceeded | requests/s | yes | yes | yes |
| client_rpc_requests_failed_rate | global | failed | requests/s | yes | yes | yes |
| memory_allocated | global | allocated | bytes | yes | yes | yes |
| memory_sys | global | sys | bytes | yes | yes | yes |
| gc_pause_time | global | gc_pause | seconds | yes | yes | yes |
| kvs_apply_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | yes | no |
| kvs_apply_operations_rate | global | kvs_apply | ops/s | yes | yes | no |
| txn_apply_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | yes | no |
| txn_apply_operations_rate | global | txn_apply | ops/s | yes | yes | no |
| raft_commit_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | no | no |
| raft_commits_rate | global | commits | commits/s | yes | no | no |
| autopilot_health_status | global | healthy, unhealthy | status | yes | yes | no |
| autopilot_failure_tolerance | global | failure_tolerance | servers | yes | yes | no |
| autopilot_server_health_status | global | healthy, unhealthy | status | yes | yes | no |
| autopilot_server_stable_time | global | stable | seconds | yes | yes | no |
| autopilot_server_serf_status           | global        | alive, failed, left, none                 | status        | yes           | yes             | no     |
| autopilot_server_voter_status | global | voter, not_voter | status | yes | yes | no |
| raft_leader_last_contact_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | no | no |
| raft_follower_last_contact_leader_time | global | leader_last_contact | ms | no | yes | no |
| raft_leader_elections_rate | global | leader | elections/s | yes | yes | no |
| raft_leadership_transitions_rate | global | leadership | transitions/s | yes | yes | no |
| server_leadership_status | global | leader, not_leader | status | yes | yes | no |
| raft_thread_main_saturation_perc | global | quantile_0.5, quantile_0.9, quantile_0.99 | percentage | yes | yes | no |
| raft_thread_fsm_saturation_perc | global | quantile_0.5, quantile_0.9, quantile_0.99 | percentage | yes | yes | no |
| raft_fsm_last_restore_duration | global | last_restore_duration | ms | yes | yes | no |
| raft_leader_oldest_log_age | global | oldest_log_age | seconds | yes | no | no |
| raft_rpc_install_snapshot_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | no | yes | no |
| raft_boltdb_freelist_bytes | global | freelist | bytes | yes | yes | no |
| raft_boltdb_logs_per_batch_rate | global | written | logs/s | yes | yes | no |
| raft_boltdb_store_logs_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | yes | no |

## Configuration

Expand Down
82 changes: 76 additions & 6 deletions modules/consul/charts.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,16 @@ const (

prioServerLeadershipStatus
prioRaftLeaderLastContactTime
prioRaftFollowerLastContactLeaderTime
prioRaftLeaderElections
prioRaftLeadershipTransitions

prioAutopilotHealthStatus
prioAutopilotClusterHealthStatus
prioAutopilotFailureTolerance
prioAutopilotServerHealthStatus
prioAutopilotServerStableTime
prioAutopilotServerSerfStatus
prioAutopilotServerVoterStatus

prioRPCRequests
prioRPCRequestsExceeded
Expand Down Expand Up @@ -68,6 +73,7 @@ var (
raftLeaderOldestLogAgeChart.Copy(),
}
serverFollowerCharts = module.Charts{
raftFollowerLastContactLeaderTimeChart.Copy(),
raftRPCInstallSnapshotTimeChart.Copy(),
}
serverCommonCharts = module.Charts{
Expand All @@ -76,8 +82,12 @@ var (
txnApplyTimeChart.Copy(),
txnApplyOperationsRateChart.Copy(),

autopilotHealthStatusChart.Copy(),
autopilotClusterHealthStatusChart.Copy(),
autopilotFailureTolerance.Copy(),
autopilotServerHealthStatusChart.Copy(),
autopilotServerStableTimeChart.Copy(),
autopilotServerSerfStatusChart.Copy(),
autopilotServerVoterStatusChart.Copy(),

raftLeaderElectionsRateChart.Copy(),
raftLeadershipTransitionsRateChart.Copy(),
Expand Down Expand Up @@ -175,21 +185,21 @@ var (
},
}

autopilotHealthStatusChart = module.Chart{
autopilotClusterHealthStatusChart = module.Chart{
ID: "autopilot_health_status",
Title: "Autopilot health status",
Title: "Autopilot cluster health status",
Units: "status",
Fam: "autopilot",
Ctx: "consul.autopilot_health_status",
Priority: prioAutopilotHealthStatus,
Priority: prioAutopilotClusterHealthStatus,
Dims: module.Dims{
{ID: "autopilot_healthy_yes", Name: "healthy"},
{ID: "autopilot_healthy_no", Name: "unhealthy"},
},
}
autopilotFailureTolerance = module.Chart{
ID: "autopilot_failure_tolerance",
Title: "Autopilot failure tolerance",
Title: "Autopilot cluster failure tolerance",
Units: "servers",
Fam: "autopilot",
Ctx: "consul.autopilot_failure_tolerance",
Expand All @@ -198,6 +208,55 @@ var (
{ID: "autopilot_failure_tolerance", Name: "failure_tolerance"},
},
}
	// autopilotServerHealthStatusChart reports whether this server is
	// considered healthy by autopilot, as a yes/no dimension pair.
	autopilotServerHealthStatusChart = module.Chart{
		ID:       "autopilot_server_health_status",
		Title:    "Autopilot server health status",
		Units:    "status",
		Fam:      "autopilot",
		Ctx:      "consul.autopilot_server_health_status",
		Priority: prioAutopilotServerHealthStatus,
		Dims: module.Dims{
			{ID: "autopilot_server_healthy_yes", Name: "healthy"},
			{ID: "autopilot_server_healthy_no", Name: "unhealthy"},
		},
	}
	// autopilotServerStableTimeChart reports how long this server has been
	// in its current (healthy) state, in seconds. The dimension is written
	// by the collector only while the server is healthy.
	autopilotServerStableTimeChart = module.Chart{
		ID:       "autopilot_server_stable_time",
		Title:    "Autopilot server stable time",
		Units:    "seconds",
		Fam:      "autopilot",
		Ctx:      "consul.autopilot_server_stable_time",
		Priority: prioAutopilotServerStableTime,
		Dims: module.Dims{
			{ID: "autopilot_server_stable_time", Name: "stable"},
		},
	}
	// autopilotServerSerfStatusChart reports this server's Serf membership
	// state as a set of mutually exclusive boolean dimensions
	// (alive, failed, left, none).
	// NOTE(review): "sefStatus" in the dimension IDs looks like a typo for
	// "serfStatus", but the spelling must match the metric keys produced by
	// collectAutopilotHealth, so it is intentionally left unchanged here.
	autopilotServerSerfStatusChart = module.Chart{
		ID:       "autopilot_server_serf_status",
		Title:    "Autopilot server Serf status",
		Units:    "status",
		Fam:      "autopilot",
		Ctx:      "consul.autopilot_server_serf_status",
		Priority: prioAutopilotServerSerfStatus,
		Dims: module.Dims{
			{ID: "autopilot_server_sefStatus_alive", Name: "alive"},
			{ID: "autopilot_server_sefStatus_failed", Name: "failed"},
			{ID: "autopilot_server_sefStatus_left", Name: "left"},
			{ID: "autopilot_server_sefStatus_none", Name: "none"},
		},
	}
	// autopilotServerVoterStatusChart reports whether this server is a
	// voting member of the Raft cluster, as a yes/no dimension pair.
	autopilotServerVoterStatusChart = module.Chart{
		ID:       "autopilot_server_voter_status",
		Title:    "Autopilot server Raft voting membership",
		Units:    "status",
		Fam:      "autopilot",
		Ctx:      "consul.autopilot_server_voter_status",
		Priority: prioAutopilotServerVoterStatus,
		Dims: module.Dims{
			{ID: "autopilot_server_voter_yes", Name: "voter"},
			{ID: "autopilot_server_voter_no", Name: "not_voter"},
		},
	}

raftLeaderLastContactTimeChart = module.Chart{
ID: "raft_leader_last_contact_time",
Expand All @@ -212,6 +271,17 @@ var (
{ID: "raft_leader_lastContact_quantile=0.99", Name: "quantile_0.99", Div: precision * precision},
},
}
	// raftFollowerLastContactLeaderTimeChart reports the time since this
	// follower last had contact with the leader, in milliseconds. The
	// dimension is written by the collector only when the server is not
	// the leader, so this chart belongs to the follower-only chart set.
	raftFollowerLastContactLeaderTimeChart = module.Chart{
		ID:       "raft_follower_last_contact_leader_time",
		Title:    "Raft follower last contact with the leader time",
		Units:    "ms",
		Fam:      "leadership changes",
		Ctx:      "consul.raft_follower_last_contact_leader_time",
		Priority: prioRaftFollowerLastContactLeaderTime,
		Dims: module.Dims{
			{ID: "autopilot_server_lastContact_leader", Name: "leader_last_contact"},
		},
	}
raftLeaderElectionsRateChart = module.Chart{
ID: "raft_leader_elections_rate",
Title: "Raft leader elections rate",
Expand Down
6 changes: 6 additions & 0 deletions modules/consul/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ func (c *Consul) collect() (map[string]int64, error) {
return nil, err
}

if c.cfg.Config.Server {
if err := c.collectAutopilotHealth(mx); err != nil {
return nil, err
}
}

if c.isTelemetryPrometheusEnabled() {
if err := c.collectMetricsPrometheus(mx); err != nil {
return nil, err
Expand Down
58 changes: 58 additions & 0 deletions modules/consul/collect_autopilot.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// SPDX-License-Identifier: GPL-3.0-or-later

package consul

import "time"

const (
	// urlPathOperationAutopilotHealth is the Consul operator endpoint that
	// returns per-server autopilot health information:
	// https://developer.hashicorp.com/consul/api-docs/operator/autopilot#read-health
	urlPathOperationAutopilotHealth = "/v1/operator/autopilot/health"
)

// autopilotHealth models the subset of the autopilot health API response
// (GET /v1/operator/autopilot/health) used by the collector. Only the
// fields read in collectAutopilotHealth are declared.
type autopilotHealth struct {
	Servers []struct {
		ID          string    // node ID, matched against the local agent's NodeID
		SerfStatus  string    // one of: "alive", "left", "failed", "none"
		Leader      bool      // true when this server is the Raft leader
		LastContact string    // duration string (e.g. "32.58ms"); parsed only for followers
		Healthy     bool      // autopilot's health verdict for this server
		Voter       bool      // true when the server is a Raft voting member
		StableSince time.Time // when the server entered its current state
	}
}

// collectAutopilotHealth queries the autopilot health endpoint and records
// this server's health, Serf membership, Raft voter status, stable time
// and (for followers only) last leader contact time into mx.
//
// It is expected to run only in server mode (see the caller's
// c.cfg.Config.Server guard); the endpoint reports per-server data, and
// only the entry whose ID matches the local NodeID is used.
func (c *Consul) collectAutopilotHealth(mx map[string]int64) error {
	var health autopilotHealth

	if err := c.doOKDecode(urlPathOperationAutopilotHealth, &health); err != nil {
		return err
	}

	for _, srv := range health.Servers {
		// Only the entry describing this agent is relevant.
		if srv.ID != c.cfg.Config.NodeID {
			continue
		}

		// SerfStatus is one of: alive, left, failed or none:
		// https://github.com/hashicorp/consul/blob/c7ef04c5979dbc311ff3c67b7bf3028a93e8b0f1/agent/consul/operator_autopilot_endpoint.go#L124-L133
		// NOTE(review): "sefStatus" looks like a typo for "serfStatus", but
		// the chart dimension IDs use the same spelling, so it is kept.
		mx["autopilot_server_sefStatus_alive"] = boolToInt(srv.SerfStatus == "alive")
		mx["autopilot_server_sefStatus_left"] = boolToInt(srv.SerfStatus == "left")
		mx["autopilot_server_sefStatus_failed"] = boolToInt(srv.SerfStatus == "failed")
		mx["autopilot_server_sefStatus_none"] = boolToInt(srv.SerfStatus == "none")

		mx["autopilot_server_healthy_yes"] = boolToInt(srv.Healthy)
		mx["autopilot_server_healthy_no"] = boolToInt(!srv.Healthy)
		mx["autopilot_server_voter_yes"] = boolToInt(srv.Voter)
		mx["autopilot_server_voter_no"] = boolToInt(!srv.Voter)

		if srv.Healthy {
			mx["autopilot_server_stable_time"] = int64(time.Since(srv.StableSince).Seconds())
		}
		if !srv.Leader {
			// LastContact is a Go duration string; ignore it if unparsable.
			if v, err := time.ParseDuration(srv.LastContact); err == nil {
				mx["autopilot_server_lastContact_leader"] = v.Milliseconds()
			}
		}

		break
	}

	return nil
}
Loading