Skip to content
This repository has been archived by the owner on Mar 27, 2024. It is now read-only.

Commit

Permalink
feat(consul): collect prometheus key metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
ilyam8 committed Dec 19, 2022
1 parent 0b0ba23 commit c269d71
Show file tree
Hide file tree
Showing 16 changed files with 4,487 additions and 583 deletions.
442 changes: 381 additions & 61 deletions modules/consul/charts.go

Large diffs are not rendered by default.

25 changes: 22 additions & 3 deletions modules/consul/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,38 @@ import (
"github.com/netdata/go.d.plugin/pkg/web"
)

const (
// precision is the fixed-point multiplier applied to floating-point metric
// values before they are truncated to int64 and stored in the metrics map.
precision = 1000
)

func (c *Consul) collect() (map[string]int64, error) {
if c.cfg == nil {
if err := c.collectConfiguration(); err != nil {
return nil, err
}

c.addGlobalChartsOnce.Do(c.addGlobalCharts)
}

mx := make(map[string]int64)

if err := c.collectAgentChecks(mx); err != nil {
if err := c.collectChecks(mx); err != nil {
return nil, err
}
if err := c.collectAgentMetrics(mx); err != nil {
return nil, err

if c.isTelemetryPrometheusEnabled() {
if err := c.collectMetricsPrometheus(mx); err != nil {
return nil, err
}
}

return mx, nil
}

// isTelemetryPrometheusEnabled reports whether the agent's Prometheus
// telemetry is considered enabled: it is true for every value of the
// Prometheus expiration setting except the literal "0s".
func (c *Consul) isTelemetryPrometheusEnabled() bool {
	expiration := c.cfg.DebugConfig.Telemetry.PrometheusOpts.Expiration
	return !(expiration == "0s")
}

func (c *Consul) doOKDecode(urlPath string, in interface{}) error {
req, err := web.NewHTTPRequest(c.Request.Copy())
if err != nil {
Expand Down
15 changes: 6 additions & 9 deletions modules/consul/collect_checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

package consul

const (
// urlPathAgentChecks is the agent API path requested to list the checks.
// https://www.consul.io/api-docs/agent/check#list-checks
urlPathAgentChecks = "/v1/agent/checks"
)

type agentCheck struct {
Node string
CheckID string
Expand All @@ -12,22 +17,14 @@ type agentCheck struct {
ServiceTags []string
}

// https://www.consul.io/api-docs/agent/check#list-checks
const urlPathAgentChecks = "/v1/agent/checks"

func (c *Consul) collectAgentChecks(mx map[string]int64) error {
func (c *Consul) collectChecks(mx map[string]int64) error {
var checks map[string]*agentCheck

if err := c.doOKDecode(urlPathAgentChecks, &checks); err != nil {
return err
}

for id, check := range checks {
if !c.checksSr.MatchString(id) {
c.Debugf("check with id '%s' does not match the selector ('%s'), skipping it", id, c.ChecksSelector)
continue
}

if !c.checks[id] {
c.checks[id] = true
c.addHealthCheckCharts(check)
Expand Down
54 changes: 54 additions & 0 deletions modules/consul/collect_config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// SPDX-License-Identifier: GPL-3.0-or-later

package consul

import (
"github.com/blang/semver/v4"
)

const (
// urlPathAgentSelf is the agent API path requested to read the agent's
// own configuration.
// https://developer.hashicorp.com/consul/api-docs/agent#read-configuration
urlPathAgentSelf = "/v1/agent/self"
)

// consulConfig holds the parts of the agent self-configuration response
// (urlPathAgentSelf) that are decoded by collectConfiguration.
type consulConfig struct {
Config struct {
Datacenter string
PrimaryDatacenter string
NodeName string // used to build hostname-prefixed Prometheus metric names
NodeID string
Server bool // gates the collection of the server-only key metrics
Version string // parsed as a semantic version by collectConfiguration
}
DebugConfig struct {
Telemetry struct {
MetricsPrefix string // prefix of every Prometheus metric name the collector looks up
DisableHostname bool
PrometheusOpts struct {
Expiration string // the value "0s" makes the collector skip Prometheus metrics
Name string
}
}
}
}

// collectConfiguration reads the agent's own configuration from
// urlPathAgentSelf, stores it in c.cfg and, when the reported version parses
// as a semantic version, stores it in c.version.
//
// A request/decoding failure is returned to the caller. A version that cannot
// be parsed is only logged as a warning: the configuration is still kept and
// nil is returned.
func (c *Consul) collectConfiguration() error {
	cfg := new(consulConfig)
	if err := c.doOKDecode(urlPathAgentSelf, cfg); err != nil {
		return err
	}

	c.cfg = cfg
	c.Debugf("consul config: %+v", *cfg)

	if ver, err := semver.New(cfg.Config.Version); err == nil {
		c.version = ver
	} else {
		// Best effort: an unparsable version must not fail the collection.
		c.Warningf("error on parsing Consul version '%s': %v", cfg.Config.Version, err)
	}

	return nil
}
164 changes: 120 additions & 44 deletions modules/consul/collect_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,64 +2,140 @@

package consul

type agentMetrics struct {
Gauges []struct {
Name string
Value int64
Labels map[string]string
import (
"fmt"
"math"
"strconv"
"strings"

"github.com/netdata/go.d.plugin/pkg/prometheus"
)

// collectMetricsPrometheus obtains the metric families from c.prom.Scrape and
// stores the Consul key metrics found in them in mx.
//
// Server-only metrics are collected only when the agent configuration marks
// the agent as a server; client RPC and runtime metrics are always collected.
// Only a scrape failure is reported as an error; metrics that are absent from
// the scrape are skipped by the individual collectors.
func (c *Consul) collectMetricsPrometheus(mx map[string]int64) error {
	mfs, err := c.prom.Scrape()
	if err != nil {
		return err
	}

	// Key Metrics (https://developer.hashicorp.com/consul/docs/agent/telemetry#key-metrics)

	if c.cfg.Config.Server {
		serverSummaries := []string{
			"raft_thread_main_saturation",
			"raft_thread_fsm_saturation",
			"raft_boltdb_logsPerBatch",
			"kvs_apply",
			"txn_apply",
			"raft_boltdb_storeLogs",
			"raft_rpc_installSnapshot", // makes sense for followers only
			"raft_commitTime",          // makes sense for the leader only
			"raft_leader_lastContact",  // makes sense for the leader only
		}
		for _, name := range serverSummaries {
			c.collectSummary(mx, mfs, name)
		}

		serverCounters := []struct {
			name string
			mul  float64
		}{
			{"raft_apply", precision}, // makes sense for the leader only
			{"raft_state_candidate", 1},
			{"raft_state_leader", 1},
		}
		for _, cnt := range serverCounters {
			c.collectCounter(mx, mfs, cnt.name, cnt.mul)
		}

		for _, name := range []string{"autopilot_healthy", "server_isLeader"} {
			c.collectGaugeBool(mx, mfs, name)
		}

		serverGauges := []string{
			"autopilot_failure_tolerance",
			"raft_fsm_lastRestoreDuration",
			"raft_leader_oldestLogAge", // makes sense for the leader only
			"raft_boltdb_freelistBytes",
		}
		for _, name := range serverGauges {
			c.collectGauge(mx, mfs, name)
		}
	}

	for _, name := range []string{"client_rpc", "client_rpc_exceeded", "client_rpc_failed"} {
		c.collectCounter(mx, mfs, name, 1)
	}

	for _, name := range []string{"runtime_alloc_bytes", "runtime_sys_bytes", "runtime_total_gc_pause_ns"} {
		c.collectGauge(mx, mfs, name)
	}

	return nil
}

func (c *Consul) collectGauge(mx map[string]int64, mfs prometheus.MetricFamilies, name string) {
mf := mfs.GetGauge(c.promMetricNameWithHostname(name))
if mf == nil {
mf = mfs.GetGauge(c.promMetricName(name))
}
Counters []struct {
Name string
Count int64
Labels map[string]string
if mf == nil {
return
}
Samples []struct {
Name string
Count int64
Labels map[string]string

v := mf.Metrics()[0].Gauge().Value()

if !math.IsNaN(v) {
mx[name] = int64(v)
}
}

// https://www.consul.io/api-docs/agent#view-metrics
const urlPathAgentMetrics = "/v1/agent/metrics"
func (c *Consul) collectGaugeBool(mx map[string]int64, mfs prometheus.MetricFamilies, name string) {
mf := mfs.GetGauge(c.promMetricNameWithHostname(name))
if mf == nil {
mf = mfs.GetGauge(c.promMetricName(name))
}
if mf == nil {
return
}

v := mf.Metrics()[0].Gauge().Value()

func (c *Consul) collectAgentMetrics(mx map[string]int64) error {
var metrics agentMetrics
if !math.IsNaN(v) {
mx[name+"_yes"] = boolToInt(v == 1)
mx[name+"_no"] = boolToInt(v == 0)
}
}

if err := c.doOKDecode(urlPathAgentMetrics, &metrics); err != nil {
return err
func (c *Consul) collectCounter(mx map[string]int64, mfs prometheus.MetricFamilies, name string, mul float64) {
mf := mfs.GetCounter(c.promMetricName(name))
if mf == nil {
return
}

for _, m := range metrics.Gauges {
switch m.Name {
case "consul.server.isLeader":
mx[m.Name+".yes"] = boolToInt(m.Value == 1)
mx[m.Name+".no"] = boolToInt(m.Value != 1)
case "consul.autopilot.healthy":
mx[m.Name+".yes"] = boolToInt(m.Value == 1)
mx[m.Name+".no"] = boolToInt(m.Value != 1)
case
"consul.autopilot.failure_tolerance",
"consul.runtime.alloc_bytes",
"consul.runtime.sys_bytes",
"consul.runtime.total_gc_pause_ns":
mx[m.Name] = m.Value
}
v := mf.Metrics()[0].Counter().Value()

if !math.IsNaN(v) {
mx[name] = int64(v * mul)
}
}

for _, m := range metrics.Counters {
switch m.Name {
case "consul.client.rpc":
mx[m.Name] = m.Count
}
func (c *Consul) collectSummary(mx map[string]int64, mfs prometheus.MetricFamilies, name string) {
mf := mfs.GetSummary(c.promMetricName(name))
if mf == nil {
return
}

for _, m := range metrics.Samples {
switch m.Name {
case "consul.client.rpc":
mx[m.Name] = m.Count
m := mf.Metrics()[0]

for _, q := range m.Summary().Quantiles() {
v := q.Value()
// MaxAge is 10 seconds (hardcoded)
// https://github.com/hashicorp/go-metrics/blob/b6d5c860c07ef6eeec89f4a662c7b452dd4d0c93/prometheus/prometheus.go#L227
if math.IsNaN(v) {
v = 0
}

id := fmt.Sprintf("%s_quantile=%s", name, formatFloat(q.Quantile()))
mx[id] = int64(v * precision * precision)
}

return nil
mx[name+"_sum"] = int64(m.Summary().Sum() * precision)
mx[name+"_count"] = int64(m.Summary().Count())
}

// promMetricName returns the Prometheus name of the metric called name: the
// configured telemetry metrics prefix and name joined by an underscore.
func (c *Consul) promMetricName(name string) string {
	return c.cfg.DebugConfig.Telemetry.MetricsPrefix + "_" + name
}

// promMetricNameWithHostname returns the Prometheus name of the metric called
// name with the node name inserted between the metrics prefix and name:
// "<prefix>_<node>_<name>".
//
// Whether the agent exports such names is controlled by 'disable_hostname'
// https://developer.hashicorp.com/consul/docs/agent/config/config-files#telemetry-disable_hostname
//
// Every character of the node name that the go-metrics Prometheus sink
// rewrites (space, '.', '=', '-', '/') is replaced with '_', so node names
// such as "node-1.dc1.example" still match the exported metric name.
func (c *Consul) promMetricNameWithHostname(name string) string {
	px := c.cfg.DebugConfig.Telemetry.MetricsPrefix
	node := strings.Map(func(r rune) rune {
		switch r {
		case ' ', '.', '=', '-', '/':
			return '_'
		}
		return r
	}, c.cfg.Config.NodeName)

	return px + "_" + node + "_" + name
}

// formatFloat renders v in plain decimal notation (no exponent) using the
// smallest number of digits that still represents the float64 value exactly.
func formatFloat(v float64) string {
	digits := strconv.AppendFloat(nil, v, 'f', -1, 64)
	return string(digits)
}
Loading

0 comments on commit c269d71

Please sign in to comment.