diff --git a/common/metrics/config.go b/common/metrics/config.go index 7d762fa0d90..dd7368c1b66 100644 --- a/common/metrics/config.go +++ b/common/metrics/config.go @@ -57,7 +57,7 @@ type ( ClientConfig struct { // Tags is the set of key-value pairs to be reported as part of every metric Tags map[string]string `yaml:"tags"` - // IgnoreTags is a map from tag name string to tag values string list. + // ExcludeTags is a map from tag name string to tag values string list. // Each value present in keys will have relevant tag value replaced with "_tag_excluded_" // Each value in values list will white-list tag values to be reported as usual. ExcludeTags map[string][]string `yaml:"excludeTags"` diff --git a/common/metrics/metricstest/metricstest.go b/common/metrics/metricstest/metricstest.go index 777d3a8f0f7..186999386f3 100644 --- a/common/metrics/metricstest/metricstest.go +++ b/common/metrics/metricstest/metricstest.go @@ -25,6 +25,7 @@ package metricstest import ( + "errors" "fmt" "net/http" "net/http/httptest" @@ -36,7 +37,9 @@ import ( "github.com/prometheus/common/expfmt" exporters "go.opentelemetry.io/otel/exporters/prometheus" "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/unit" sdkmetrics "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/aggregation" "golang.org/x/exp/maps" "go.temporal.io/server/common/log" @@ -55,29 +58,58 @@ type ( sampleValue float64 } + HistogramBucket struct { + value float64 + upperBound float64 + } + + histogramSample struct { + metricType dto.MetricType + labelValues map[string]string + buckets []HistogramBucket + } + Snapshot struct { - samples map[string]sample + samples map[string]sample + histogramSamples map[string]histogramSample } ) -func MustNewHandler(logger log.Logger) *Handler { - h, err := NewHandler(logger) - if err != nil { - panic(err) - } - return h -} +// Potential errors that the test handler can return trying to find a metric to return. +var ( + ErrMetricNotFound = errors.New("metric not found") + ErrMetricTypeMismatch = errors.New("metric is not the expected type") + ErrMetricLabelMismatch = errors.New("metric labels do not match expected labels") +) -func NewHandler(logger log.Logger) (*Handler, error) { +func NewHandler(logger log.Logger, clientConfig metrics.ClientConfig) (*Handler, error) { registry := prometheus.NewRegistry() exporter, err := exporters.New(exporters.WithRegisterer(registry)) if err != nil { return nil, err } - provider := sdkmetrics.NewMeterProvider(sdkmetrics.WithReader(exporter)) + // Set any custom histogram bucket configuration. + var views []sdkmetrics.View + for _, u := range []string{metrics.Dimensionless, metrics.Bytes, metrics.Milliseconds} { + views = append(views, sdkmetrics.NewView( + sdkmetrics.Instrument{ + Kind: sdkmetrics.InstrumentKindHistogram, + Unit: unit.Unit(u), + }, + sdkmetrics.Stream{ + Aggregation: aggregation.ExplicitBucketHistogram{ + Boundaries: clientConfig.PerUnitHistogramBoundaries[u], + }, + }, + )) + } + provider := sdkmetrics.NewMeterProvider( + sdkmetrics.WithReader(exporter), + sdkmetrics.WithView(views...), + ) meter := provider.Meter("temporal") - clientConfig := metrics.ClientConfig{} + otelHandler := metrics.NewOtelMetricsHandler(logger, &otelProvider{meter: meter}, clientConfig) metricsHandler := &Handler{ Handler: otelHandler, @@ -102,41 +134,56 @@ func (h *Handler) Snapshot() (Snapshot, error) { return Snapshot{}, err } samples := map[string]sample{} + histogramSamples := map[string]histogramSample{} for name, family := range families { for _, m := range family.GetMetric() { - labelvalues := map[string]string{} - for _, lp := range m.GetLabel() { - labelvalues[lp.GetName()] = lp.GetValue() - } - // This only records the last sample if there - // are multiple samples recorded. - switch family.GetType() { - default: - // Not yet supporting histogram, summary, untyped. - case dto.MetricType_COUNTER: - samples[name] = sample{ - metricType: family.GetType(), - labelValues: labelvalues, - sampleValue: m.Counter.GetValue(), - } - case dto.MetricType_GAUGE: - samples[name] = sample{ - metricType: family.GetType(), - labelValues: labelvalues, - sampleValue: m.Gauge.GetValue(), - } - } + collectSamples(name, family, m, samples, histogramSamples) } } - return Snapshot{samples: samples}, nil + return Snapshot{ + samples: samples, + histogramSamples: histogramSamples, + }, nil } -func (h *Handler) MustSnapshot() Snapshot { - s, err := h.Snapshot() - if err != nil { - panic(err) +func collectSamples(name string, family *dto.MetricFamily, m *dto.Metric, samples map[string]sample, histogramSamples map[string]histogramSample) { + labelvalues := map[string]string{} + for _, lp := range m.GetLabel() { + labelvalues[lp.GetName()] = lp.GetValue() + } + // This only records the last sample if there + // are multiple samples recorded. + switch family.GetType() { + default: + // Not yet supporting summary, untyped. + case dto.MetricType_HISTOGRAM: + buckets := m.Histogram.GetBucket() + hbs := []HistogramBucket{} + for _, bucket := range buckets { + hb := HistogramBucket{ + value: float64(bucket.GetCumulativeCount()), + upperBound: bucket.GetUpperBound(), + } + hbs = append(hbs, hb) + } + histogramSamples[name] = histogramSample{ + metricType: family.GetType(), + labelValues: labelvalues, + buckets: hbs, + } + case dto.MetricType_COUNTER: + samples[name] = sample{ + metricType: family.GetType(), + labelValues: labelvalues, + sampleValue: m.Counter.GetValue(), + } + case dto.MetricType_GAUGE: + samples[name] = sample{ + metricType: family.GetType(), + labelValues: labelvalues, + sampleValue: m.Gauge.GetValue(), + } } - return s } var _ metrics.OpenTelemetryProvider = (*otelProvider)(nil) @@ -158,13 +205,13 @@ func (s Snapshot) getValue(name string, metricType dto.MetricType, tags ...metri } sample, ok := s.samples[name] if !ok { - return 0, fmt.Errorf("metric %s not found", name) + return 0, fmt.Errorf("%w: %q", ErrMetricNotFound, name) } if sample.metricType != metricType { - return 0, fmt.Errorf("metric %s not a %s type", name, metricType.String()) + return 0, fmt.Errorf("%w: %q is a %s, not a %s", ErrMetricTypeMismatch, name, sample.metricType, metricType) } if !maps.Equal(sample.labelValues, labelValues) { - return 0, fmt.Errorf("metric %s label mismatch, has %v, asked for %v", name, sample.labelValues, labelValues) + return 0, fmt.Errorf("%w: %q has %v, asked for %v", ErrMetricLabelMismatch, name, sample.labelValues, labelValues) } return sample.sampleValue, nil } @@ -173,30 +220,39 @@ func (s Snapshot) Counter(name string, tags ...metrics.Tag) (float64, error) { return s.getValue(name, dto.MetricType_COUNTER, tags...) } -func (s Snapshot) MustCounter(name string, tags ...metrics.Tag) float64 { - v, err := s.Counter(name, tags...) - if err != nil { - panic(err) - } - return v -} - func (s Snapshot) Gauge(name string, tags ...metrics.Tag) (float64, error) { return s.getValue(name, dto.MetricType_GAUGE, tags...) } -func (s Snapshot) MustGauge(name string, tags ...metrics.Tag) float64 { - v, err := s.Gauge(name, tags...) - if err != nil { - panic(err) +func (s Snapshot) Histogram(name string, tags ...metrics.Tag) ([]HistogramBucket, error) { + labelValues := map[string]string{} + for _, tag := range tags { + labelValues[tag.Key()] = tag.Value() + } + + sample, ok := s.histogramSamples[name] + if !ok { + return nil, fmt.Errorf("%w: %q", ErrMetricNotFound, name) + } + if sample.metricType != dto.MetricType_HISTOGRAM { + return nil, fmt.Errorf("%w: %q is a %s, not a %s", ErrMetricTypeMismatch, name, sample.metricType, dto.MetricType_HISTOGRAM) } - return v + if !maps.Equal(sample.labelValues, labelValues) { + return nil, fmt.Errorf("%w: %q has %v, asked for %v", ErrMetricLabelMismatch, name, sample.labelValues, labelValues) + } + return sample.buckets, nil } func (s Snapshot) String() string { var b strings.Builder for n, s := range s.samples { - b.WriteString(fmt.Sprintf("%v %v %v %v\n", n, s.labelValues, s.sampleValue, s.metricType)) + _, _ = b.WriteString(fmt.Sprintf("%v %v %v %v\n", n, s.labelValues, s.sampleValue, s.metricType)) + } + for n, s := range s.histogramSamples { + _, _ = b.WriteString(fmt.Sprintf("%v %v %v\n", n, s.labelValues, s.metricType)) + for _, bucket := range s.buckets { + _, _ = b.WriteString(fmt.Sprintf(" %v: %v \n", bucket.upperBound, bucket.value)) + } } return b.String() } diff --git a/common/metrics/metricstest/metricstest_test.go b/common/metrics/metricstest/metricstest_test.go index 880986b007a..5d119eb3ae7 100644 --- a/common/metrics/metricstest/metricstest_test.go +++ b/common/metrics/metricstest/metricstest_test.go @@ -25,8 +25,10 @@ package metricstest import ( + "math" "testing" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.temporal.io/server/common/log" @@ -34,8 +36,10 @@ import ( ) func TestBasic(t *testing.T) { + t.Parallel() logger := log.NewTestLogger() - handler := MustNewHandler(logger) + handler, err := NewHandler(logger, metrics.ClientConfig{}) + require.NoError(t, err) counterName := "counter1" counterTags := []metrics.Tag{ @@ -51,8 +55,12 @@ func TestBasic(t *testing.T) { counter.Record(1) counter.Record(1) - s1 := handler.MustSnapshot() - require.Equal(t, float64(2), s1.MustCounter(counterName+"_total", expectedCounterTags...)) + s1, err := handler.Snapshot() + require.NoError(t, err) + + counterVal, err := s1.Counter(counterName+"_total", expectedCounterTags...) + require.NoError(t, err) + assert.Equal(t, float64(2), counterVal) gaugeName := "gauge1" gaugeTags := []metrics.Tag{ @@ -64,7 +72,57 @@ func TestBasic(t *testing.T) { gauge.Record(-2) gauge.Record(10) - s2 := handler.MustSnapshot() - require.Equal(t, float64(2), s2.MustCounter(counterName+"_total", expectedCounterTags...)) - require.Equal(t, float64(10), s2.MustGauge(gaugeName, expectedGaugeTags...)) + s2, err := handler.Snapshot() + require.NoError(t, err) + + counterVal, err = s2.Counter(counterName+"_total", expectedCounterTags...) + require.NoError(t, err) + assert.Equal(t, float64(2), counterVal) + + gaugeVal, err := s2.Gauge(gaugeName, expectedGaugeTags...) + require.NoError(t, err) + assert.Equal(t, float64(10), gaugeVal) +} + +func TestHistogram(t *testing.T) { + t.Parallel() + logger := log.NewTestLogger() + handler, err := NewHandler(logger, metrics.ClientConfig{ + PerUnitHistogramBoundaries: map[string][]float64{ + metrics.Dimensionless: { + 1, + 2, + 5, + }, + }, + }) + require.NoError(t, err) + + histogramName := "histogram1" + histogramTags := []metrics.Tag{ + metrics.StringTag("l2", "v2"), + metrics.StringTag("l1", "v1"), + } + expectedSystemTags := []metrics.Tag{ + metrics.StringTag("otel_scope_name", "temporal"), + metrics.StringTag("otel_scope_version", ""), + } + expectedHistogramTags := append(expectedSystemTags, histogramTags...) + histogram := handler.WithTags(histogramTags...).Histogram(histogramName, metrics.Dimensionless) + histogram.Record(1) + histogram.Record(3) + + s1, err := handler.Snapshot() + require.NoError(t, err) + + expectedBuckets := []HistogramBucket{ + {value: 1, upperBound: 1}, + {value: 1, upperBound: 2}, + {value: 2, upperBound: 5}, + {value: 2, upperBound: math.Inf(1)}, + } + + histogramVal, err := s1.Histogram(histogramName+"_ratio", expectedHistogramTags...) + require.NoError(t, err) + assert.Equal(t, expectedBuckets, histogramVal) }