Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(dynatraceexporter): provide better estimated summaries for partial histograms #11044

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
- `examples`: Add an example for scraping Couchbase metrics (#10894)
- `filestorageextension`: Add background compaction capability (#9327)
- `googlecloudpubsubreceiver`: Added new `Endpoint` and `Insecure` connection configuration options. (#10845)
- `dynatraceexporter`: Provide better estimated summaries for partial histograms. (#11044)
- `mongodbreceiver`: Add integration test for mongodb receiver (#10864)
- `mezmoexporter`: add logging for HTTP errors (#10875)
- `signalfxexporter`: Enable the exporting of seven Kubernetes metrics used in Splunk/SignalFx content by default (#11032)
Expand Down
130 changes: 89 additions & 41 deletions exporter/dynatraceexporter/internal/serialization/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,14 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
return "", nil
}

min, max := estimateHistMinMax(dp)
min, max, sum := histDataPointToSummary(dp)

dm, err := dtMetric.NewMetric(
name,
dtMetric.WithPrefix(prefix),
dtMetric.WithDimensions(dims),
dtMetric.WithTimestamp(dp.Timestamp().AsTime()),
dtMetric.WithFloatSummaryValue(
min,
max,
dp.Sum(),
int64(dp.Count()),
),
dtMetric.WithFloatSummaryValue(min, max, sum, int64(dp.Count())),
)

if err != nil {
Expand All @@ -58,64 +53,117 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
return dm.Serialize()
}

// estimateHistMinMax returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets.
func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) {
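// histDataPointToSummary returns the minimum, maximum, and sum (in that order) for the histogram data point.
// Any of the three values that the data point does not provide is estimated from its buckets.
// It MUST NOT be called with a data point with dp.Count() == 0, because the average is computed by dividing by the count.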
func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
bounds := dp.MExplicitBounds()
counts := dp.MBucketCounts()

// shortcut in the case both max and min are provided
if dp.HasMin() && dp.HasMax() {
return dp.Min(), dp.Max()
// shortcut if min, max, and sum are provided
if dp.HasMin() && dp.HasMax() && dp.HasSum() {
return dp.Min(), dp.Max(), dp.Sum()
}

// Because we do not know the actual min and max, we estimate them based on the min and max non-empty bucket
minIdx, maxIdx := -1, -1
for y := 0; y < len(counts); y++ {
if counts[y] > 0 {
if minIdx == -1 {
minIdx = y
}
maxIdx = y
}
// a single-bucket histogram is a special case
if len(counts) == 1 {
return estimateSingleBucketHistogram(dp)
}

if minIdx == -1 || maxIdx == -1 {
return 0, 0
}
// If any of min, max, sum is not provided in the data point,
// loop through the buckets to estimate them.
// All three values are estimated in order to avoid looping multiple times
// or complicating the loop with branches. After the loop, estimates
// will be overridden with any values provided by the data point.
foundNonEmptyBucket := false
var min, max, sum float64 = 0, 0, 0

// Because we do not know the actual min, max, or sum, we estimate them based on non-empty buckets
for i := 0; i < len(counts); i++ {
// empty bucket
if counts[i] == 0 {
continue
}

var min, max float64
// range for bucket counts[i] is bounds[i-1] to bounds[i]

if dp.HasMin() {
min = dp.Min()
} else {
// Use lower bound for min unless it is the first bucket which has no lower bound, then use upper
if minIdx == 0 {
min = bounds[minIdx]
// min estimation
if !foundNonEmptyBucket {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hum.. not sure I get the motive of this var here? Couldn't we just check if it's i == 0 it will only go inside once..🤔

Copy link
Member Author

@dyladan dyladan Jun 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first non-empty bucket might not be the first bucket.

foundNonEmptyBucket = true
if i == 0 {
// if we're in the first bucket, the best estimate we can make for min is the upper bound
min = bounds[i]
} else {
min = bounds[i-1]
}
}

// max estimation
if i == len(counts)-1 {
dyladan marked this conversation as resolved.
Show resolved Hide resolved
// if we're in the last bucket, the best estimate we can make for max is the lower bound
max = bounds[i-1]
} else {
min = bounds[minIdx-1]
max = bounds[i]
}

// sum estimation
switch i {
case 0:
// in the first bucket, estimate sum using the upper bound
sum += float64(counts[i]) * bounds[i]
case len(counts) - 1:
// in the last bucket, estimate sum using the lower bound
sum += float64(counts[i]) * bounds[i-1]
default:
// in any other bucket, estimate sum using the bucket midpoint
sum += float64(counts[i]) * (bounds[i] + bounds[i-1]) / 2
}
}

// Override estimates with any values provided by the data point
if dp.HasMin() {
dyladan marked this conversation as resolved.
Show resolved Hide resolved
min = dp.Min()
}
if dp.HasMax() {
max = dp.Max()
} else {
// Use upper bound for max unless it is the last bucket which has no upper bound, then use lower
if maxIdx == len(counts)-1 {
max = bounds[maxIdx-1]
} else {
max = bounds[maxIdx]
}
}
if dp.HasSum() {
sum = dp.Sum()
}

// Set min to average when higher than average. This can happen when most values are lower than first boundary (falling in first bucket).
// Set max to average when lower than average. This can happen when most values are higher than last boundary (falling in last bucket).
avg := dp.Sum() / float64(dp.Count())
// dp.Count() will never be zero
avg := sum / float64(dp.Count())
if min > avg {
min = avg
}
if max < avg {
max = avg
}

return min, max
return min, max, sum
}

func estimateSingleBucketHistogram(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
min, max, sum := 0.0, 0.0, 0.0

if dp.HasSum() {
sum = dp.Sum()
}

mean := sum / float64(dp.Count())
dyladan marked this conversation as resolved.
Show resolved Hide resolved
dyladan marked this conversation as resolved.
Show resolved Hide resolved

if dp.HasMin() {
min = dp.Min()
} else {
min = mean
}

if dp.HasMax() {
max = dp.Max()
} else {
max = mean
}

return min, max, sum
}
193 changes: 193 additions & 0 deletions exporter/dynatraceexporter/internal/serialization/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,4 +131,197 @@ func Test_serializeHistogram(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, "prefix.min_max_hist gauge,min=3,max=7,sum=10,count=2 1626438600000", got)
})

dyladan marked this conversation as resolved.
Show resolved Hide resolved
t.Run("when min is not provided it should be estimated", func(t *testing.T) {
t.Run("values between first two boundaries", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
hist.SetCount(6)
hist.SetSum(21.2)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 1.0, min, "use bucket min")
})

t.Run("first bucket has value", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
hist.SetCount(8)
hist.SetSum(34.5)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 1.0, min, "use the first boundary as estimation instead of Inf")
})

t.Run("only the first bucket has values, use the mean", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{3, 0, 0, 0, 0, 0})
hist.SetCount(3)
hist.SetSum(0.75)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 0.25, min)
})
t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{})
hist.SetMBucketCounts([]uint64{4})
hist.SetCount(4)
hist.SetSum(8.8)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 2.2, min, "calculate the mean as min value")
})
t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{})
hist.SetMBucketCounts([]uint64{1})
hist.SetCount(1)
hist.SetSum(1.2)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 1.2, min, "calculate the mean as min value")
})
t.Run("only the last bucket has a value", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 3})
hist.SetCount(3)
hist.SetSum(15.6)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 5.0, min, "use the lower bound")
})
})

t.Run("when max is not provided it should be estimated", func(t *testing.T) {
t.Run("values between the last two boundaries", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
hist.SetSum(21.2)
hist.SetCount(6)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 5.0, max, "use bucket max")
})

t.Run("last bucket has value", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
hist.SetSum(34.5)
hist.SetCount(8)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 5.0, max, "use the last boundary as estimation instead of Inf")
})

t.Run("only the last bucket has values", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 2})
hist.SetSum(20.2)
hist.SetCount(2)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 10.1, max, "use the mean (10.1) Otherwise, the max would be estimated as 5, and max >= avg would be violated")
})

t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{})
hist.SetMBucketCounts([]uint64{4})
hist.SetSum(8.8)
hist.SetCount(4)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 2.2, max, "calculate the mean as max value")
})

t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{})
hist.SetMBucketCounts([]uint64{1})
hist.SetSum(1.2)
hist.SetCount(1)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 1.2, max, "calculate the mean as max value")
})

t.Run("max is larger than sum", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{0, 5})
hist.SetMBucketCounts([]uint64{0, 2, 0})
hist.SetSum(2.3)
hist.SetCount(2)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 5.0, max, "use the estimated boundary")
})
})

t.Run("when sum is not provided it should be estimated", func(t *testing.T) {
t.Run("single bucket histogram", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{})
hist.SetMBucketCounts([]uint64{13})
hist.SetCount(6)

_, _, sum := histDataPointToSummary(hist)

assert.Equal(t, 0.0, sum, "estimate zero (midpoint of [-Inf, Inf])")
})

t.Run("data in bounded buckets", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 0})
hist.SetCount(6)

_, _, sum := histDataPointToSummary(hist)

assert.Equal(t, 3*1.5+5*2.5, sum, "estimate sum using bucket midpoints")
})

t.Run("data in unbounded buckets", func(t *testing.T) {
t.Run("first bucket", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{2, 3, 5, 0, 0, 0})
hist.SetCount(6)

_, _, sum := histDataPointToSummary(hist)

assert.Equal(t, 1*2+3*1.5+5*2.5, sum, "use bucket upper bound")
})

t.Run("last bucket", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 2})
hist.SetCount(6)

_, _, sum := histDataPointToSummary(hist)

assert.Equal(t, 3*1.5+5*2.5+2*5, sum, "use bucket upper bound")
})
})
})
}