open-telemetry · codeboten · Jun 20, 2022 · Jun 14, 2022 · Jun 14, 2022 · Jun 15, 2022
@@ -28,6 +28,7 @@
 - `examples`: Add an example for scraping Couchbase metrics (#10894)
 - `filestorageextension`: Add background compaction capability (#9327)
 - `googlecloudpubsubreceiver`: Added new `Endpoint` and `Insecure` connection configuration options. (#10845)
+- `dynatraceexporter`: Provide better estimated summaries for partial histograms. (#11044)
 - `mongodbreceiver`: Add integration test for mongodb receiver (#10864)
 - `mezmoexporter`: add logging for HTTP errors (#10875)
 - `signalfxexporter`: Enable the exporting of seven Kubernetes metrics used in Splunk/SignalFx content by default (#11032)

@@ -36,19 +36,14 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
 		return "", nil
 	}
 
-	min, max := estimateHistMinMax(dp)
+	min, max, sum := histDataPointToSummary(dp)
 
 	dm, err := dtMetric.NewMetric(
 		name,
 		dtMetric.WithPrefix(prefix),
 		dtMetric.WithDimensions(dims),
 		dtMetric.WithTimestamp(dp.Timestamp().AsTime()),
-		dtMetric.WithFloatSummaryValue(
-			min,
-			max,
-			dp.Sum(),
-			int64(dp.Count()),
-		),
+		dtMetric.WithFloatSummaryValue(min, max, sum, int64(dp.Count())),
 	)
 
 	if err != nil {
@@ -58,64 +53,117 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
 	return dm.Serialize()
 }
 
-// estimateHistMinMax returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets.
-func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) {
+// histDataPointToSummary returns the minimum, maximum, and sum of the histogram, estimating from the non-empty buckets any value the data point does not provide.
+// It MUST NOT be called with a data point with dp.Count() == 0, because the count is used as a divisor below.
+func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
 	bounds := dp.MExplicitBounds()
 	counts := dp.MBucketCounts()
 
-	// shortcut in the case both max and min are provided
-	if dp.HasMin() && dp.HasMax() {
-		return dp.Min(), dp.Max()
+	// shortcut if min, max, and sum are all provided: nothing has to be estimated
+	if dp.HasMin() && dp.HasMax() && dp.HasSum() {
+		return dp.Min(), dp.Max(), dp.Sum()
 	}
 
-	// Because we do not know the actual min and max, we estimate them based on the min and max non-empty bucket
-	minIdx, maxIdx := -1, -1
-	for y := 0; y < len(counts); y++ {
-		if counts[y] > 0 {
-			if minIdx == -1 {
-				minIdx = y
-			}
-			maxIdx = y
-		}
+	// a single-bucket histogram is a special case: the loop below would index bounds[i-1] with i == 0
+	if len(counts) == 1 {
+		return estimateSingleBucketHistogram(dp)
 	}
 
-	if minIdx == -1 || maxIdx == -1 {
-		return 0, 0
-	}
+	// If any of min, max, sum is not provided in the data point,
+	// loop through the buckets to estimate them.
+	// All three values are estimated in order to avoid looping multiple times
+	// or complicating the loop with branches. After the loop, estimates
+	// will be overridden with any values provided by the data point.
+	foundNonEmptyBucket := false
+	var min, max, sum float64 = 0, 0, 0
+
+	// Because we do not know the actual min, max, or sum, we estimate them based on non-empty buckets
+	for i := 0; i < len(counts); i++ {
+		// empty bucket
+		if counts[i] == 0 {
+			continue
+		}
 
-	var min, max float64
+		// range for bucket counts[i] is bounds[i-1] to bounds[i] (the first bucket has no lower bound and the last has no upper bound)
 
-	if dp.HasMin() {
-		min = dp.Min()
-	} else {
-		// Use lower bound for min unless it is the first bucket which has no lower bound, then use upper
-		if minIdx == 0 {
-			min = bounds[minIdx]
+		// min estimation: only the first non-empty bucket contributes
+		if !foundNonEmptyBucket {
+			foundNonEmptyBucket = true
+			if i == 0 {
+				// if we're in the first bucket, the best estimate we can make for min is the upper bound
+				min = bounds[i]
+			} else {
+				min = bounds[i-1]
+			}
+		}
+
+		// max estimation: overwritten on every non-empty bucket, so the last non-empty bucket wins
+		if i == len(counts)-1 {
+			// if we're in the last bucket, the best estimate we can make for max is the lower bound
+			max = bounds[i-1]
 		} else {
-			min = bounds[minIdx-1]
+			max = bounds[i]
+		}
+
+		// sum estimation: each non-empty bucket adds count * representative value
+		switch i {
+		case 0:
+			// in the first bucket, estimate sum using the upper bound
+			sum += float64(counts[i]) * bounds[i]
+		case len(counts) - 1:
+			// in the last bucket, estimate sum using the lower bound
+			sum += float64(counts[i]) * bounds[i-1]
+		default:
+			// in any other bucket, estimate sum using the bucket midpoint
+			sum += float64(counts[i]) * (bounds[i] + bounds[i-1]) / 2
 		}
 	}
 
+	// Override estimates with any values provided by the data point
+	if dp.HasMin() {
+		min = dp.Min()
+	}
 	if dp.HasMax() {
 		max = dp.Max()
-	} else {
-		// Use upper bound for max unless it is the last bucket which has no upper bound, then use lower
-		if maxIdx == len(counts)-1 {
-			max = bounds[maxIdx-1]
-		} else {
-			max = bounds[maxIdx]
-		}
+	}
+	if dp.HasSum() {
+		sum = dp.Sum()
 	}
 
 	// Set min to average when higher than average. This can happen when most values are lower than first boundary (falling in first bucket).
 	// Set max to average when lower than average. This can happen when most values are higher than last boundary (falling in last bucket).
-	avg := dp.Sum() / float64(dp.Count())
+	// dp.Count() is non-zero by this function's contract; a zero count would make avg NaN or ±Inf
+	avg := sum / float64(dp.Count())
 	if min > avg {
 		min = avg
 	}
 	if max < avg {
 		max = avg
 	}
 
-	return min, max
+	return min, max, sum
+}
+// estimateSingleBucketHistogram returns (min, max, sum) for a data point handled as a single bucket: values reported by dp are used as-is, a missing sum is taken as 0, and a missing min or max falls back to the mean (sum / count).
+func estimateSingleBucketHistogram(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
+	min, max, sum := 0.0, 0.0, 0.0
+	// A sum that is not reported is left at 0.
+	if dp.HasSum() {
+		sum = dp.Sum()
+	}
+	// NOTE(review): a count of 0 makes mean NaN or ±Inf; the caller is expected to rule that out.
+	mean := sum / float64(dp.Count())
+	// A missing min or max collapses to the mean.
+	if dp.HasMin() {
+		min = dp.Min()
+	} else {
+		min = mean
+	}
+	// NOTE(review): with a reported min but no sum and no max, max becomes 0 and may end up below min; confirm this is acceptable.
+	if dp.HasMax() {
+		max = dp.Max()
+	} else {
+		max = mean
+	}
+
+	return min, max, sum
 }
@@ -131,4 +131,197 @@ func Test_serializeHistogram(t *testing.T) {
 		assert.NoError(t, err)
 		assert.Equal(t, "prefix.min_max_hist gauge,min=3,max=7,sum=10,count=2 1626438600000", got)
 	})
+
+	t.Run("when min is not provided it should be estimated", func(t *testing.T) {
+		t.Run("values between first two boundaries", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
+			hist.SetCount(6)
+			hist.SetSum(21.2)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 1.0, min, "use bucket min")
+		})
+
+		t.Run("first bucket has value", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
+			hist.SetCount(8)
+			hist.SetSum(34.5)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 1.0, min, "use the first boundary as estimation instead of Inf")
+		})
+
+		t.Run("only the first bucket has values, use the mean", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{3, 0, 0, 0, 0, 0})
+			hist.SetCount(3)
+			hist.SetSum(0.75)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 0.25, min)
+		})
+		t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{4})
+			hist.SetCount(4)
+			hist.SetSum(8.8)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 2.2, min, "calculate the mean as min value")
+		})
+		t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{1})
+			hist.SetCount(1)
+			hist.SetSum(1.2)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 1.2, min, "calculate the mean as min value")
+		})
+		t.Run("only the last bucket has a value", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 3})
+			hist.SetCount(3)
+			hist.SetSum(15.6)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 5.0, min, "use the lower bound")
+		})
+	})
+
+	t.Run("when max is not provided it should be estimated", func(t *testing.T) {
+		t.Run("values between the last two boundaries", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
+			hist.SetSum(21.2)
+			hist.SetCount(6)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 5.0, max, "use bucket max")
+		})
+
+		t.Run("last bucket has value", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
+			hist.SetSum(34.5)
+			hist.SetCount(8)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 5.0, max, "use the last boundary as estimation instead of Inf")
+		})
+
+		t.Run("only the last bucket has values", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 2})
+			hist.SetSum(20.2)
+			hist.SetCount(2)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 10.1, max, "use the mean (10.1) Otherwise, the max would be estimated as 5, and max >= avg would be violated")
+		})
+
+		t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{4})
+			hist.SetSum(8.8)
+			hist.SetCount(4)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 2.2, max, "calculate the mean as max value")
+		})
+
+		t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{1})
+			hist.SetSum(1.2)
+			hist.SetCount(1)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 1.2, max, "calculate the mean as max value")
+		})
+
+		t.Run("max is larger than sum", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{0, 5})
+			hist.SetMBucketCounts([]uint64{0, 2, 0})
+			hist.SetSum(2.3)
+			hist.SetCount(2)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 5.0, max, "use the estimated boundary")
+		})
+	})
+
+	t.Run("when sum is not provided it should be estimated", func(t *testing.T) {
+		t.Run("single bucket histogram", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{13})
+			hist.SetCount(6)
+
+			_, _, sum := histDataPointToSummary(hist)
+
+			assert.Equal(t, 0.0, sum, "estimate zero (midpoint of [-Inf, Inf])")
+		})
+
+		t.Run("data in bounded buckets", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 0})
+			hist.SetCount(6)
+
+			_, _, sum := histDataPointToSummary(hist)
+
+			assert.Equal(t, 3*1.5+5*2.5, sum, "estimate sum using bucket midpoints")
+		})
+
+		t.Run("data in unbounded buckets", func(t *testing.T) {
+			t.Run("first bucket", func(t *testing.T) {
+				hist := pmetric.NewHistogramDataPoint()
+				hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+				hist.SetMBucketCounts([]uint64{2, 3, 5, 0, 0, 0})
+				hist.SetCount(6)
+
+				_, _, sum := histDataPointToSummary(hist)
+
+				assert.Equal(t, 1*2+3*1.5+5*2.5, sum, "use bucket upper bound")
+			})
+
+			t.Run("last bucket", func(t *testing.T) {
+				hist := pmetric.NewHistogramDataPoint()
+				hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+				hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 2})
+				hist.SetCount(6)
+
+				_, _, sum := histDataPointToSummary(hist)
+
+				assert.Equal(t, 3*1.5+5*2.5+2*5, sum, "use bucket upper bound")
+			})
+		})
+	})
 }