Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-22.1: sql/stats: support rowCountEq = 0 in histogram.adjustCounts #82704

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions pkg/sql/stats/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,10 +169,18 @@ type histogram struct {
}

// adjustCounts adjusts the row count and number of distinct values per bucket
// based on the total row count and estimated distinct count.
// to equal the total row count and estimated distinct count. The total row
// count and estimated distinct count should not include NULL values, and the
// histogram should not contain any buckets for NULL values.
func (h *histogram) adjustCounts(
evalCtx *tree.EvalContext, rowCountTotal, distinctCountTotal float64,
) {
// Empty table cases.
if rowCountTotal <= 0 || distinctCountTotal <= 0 {
h.buckets = make([]cat.HistogramBucket, 0)
return
}

// Calculate the current state of the histogram so we can adjust it as needed.
// The number of rows and distinct values represented by the histogram should
// be adjusted so they equal rowCountTotal and distinctCountTotal.
Expand All @@ -190,13 +198,16 @@ func (h *histogram) adjustCounts(
}
}

if rowCountEq <= 0 {
panic(errors.AssertionFailedf("expected a positive value for rowCountEq"))
// If the histogram only had empty buckets, we can't adjust it.
if rowCountRange+rowCountEq <= 0 || distinctCountRange+distinctCountEq <= 0 {
h.buckets = make([]cat.HistogramBucket, 0)
return
}

// If the upper bounds account for all distinct values (as estimated by the
// sketch), make the histogram consistent by clearing the ranges and adjusting
// the NumEq values to add up to the row count.
// the NumEq values to add up to the row count. This might be the case for
// low-cardinality types like BOOL and ENUM or other low-cardinality data.
if distinctCountEq >= distinctCountTotal {
adjustmentFactorNumEq := rowCountTotal / rowCountEq
for i := range h.buckets {
Expand All @@ -210,7 +221,7 @@ func (h *histogram) adjustCounts(
// The upper bounds do not account for all distinct values, so adjust the
// NumEq values if needed so they add up to less than the row count.
remDistinctCount := distinctCountTotal - distinctCountEq
if rowCountEq+remDistinctCount >= rowCountTotal {
if rowCountEq > 0 && rowCountEq+remDistinctCount > rowCountTotal {
targetRowCountEq := rowCountTotal - remDistinctCount
adjustmentFactorNumEq := targetRowCountEq / rowCountEq
for i := range h.buckets {
Expand All @@ -230,10 +241,10 @@ func (h *histogram) adjustCounts(
lowerBound := h.buckets[0].UpperBound
upperBound := h.buckets[len(h.buckets)-1].UpperBound
if maxDistinct, ok := tree.MaxDistinctCount(evalCtx, lowerBound, upperBound); ok {
// Subtract distinctCountEq to account for the upper bounds of the
// Subtract the number of buckets to account for the upper bounds of the
// buckets, along with the current range distinct count which has already
// been accounted for.
maxDistinctCountRange = float64(maxDistinct) - distinctCountEq - distinctCountRange
maxDistinctCountRange = float64(maxDistinct) - float64(len(h.buckets)) - distinctCountRange
}

// Add distinct values into the histogram if there is space. Increment the
Expand Down Expand Up @@ -278,7 +289,10 @@ func (h *histogram) adjustCounts(
)
}

// Adjust the values so the row counts and distinct counts add up correctly.
// At this point rowCountRange + rowCountEq >= distinctCountTotal but not
// necessarily rowCountTotal, so we've accounted for all distinct values, and
// any additional rows we add will be duplicate values. We can spread the
// final adjustment proportionately across both NumRange and NumEq.
adjustmentFactorDistinctRange := float64(1)
if distinctCountRange > 0 {
adjustmentFactorDistinctRange = (distinctCountTotal - distinctCountEq) / distinctCountRange
Expand Down
44 changes: 44 additions & 0 deletions pkg/sql/stats/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,50 @@ func TestAdjustCounts(t *testing.T) {
{NumRange: 1551.19, NumEq: 3447.09, DistinctRange: 450, UpperBound: f(1000)},
},
},
{ // Zero rowCount and distinctCount.
h: []cat.HistogramBucket{
{NumRange: 0, NumEq: 1, DistinctRange: 0, UpperBound: f(1)},
},
rowCount: 0,
distinctCount: 0,
expected: []cat.HistogramBucket{},
},
{ // Negative rowCount and distinctCount.
h: []cat.HistogramBucket{
{NumRange: 0, NumEq: 1, DistinctRange: 0, UpperBound: f(1)},
},
rowCount: -100,
distinctCount: -90,
expected: []cat.HistogramBucket{},
},
{ // Empty initial histogram.
h: []cat.HistogramBucket{},
rowCount: 1000,
distinctCount: 1000,
expected: []cat.HistogramBucket{},
},
{ // Empty bucket in initial histogram.
h: []cat.HistogramBucket{
{NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)},
},
rowCount: 99,
distinctCount: 99,
expected: []cat.HistogramBucket{},
},
{ // All zero NumEq.
h: []cat.HistogramBucket{
{NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)},
{NumRange: 10, NumEq: 0, DistinctRange: 5, UpperBound: f(100)},
{NumRange: 10, NumEq: 0, DistinctRange: 10, UpperBound: f(200)},
},
rowCount: 100,
distinctCount: 60,
expected: []cat.HistogramBucket{
{NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)},
{NumRange: 50, NumEq: 0, DistinctRange: 27.5, UpperBound: f(100)},
{NumRange: 50, NumEq: 0, DistinctRange: 32.5, UpperBound: f(200)},
},
},
}

evalCtx := tree.MakeTestingEvalContext(cluster.MakeTestingClusterSettings())
Expand Down