Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: correct fast analyze stats calculation #10766

Merged
merged 3 commits into from
Jun 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions executor/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,8 @@ func (e *AnalyzeFastExec) buildHist(ID int64, collector *statistics.SampleCollec
// build CMSketch
var ndv, scaleRatio uint64
collector.CMSketch, ndv, scaleRatio = statistics.NewCMSketchWithTopN(defaultCMSketchDepth, defaultCMSketchWidth, data, 20, uint64(rowCount))
// Scale the total column size.
collector.TotalSize *= rowCount / int64(len(collector.Samples))
// build Histogram
hist, err := statistics.BuildColumnHist(e.ctx, int64(e.maxNumBuckets), ID, collector, tp, rowCount, int64(ndv), collector.NullCount*int64(scaleRatio))
if err != nil {
Expand Down
6 changes: 4 additions & 2 deletions executor/analyze_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ func (s *testSuite1) TestFastAnalyze(c *C) {

tk.MustExec("use test")
tk.MustExec("drop table if exists t")
tk.MustExec("create table t(a int primary key, b int, index index_b(b))")
tk.MustExec("create table t(a int primary key, b int, c char(10), index index_b(b))")
tk.MustExec("set @@session.tidb_enable_fast_analyze=1")
tk.MustExec("set @@session.tidb_build_stats_concurrency=1")
tblInfo, err := dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
Expand All @@ -256,7 +256,7 @@ func (s *testSuite1) TestFastAnalyze(c *C) {
manipulateCluster(cluster, splitKeys)

for i := 0; i < 3000; i++ {
tk.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i))
tk.MustExec(fmt.Sprintf(`insert into t values (%d, %d, "char")`, i, i))
}
tk.MustExec("analyze table t with 5 buckets")

Expand All @@ -278,6 +278,8 @@ func (s *testSuite1) TestFastAnalyze(c *C) {
"num: 603 lower_bound: 1250 upper_bound: 1823 repeats: 1\n"+
"num: 603 lower_bound: 1830 upper_bound: 2379 repeats: 1\n"+
"num: 588 lower_bound: 2380 upper_bound: 2998 repeats: 1\n"+
"column:3 ndv:1 totColSize:12000\n"+
"num: 3000 lower_bound: char upper_bound: char repeats: 3000\n"+
"index:1 ndv:3000\n"+
"num: 603 lower_bound: 0 upper_bound: 658 repeats: 1\n"+
"num: 603 lower_bound: 663 upper_bound: 1248 repeats: 1\n"+
Expand Down
28 changes: 17 additions & 11 deletions statistics/cmsketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,37 +117,43 @@ func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount ui
// In some cases, if the user triggers fast analyze when rowCount is close to sampleSize, unexpected behavior might happen.
rowCount = mathutil.MaxUint64(rowCount, uint64(len(sample)))
estimateNDV, scaleRatio := calculateEstimateNDV(helper, rowCount)
c := buildCMSWithTopN(helper, d, w, scaleRatio)
c.calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount)
defaultVal := calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount)
c := buildCMSWithTopN(helper, d, w, scaleRatio, defaultVal)
return c, estimateNDV, scaleRatio
}

func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64) (c *CMSketch) {
func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, defaultVal uint64) (c *CMSketch) {
c = NewCMSketch(d, w)
enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN
if enableTopN {
c.topN = make(map[uint64][]*TopNMeta)
}
c.defaultValue = defaultVal
for counterKey, cnt := range helper.counter {
data, scaledCount := hack.Slice(string(counterKey)), cnt*scaleRatio
data := hack.Slice(string(counterKey))
// If the value only occurred once in the sample, we assume that it is no different from
// a value that did not occur in the sample.
rowCount := defaultVal
if cnt > 1 {
rowCount = cnt * scaleRatio
}
if enableTopN && cnt >= helper.lastVal {
h1, h2 := murmur3.Sum128(data)
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, scaledCount})
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, rowCount})
} else {
c.insertBytesByCount(data, scaledCount)
c.insertBytesByCount(data, rowCount)
}
}
return
}

func (c *CMSketch) calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) {
func calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) uint64 {
sampleNDV := uint64(len(helper.sorted))
if rowCount <= (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio {
c.defaultValue = 1
} else {
estimateRemainingCount := rowCount - (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio
c.defaultValue = estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-uint64(sampleNDV)+helper.onlyOnceItems)
return 1
}
estimateRemainingCount := rowCount - (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio
return estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-uint64(sampleNDV)+helper.onlyOnceItems)
}

func (c *CMSketch) findTopNMeta(h1, h2 uint64, d []byte) *TopNMeta {
Expand Down
8 changes: 4 additions & 4 deletions statistics/cmsketch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,21 +172,21 @@ func (s *testStatisticsSuite) TestCMSketchTopN(c *C) {
// The first two tests produce almost the same avg.
{
zipfFactor: 1.0000001,
avgError: 48,
avgError: 30,
},
{
zipfFactor: 1.1,
avgError: 48,
avgError: 30,
},
{
zipfFactor: 2,
avgError: 128,
avgError: 89,
},
// If most of the data lies in a narrow range, our guess may give a better result.
// The error mainly comes from huge numbers.
{
zipfFactor: 5,
avgError: 256,
avgError: 208,
},
}
d, w := int32(5), int32(2048)
Expand Down