Skip to content


stats: correct fast analyze stats calculation (pingcap#10766) (pingca…
Browse files Browse the repository at this point in the history
  • Loading branch information
alivxxx authored and zz-jason committed Jun 24, 2019
1 parent 6f928aa commit 23cf1ad
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 17 deletions.
2 changes: 2 additions & 0 deletions executor/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,8 @@ func (e *AnalyzeFastExec) runTasks() ([]*statistics.Histogram, []*statistics.CMS
collector.Samples = collector.Samples[:e.sampCursor]
sort.Slice(collector.Samples, func(i, j int) bool { return collector.Samples[i].RowID < collector.Samples[j].RowID })
// Scale the total column size.
collector.TotalSize *= rowCount / int64(len(collector.Samples))
if i < hasPKInfo {
hists[i], cms[i], err = e.buildColumnStats(e.pkInfo.ID, e.collectors[i], &e.pkInfo.FieldType, rowCount)
} else if i < hasPKInfo+len(e.colsInfo) {
Expand Down
6 changes: 4 additions & 2 deletions executor/analyze_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ func (s *testSuite1) TestFastAnalyze(c *C) {

tk.MustExec("use test")
tk.MustExec("drop table if exists t")
tk.MustExec("create table t(a int primary key, b int, index index_b(b))")
tk.MustExec("create table t(a int primary key, b int, c char(10), index index_b(b))")
tk.MustExec("set @@session.tidb_enable_fast_analyze=1")
tk.MustExec("set @@session.tidb_build_stats_concurrency=1")
tblInfo, err := dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
Expand All @@ -246,7 +246,7 @@ func (s *testSuite1) TestFastAnalyze(c *C) {
manipulateCluster(cluster, splitKeys)

for i := 0; i < 3000; i++ {
tk.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i))
tk.MustExec(fmt.Sprintf(`insert into t values (%d, %d, "char")`, i, i))
tk.MustExec("analyze table t with 5 buckets")

Expand All @@ -268,6 +268,8 @@ func (s *testSuite1) TestFastAnalyze(c *C) {
"num: 603 lower_bound: 1250 upper_bound: 1823 repeats: 1\n"+
"num: 603 lower_bound: 1830 upper_bound: 2379 repeats: 1\n"+
"num: 588 lower_bound: 2380 upper_bound: 2998 repeats: 1\n"+
"column:3 ndv:1 totColSize:12000\n"+
"num: 3000 lower_bound: char upper_bound: char repeats: 3000\n"+
"index:1 ndv:3000\n"+
"num: 603 lower_bound: 0 upper_bound: 658 repeats: 1\n"+
"num: 603 lower_bound: 663 upper_bound: 1248 repeats: 1\n"+
Expand Down
28 changes: 17 additions & 11 deletions statistics/cmsketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,37 +116,43 @@ func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount ui
// In some cases, if the user triggers fast analyze when rowCount is close to sampleSize, unexpected behavior might happen.
rowCount = mathutil.MaxUint64(rowCount, uint64(len(sample)))
estimateNDV, scaleRatio := calculateEstimateNDV(helper, rowCount)
c := buildCMSWithTopN(helper, d, w, scaleRatio)
c.calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount)
defaultVal := calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount)
c := buildCMSWithTopN(helper, d, w, scaleRatio, defaultVal)
return c, estimateNDV, scaleRatio

func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64) (c *CMSketch) {
func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, defaultVal uint64) (c *CMSketch) {
c = NewCMSketch(d, w)
enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN
if enableTopN {
c.topN = make(map[uint64][]*TopNMeta)
c.defaultValue = defaultVal
for counterKey, cnt := range helper.counter {
data, scaledCount := hack.Slice(string(counterKey)), cnt*scaleRatio
data := hack.Slice(string(counterKey))
// If the value occurred only once in the sample, we assume that it is no different from
// a value that did not occur in the sample.
rowCount := defaultVal
if cnt > 1 {
rowCount = cnt * scaleRatio
if enableTopN && cnt >= helper.lastVal {
h1, h2 := murmur3.Sum128(data)
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, scaledCount})
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, rowCount})
} else {
c.insertBytesByCount(data, scaledCount)
c.insertBytesByCount(data, rowCount)

func (c *CMSketch) calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) {
func calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) uint64 {
sampleNDV := uint64(len(helper.sorted))
if rowCount <= (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio {
c.defaultValue = 1
} else {
estimateRemainingCount := rowCount - (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio
c.defaultValue = estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-uint64(sampleNDV)+helper.onlyOnceItems)
return 1
estimateRemainingCount := rowCount - (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio
return estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-uint64(sampleNDV)+helper.onlyOnceItems)

func (c *CMSketch) findTopNMeta(h1, h2 uint64, d []byte) *TopNMeta {
Expand Down
8 changes: 4 additions & 4 deletions statistics/cmsketch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,21 +167,21 @@ func (s *testStatisticsSuite) TestCMSketchTopN(c *C) {
// The first two tests produce almost the same avg.
zipfFactor: 1.0000001,
avgError: 48,
avgError: 30,
zipfFactor: 1.1,
avgError: 48,
avgError: 30,
zipfFactor: 2,
avgError: 128,
avgError: 89,
// If most of the data lies in a narrow range, our guess may have a better result.
// The error mainly comes from huge numbers.
zipfFactor: 5,
avgError: 256,
avgError: 208,
d, w := int32(5), int32(2048)
Expand Down

0 comments on commit 23cf1ad

Please sign in to comment.