stats: correct fast analyze stats calculation (pingcap#10766) (pingca…

…p#10918)
XuHuaiyu · Jun 24, 2019 · 23cf1ad · 23cf1ad
1 parent 6f928aa
commit 23cf1ad
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 17 deletions.
diff --git a/executor/analyze.go b/executor/analyze.go
@@ -1075,6 +1075,8 @@ func (e *AnalyzeFastExec) runTasks() ([]*statistics.Histogram, []*statistics.CMS
 		collector.Samples = collector.Samples[:e.sampCursor]
 		sort.Slice(collector.Samples, func(i, j int) bool { return collector.Samples[i].RowID < collector.Samples[j].RowID })
 		collector.CalcTotalSize()
+		// Scale the total column size.
+		collector.TotalSize *= rowCount / int64(len(collector.Samples))
 		if i < hasPKInfo {
 			hists[i], cms[i], err = e.buildColumnStats(e.pkInfo.ID, e.collectors[i], &e.pkInfo.FieldType, rowCount)
 		} else if i < hasPKInfo+len(e.colsInfo) {

diff --git a/executor/analyze_test.go b/executor/analyze_test.go
@@ -234,7 +234,7 @@ func (s *testSuite1) TestFastAnalyze(c *C) {
 
 	tk.MustExec("use test")
 	tk.MustExec("drop table if exists t")
-	tk.MustExec("create table t(a int primary key, b int, index index_b(b))")
+	tk.MustExec("create table t(a int primary key, b int, c char(10), index index_b(b))")
 	tk.MustExec("set @@session.tidb_enable_fast_analyze=1")
 	tk.MustExec("set @@session.tidb_build_stats_concurrency=1")
 	tblInfo, err := dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
@@ -246,7 +246,7 @@ func (s *testSuite1) TestFastAnalyze(c *C) {
 	manipulateCluster(cluster, splitKeys)
 
 	for i := 0; i < 3000; i++ {
-		tk.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i))
+		tk.MustExec(fmt.Sprintf(`insert into t values (%d, %d, "char")`, i, i))
 	}
 	tk.MustExec("analyze table t with 5 buckets")
 
@@ -268,6 +268,8 @@ func (s *testSuite1) TestFastAnalyze(c *C) {
 		"num: 603 lower_bound: 1250 upper_bound: 1823 repeats: 1\n"+
 		"num: 603 lower_bound: 1830 upper_bound: 2379 repeats: 1\n"+
 		"num: 588 lower_bound: 2380 upper_bound: 2998 repeats: 1\n"+
+		"column:3 ndv:1 totColSize:12000\n"+
+		"num: 3000 lower_bound: char upper_bound: char repeats: 3000\n"+
 		"index:1 ndv:3000\n"+
 		"num: 603 lower_bound: 0 upper_bound: 658 repeats: 1\n"+
 		"num: 603 lower_bound: 663 upper_bound: 1248 repeats: 1\n"+

diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go
@@ -116,37 +116,43 @@ func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount ui
 	// In some cases, if user triggers fast analyze when rowCount is close to sampleSize, unexpected behavior might happen.
 	rowCount = mathutil.MaxUint64(rowCount, uint64(len(sample)))
 	estimateNDV, scaleRatio := calculateEstimateNDV(helper, rowCount)
-	c := buildCMSWithTopN(helper, d, w, scaleRatio)
-	c.calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount)
+	defaultVal := calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount)
+	c := buildCMSWithTopN(helper, d, w, scaleRatio, defaultVal)
 	return c, estimateNDV, scaleRatio
 }
 
-func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64) (c *CMSketch) {
+func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, defaultVal uint64) (c *CMSketch) {
 	c = NewCMSketch(d, w)
 	enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN
 	if enableTopN {
 		c.topN = make(map[uint64][]*TopNMeta)
 	}
+	c.defaultValue = defaultVal
 	for counterKey, cnt := range helper.counter {
-		data, scaledCount := hack.Slice(string(counterKey)), cnt*scaleRatio
+		data := hack.Slice(string(counterKey))
+		// If the value only occurred once in the sample, we assume that it is no different from
+		// a value that did not occur in the sample.
+		rowCount := defaultVal
+		if cnt > 1 {
+			rowCount = cnt * scaleRatio
+		}
 		if enableTopN && cnt >= helper.lastVal {
 			h1, h2 := murmur3.Sum128(data)
-			c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, scaledCount})
+			c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, rowCount})
 		} else {
-			c.insertBytesByCount(data, scaledCount)
+			c.insertBytesByCount(data, rowCount)
 		}
 	}
 	return
 }
 
-func (c *CMSketch) calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) {
+func calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) uint64 {
 	sampleNDV := uint64(len(helper.sorted))
 	if rowCount <= (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio {
-		c.defaultValue = 1
-	} else {
-		estimateRemainingCount := rowCount - (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio
-		c.defaultValue = estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-uint64(sampleNDV)+helper.onlyOnceItems)
+		return 1
 	}
+	estimateRemainingCount := rowCount - (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio
+	return estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-uint64(sampleNDV)+helper.onlyOnceItems)
 }
 
 func (c *CMSketch) findTopNMeta(h1, h2 uint64, d []byte) *TopNMeta {

diff --git a/statistics/cmsketch_test.go b/statistics/cmsketch_test.go
@@ -167,21 +167,21 @@ func (s *testStatisticsSuite) TestCMSketchTopN(c *C) {
 		// The first two tests produce almost the same avg.
 		{
 			zipfFactor: 1.0000001,
-			avgError:   48,
+			avgError:   30,
 		},
 		{
 			zipfFactor: 1.1,
-			avgError:   48,
+			avgError:   30,
 		},
 		{
 			zipfFactor: 2,
-			avgError:   128,
+			avgError:   89,
 		},
 		// If most of the data lies in a narrow range, our guess may have a better result.
 		// The error mainly comes from huge numbers.
 		{
 			zipfFactor: 5,
-			avgError:   256,
+			avgError:   208,
 		},
 	}
 	d, w := int32(5), int32(2048)