diff --git a/executor/analyze.go b/executor/analyze.go index b3564606cc75e..3e9dbedd6adba 100644 --- a/executor/analyze.go +++ b/executor/analyze.go @@ -1075,6 +1075,8 @@ func (e *AnalyzeFastExec) runTasks() ([]*statistics.Histogram, []*statistics.CMS collector.Samples = collector.Samples[:e.sampCursor] sort.Slice(collector.Samples, func(i, j int) bool { return collector.Samples[i].RowID < collector.Samples[j].RowID }) collector.CalcTotalSize() + // Scale the total column size. + collector.TotalSize *= rowCount / int64(len(collector.Samples)) if i < hasPKInfo { hists[i], cms[i], err = e.buildColumnStats(e.pkInfo.ID, e.collectors[i], &e.pkInfo.FieldType, rowCount) } else if i < hasPKInfo+len(e.colsInfo) { diff --git a/executor/analyze_test.go b/executor/analyze_test.go index 11d29a5d32bba..c2cf8d9fef34b 100644 --- a/executor/analyze_test.go +++ b/executor/analyze_test.go @@ -234,7 +234,7 @@ func (s *testSuite1) TestFastAnalyze(c *C) { tk.MustExec("use test") tk.MustExec("drop table if exists t") - tk.MustExec("create table t(a int primary key, b int, index index_b(b))") + tk.MustExec("create table t(a int primary key, b int, c char(10), index index_b(b))") tk.MustExec("set @@session.tidb_enable_fast_analyze=1") tk.MustExec("set @@session.tidb_build_stats_concurrency=1") tblInfo, err := dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t")) @@ -246,7 +246,7 @@ func (s *testSuite1) TestFastAnalyze(c *C) { manipulateCluster(cluster, splitKeys) for i := 0; i < 3000; i++ { - tk.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i)) + tk.MustExec(fmt.Sprintf(`insert into t values (%d, %d, "char")`, i, i)) } tk.MustExec("analyze table t with 5 buckets") @@ -268,6 +268,8 @@ func (s *testSuite1) TestFastAnalyze(c *C) { "num: 603 lower_bound: 1250 upper_bound: 1823 repeats: 1\n"+ "num: 603 lower_bound: 1830 upper_bound: 2379 repeats: 1\n"+ "num: 588 lower_bound: 2380 upper_bound: 2998 repeats: 1\n"+ + "column:3 ndv:1 totColSize:12000\n"+ + "num: 3000 
lower_bound: char upper_bound: char repeats: 3000\n"+ "index:1 ndv:3000\n"+ "num: 603 lower_bound: 0 upper_bound: 658 repeats: 1\n"+ "num: 603 lower_bound: 663 upper_bound: 1248 repeats: 1\n"+ diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index 51638bcee2bff..7a7b5750c453f 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -116,37 +116,43 @@ func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount ui // In some cases, if user triggers fast analyze when rowCount is close to sampleSize, unexpected bahavior might happen. rowCount = mathutil.MaxUint64(rowCount, uint64(len(sample))) estimateNDV, scaleRatio := calculateEstimateNDV(helper, rowCount) - c := buildCMSWithTopN(helper, d, w, scaleRatio) - c.calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount) + defaultVal := calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount) + c := buildCMSWithTopN(helper, d, w, scaleRatio, defaultVal) return c, estimateNDV, scaleRatio } -func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64) (c *CMSketch) { +func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, defaultVal uint64) (c *CMSketch) { c = NewCMSketch(d, w) enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN if enableTopN { c.topN = make(map[uint64][]*TopNMeta) } + c.defaultValue = defaultVal for counterKey, cnt := range helper.counter { - data, scaledCount := hack.Slice(string(counterKey)), cnt*scaleRatio + data := hack.Slice(string(counterKey)) + // If the value only occurred once in the sample, we assume that it is no different from a + // value that did not occur in the sample. 
+ rowCount := defaultVal + if cnt > 1 { + rowCount = cnt * scaleRatio + } if enableTopN && cnt >= helper.lastVal { h1, h2 := murmur3.Sum128(data) - c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, scaledCount}) + c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, rowCount}) } else { - c.insertBytesByCount(data, scaledCount) + c.insertBytesByCount(data, rowCount) } } return } -func (c *CMSketch) calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) { +func calculateDefaultVal(helper *topNHelper, estimateNDV, scaleRatio, rowCount uint64) uint64 { sampleNDV := uint64(len(helper.sorted)) if rowCount <= (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio { - c.defaultValue = 1 - } else { - estimateRemainingCount := rowCount - (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio - c.defaultValue = estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-uint64(sampleNDV)+helper.onlyOnceItems) + return 1 } + estimateRemainingCount := rowCount - (helper.sampleSize-uint64(helper.onlyOnceItems))*scaleRatio + return estimateRemainingCount / mathutil.MaxUint64(1, estimateNDV-uint64(sampleNDV)+helper.onlyOnceItems) } func (c *CMSketch) findTopNMeta(h1, h2 uint64, d []byte) *TopNMeta { diff --git a/statistics/cmsketch_test.go b/statistics/cmsketch_test.go index 4567e94f73b81..7e4261af328f1 100644 --- a/statistics/cmsketch_test.go +++ b/statistics/cmsketch_test.go @@ -167,21 +167,21 @@ func (s *testStatisticsSuite) TestCMSketchTopN(c *C) { // The first two tests produces almost same avg. { zipfFactor: 1.0000001, - avgError: 48, + avgError: 30, }, { zipfFactor: 1.1, - avgError: 48, + avgError: 30, }, { zipfFactor: 2, - avgError: 128, + avgError: 89, }, // If the most data lies in a narrow range, our guess may have better result. // The error mainly comes from huge numbers. { zipfFactor: 5, - avgError: 256, + avgError: 208, }, } d, w := int32(5), int32(2048)