stats: refine the row count estimation for outdated stats (pingcap#7175)

alivxxx · Jul 30, 2018 · ff08c15 · ff08c15
1 parent dae50f0
commit ff08c15
Show file tree

Hide file tree

Showing 6 changed files with 100 additions and 21 deletions.
diff --git a/plan/cbo_test.go b/plan/cbo_test.go
@@ -496,8 +496,8 @@ func (s *testAnalyzeSuite) TestOutdatedAnalyze(c *C) {
 	plan.RatioOfPseudoEstimate = 10.0
 	testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
 		"TableScan_5 Selection_6  cop table:t, range:[-inf,+inf], keep order:false 80.00",
-		"Selection_6  TableScan_5 cop le(test.t.a, 5), le(test.t.b, 5) 28.80",
-		"TableReader_7   root data:Selection_6 28.80",
+		"Selection_6  TableScan_5 cop le(test.t.a, 5), le(test.t.b, 5) 35.91",
+		"TableReader_7   root data:Selection_6 35.91",
 	))
 	plan.RatioOfPseudoEstimate = 0.7
 	testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
@@ -607,16 +607,16 @@ func (s *testAnalyzeSuite) TestLimit(c *C) {
 	}
 	testKit.MustExec("analyze table t")
 	testKit.MustQuery("explain select * from t use index(idx) where a > 1 and b > 1 and c > 1 limit 1").Check(testkit.Rows(
-		"IndexScan_13 Selection_15  cop table:t, index:a, b, range:(1 +inf,+inf +inf], keep order:false 1.56",
-		"Selection_15  IndexScan_13 cop gt(test.t.b, 1) 1.25",
-		"TableScan_14 Selection_16  cop table:t, keep order:false 1.25",
+		"IndexScan_13 Selection_15  cop table:t, index:a, b, range:(1 +inf,+inf +inf], keep order:false 1.10",
+		"Selection_15  IndexScan_13 cop gt(test.t.b, 1) 1.00",
+		"TableScan_14 Selection_16  cop table:t, keep order:false 1.00",
 		"Selection_16 Limit_17 TableScan_14 cop gt(test.t.c, 1) 1.00",
 		"Limit_17  Selection_16 cop offset:0, count:1 1.00",
 		"IndexLookUp_18 Limit_9  root index:Selection_15, table:Limit_17 1.00",
 		"Limit_9  IndexLookUp_18 root offset:0, count:1 1.00",
 	))
 	testKit.MustQuery("explain select * from t where a > 1 and c > 1 limit 1").Check(testkit.Rows(
-		"TableScan_11 Selection_12  cop table:t, range:(1,+inf], keep order:false 1.25",
+		"TableScan_11 Selection_12  cop table:t, range:(1,+inf], keep order:false 1.11",
 		"Selection_12 Limit_15 TableScan_11 cop gt(test.t.c, 1) 1.00",
 		"Limit_15  Selection_12 cop offset:0, count:1 1.00",
 		"TableReader_16 Limit_8  root data:Limit_15 1.00",

diff --git a/statistics/ddl_test.go b/statistics/ddl_test.go
@@ -126,7 +126,7 @@ func (s *testStatsCacheSuite) TestDDLHistogram(c *C) {
 	c.Assert(count, Equals, float64(2))
 	count, err = statsTbl.ColumnEqualRowCount(sc, types.NewIntDatum(1), tableInfo.Columns[3].ID)
 	c.Assert(err, IsNil)
-	c.Assert(count, Equals, float64(0))
+	c.Assert(count, Equals, float64(2))
 
 	testKit.MustExec("alter table t add column c4 datetime NOT NULL default CURRENT_TIMESTAMP")
 	err = h.HandleDDLEvent(<-h.DDLEventCh())

diff --git a/statistics/histogram.go b/statistics/histogram.go
@@ -615,6 +615,15 @@ func MergeHistograms(sc *stmtctx.StatementContext, lh *Histogram, rh *Histogram,
 	return lh, nil
 }
 
+// outOfRange reports whether val compares below the first bound row or above the last one; with no bound rows every value is out of range.
+func (hg *Histogram) outOfRange(val types.Datum) bool {
+	if hg.Bounds == nil || hg.Bounds.NumRows() == 0 {
+		return true
+	}
+	return chunk.Compare(hg.Bounds.GetRow(0), 0, &val) > 0 ||
+		chunk.Compare(hg.Bounds.GetRow(hg.Bounds.NumRows()-1), 0, &val) < 0
+}
+
 // Column represents a column histogram.
 type Column struct {
 	Histogram
@@ -631,19 +640,22 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (f
 	if val.IsNull() {
 		return float64(c.NullCount), nil
 	}
-	if c.CMSketch != nil {
-		count, err := c.CMSketch.queryValue(sc, val)
-		return float64(count), errors.Trace(err)
-	}
 	// all the values is null
 	if c.Histogram.Bounds == nil {
 		return 0.0, nil
 	}
+	if c.NDV > 0 && c.outOfRange(val) {
+		return c.totalRowCount() / (float64(c.NDV)), nil
+	}
+	if c.CMSketch != nil {
+		count, err := c.CMSketch.queryValue(sc, val)
+		return float64(count), errors.Trace(err)
+	}
 	return c.Histogram.equalRowCount(val), nil
 }
 
-// getColumnRowCount estimates the row count by a slice of NewRange.
-func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.NewRange) (float64, error) {
+// getColumnRowCount estimates the row count by a slice of Range.
+func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.NewRange, modifyCount int64) (float64, error) {
 	var rowCount float64
 	for _, rg := range ranges {
 		cmp, err := rg.LowVal[0].CompareDatum(sc, &rg.HighVal[0])
@@ -664,6 +676,9 @@ func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 		}
 		// the interval case.
 		cnt := c.betweenRowCount(rg.LowVal[0], rg.HighVal[0])
+		if c.outOfRange(rg.LowVal[0]) || c.outOfRange(rg.HighVal[0]) {
+			cnt += float64(modifyCount) / outOfRangeBetweenRate
+		}
 		if rg.LowExclude {
 			lowCnt, err := c.equalRowCount(sc, rg.LowVal[0])
 			if err != nil {
@@ -700,13 +715,17 @@ func (idx *Index) String() string {
 }
 
 func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte) float64 {
+	val := types.NewBytesDatum(b)
+	if idx.NDV > 0 && idx.outOfRange(val) { // key is outside the histogram bounds: estimate as totalRowCount()/NDV instead of consulting the sketch or histogram
+		return idx.totalRowCount() / (float64(idx.NDV))
+	}
 	if idx.CMSketch != nil {
 		return float64(idx.CMSketch.queryBytes(b))
 	}
-	return idx.Histogram.equalRowCount(types.NewBytesDatum(b))
+	return idx.Histogram.equalRowCount(val)
 }
 
-func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.NewRange) (float64, error) {
+func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.NewRange, modifyCount int64) (float64, error) {
 	totalCount := float64(0)
 	for _, indexRange := range indexRanges {
 		lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...)
@@ -733,6 +752,9 @@ func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*range
 		l := types.NewBytesDatum(lb)
 		r := types.NewBytesDatum(rb)
 		totalCount += idx.betweenRowCount(l, r)
+		if idx.outOfRange(l) || idx.outOfRange(r) {
+			totalCount += float64(modifyCount) / outOfRangeBetweenRate
+		}
 	}
 	if totalCount > idx.totalRowCount() {
 		totalCount = idx.totalRowCount()

diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
@@ -14,6 +14,7 @@
 package statistics_test
 
 import (
+	"fmt"
 	"math"
 	"os"
 	"runtime/pprof"
@@ -32,6 +33,7 @@ import (
 	"github.com/pingcap/tidb/statistics"
 	"github.com/pingcap/tidb/types"
 	"github.com/pingcap/tidb/util/codec"
+	"github.com/pingcap/tidb/util/ranger"
 	"github.com/pingcap/tidb/util/testkit"
 )
 
@@ -152,7 +154,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
 		},
 		{
 			exprs:       "a >= 1 and b > 1 and a < 2",
-			selectivity: 0.01783264746,
+			selectivity: 0.01817558299,
 		},
 		{
 			exprs:       "a >= 1 and c > 1 and a < 2",
@@ -168,7 +170,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
 		},
 		{
 			exprs:       "b > 1",
-			selectivity: 0.96296296296,
+			selectivity: 0.98148148148,
 		},
 		{
 			exprs:       "a > 1 and b < 2 and c > 3 and d < 4 and e > 5",
@@ -214,6 +216,59 @@ func (s *testSelectivitySuite) TestPseudoSelectivity(c *C) {
 		"Projection_4  TableReader_6 root test.t1.b 1.00"))
 }
 
+// getRange wraps the int64 pair (start, end) into a one-element range slice; no exclusion flags are set on the range.
+func getRange(start, end int64) []*ranger.NewRange {
+	return []*ranger.NewRange{{
+		LowVal:  []types.Datum{types.NewIntDatum(start)},
+		HighVal: []types.Datum{types.NewIntDatum(end)},
+	}}
+}
+
+func (s *testSelectivitySuite) TestEstimationForUnknownValues(c *C) {
+	testKit := testkit.NewTestKit(c, s.store)
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t(a int, b int, key idx(a, b))")
+	testKit.MustExec("analyze table t") // first analyze runs while the table is still empty
+	for i := 0; i < 10; i++ {
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i))
+	}
+	h := s.dom.StatsHandle()
+	h.DumpStatsDeltaToKV()
+	testKit.MustExec("analyze table t") // second analyze runs with rows 0..9 present
+	for i := 0; i < 10; i++ {
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i+10, i+10))
+	}
+	h.DumpStatsDeltaToKV()
+	c.Assert(h.Update(s.dom.InfoSchema()), IsNil)
+	table, err := s.dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
+	c.Assert(err, IsNil)
+	statsTbl := h.GetTableStats(table.Meta())
+	// Stats are now outdated: the last analyze ran with only rows 0..9, and rows 10..19 were inserted afterwards. Point range on a value (30) that was never analyzed, on column a.
+	sc := &stmtctx.StatementContext{}
+	colID := table.Meta().Columns[0].ID
+	count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 2.0)
+	// Range on column a whose upper bound (30) is above every value present at the last analyze.
+	count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 4.2)
+	// Same lower bound with math.MaxInt64 as the upper bound; the expected estimate equals the (9, 30) case.
+	count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 4.2)
+	// The same point and range inputs, estimated through the first index of the table (idx).
+	idxID := table.Meta().Indices[0].ID
+	count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 0.2)
+
+	count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(9, 30))
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, 2.2)
+}
+
 func BenchmarkSelectivity(b *testing.B) {
 	c := &C{}
 	s := &testSelectivitySuite{}

diff --git a/statistics/table.go b/statistics/table.go
@@ -37,6 +37,8 @@ const (
 	pseudoEqualRate   = 1000
 	pseudoLessRate    = 3
 	pseudoBetweenRate = 40
+
+	outOfRangeBetweenRate = 100
 )
 
 // Table represents statistics for a table.
@@ -327,7 +329,7 @@ func (t *Table) GetRowCountByIntColumnRanges(sc *stmtctx.StatementContext, colID
 		return getPseudoRowCountByUnsignedIntRanges(intRanges, float64(t.Count)), nil
 	}
 	c := t.Columns[colID]
-	result, err := c.getColumnRowCount(sc, intRanges)
+	result, err := c.getColumnRowCount(sc, intRanges, t.ModifyCount)
 	result *= c.getIncreaseFactor(t.Count)
 	return result, errors.Trace(err)
 }
@@ -338,7 +340,7 @@ func (t *Table) GetRowCountByColumnRanges(sc *stmtctx.StatementContext, colID in
 		return getPseudoRowCountByColumnRanges(sc, float64(t.Count), colRanges, 0)
 	}
 	c := t.Columns[colID]
-	result, err := c.getColumnRowCount(sc, colRanges)
+	result, err := c.getColumnRowCount(sc, colRanges, t.ModifyCount)
 	result *= c.getIncreaseFactor(t.Count)
 	return result, errors.Trace(err)
 }
@@ -353,7 +355,7 @@ func (t *Table) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idxID int
 		}
 		return getPseudoRowCountByIndexRanges(sc, indexRanges, float64(t.Count), colsLen)
 	}
-	result, err := idx.getRowCount(sc, indexRanges)
+	result, err := idx.getRowCount(sc, indexRanges, t.ModifyCount)
 	result *= idx.getIncreaseFactor(t.Count)
 	return result, errors.Trace(err)
 }

diff --git a/statistics/update_test.go b/statistics/update_test.go
@@ -519,7 +519,7 @@ func (s *testStatsUpdateSuite) TestQueryFeedback(c *C) {
 	}
 
 	// Feedback from limit executor may not be accurate.
-	testKit.MustQuery("select * from t where t.a <= 2 limit 1")
+	testKit.MustQuery("select * from t where t.a <= 5 limit 1")
 	h.DumpStatsDeltaToKV()
 	feedback := h.GetQueryFeedback()
 	c.Assert(len(feedback), Equals, 0)