Skip to content

Commit

Permalink
stats: refine the row count estimation for outdated stats (pingcap#7175)
Browse files Browse the repository at this point in the history
  • Loading branch information
alivxxx authored and Haibin Xie committed Jul 30, 2018
1 parent dae50f0 commit ff08c15
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 21 deletions.
12 changes: 6 additions & 6 deletions plan/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -496,8 +496,8 @@ func (s *testAnalyzeSuite) TestOutdatedAnalyze(c *C) {
plan.RatioOfPseudoEstimate = 10.0
testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
"TableScan_5 Selection_6 cop table:t, range:[-inf,+inf], keep order:false 80.00",
"Selection_6 TableScan_5 cop le(test.t.a, 5), le(test.t.b, 5) 28.80",
"TableReader_7 root data:Selection_6 28.80",
"Selection_6 TableScan_5 cop le(test.t.a, 5), le(test.t.b, 5) 35.91",
"TableReader_7 root data:Selection_6 35.91",
))
plan.RatioOfPseudoEstimate = 0.7
testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
Expand Down Expand Up @@ -607,16 +607,16 @@ func (s *testAnalyzeSuite) TestLimit(c *C) {
}
testKit.MustExec("analyze table t")
testKit.MustQuery("explain select * from t use index(idx) where a > 1 and b > 1 and c > 1 limit 1").Check(testkit.Rows(
"IndexScan_13 Selection_15 cop table:t, index:a, b, range:(1 +inf,+inf +inf], keep order:false 1.56",
"Selection_15 IndexScan_13 cop gt(test.t.b, 1) 1.25",
"TableScan_14 Selection_16 cop table:t, keep order:false 1.25",
"IndexScan_13 Selection_15 cop table:t, index:a, b, range:(1 +inf,+inf +inf], keep order:false 1.10",
"Selection_15 IndexScan_13 cop gt(test.t.b, 1) 1.00",
"TableScan_14 Selection_16 cop table:t, keep order:false 1.00",
"Selection_16 Limit_17 TableScan_14 cop gt(test.t.c, 1) 1.00",
"Limit_17 Selection_16 cop offset:0, count:1 1.00",
"IndexLookUp_18 Limit_9 root index:Selection_15, table:Limit_17 1.00",
"Limit_9 IndexLookUp_18 root offset:0, count:1 1.00",
))
testKit.MustQuery("explain select * from t where a > 1 and c > 1 limit 1").Check(testkit.Rows(
"TableScan_11 Selection_12 cop table:t, range:(1,+inf], keep order:false 1.25",
"TableScan_11 Selection_12 cop table:t, range:(1,+inf], keep order:false 1.11",
"Selection_12 Limit_15 TableScan_11 cop gt(test.t.c, 1) 1.00",
"Limit_15 Selection_12 cop offset:0, count:1 1.00",
"TableReader_16 Limit_8 root data:Limit_15 1.00",
Expand Down
2 changes: 1 addition & 1 deletion statistics/ddl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func (s *testStatsCacheSuite) TestDDLHistogram(c *C) {
c.Assert(count, Equals, float64(2))
count, err = statsTbl.ColumnEqualRowCount(sc, types.NewIntDatum(1), tableInfo.Columns[3].ID)
c.Assert(err, IsNil)
c.Assert(count, Equals, float64(0))
c.Assert(count, Equals, float64(2))

testKit.MustExec("alter table t add column c4 datetime NOT NULL default CURRENT_TIMESTAMP")
err = h.HandleDDLEvent(<-h.DDLEventCh())
Expand Down
38 changes: 30 additions & 8 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -615,6 +615,15 @@ func MergeHistograms(sc *stmtctx.StatementContext, lh *Histogram, rh *Histogram,
return lh, nil
}

// outOfRange reports whether val falls outside the span covered by the
// histogram bounds: it returns true when the first row of Bounds compares
// greater than val, or the last row of Bounds compares less than val
// (both compared on column 0 via chunk.Compare).
//
// A histogram whose Bounds is nil or holds no rows covers nothing, so every
// value is reported as out of range.
func (hg *Histogram) outOfRange(val types.Datum) bool {
	if hg.Bounds == nil {
		return true
	}
	numRows := hg.Bounds.NumRows()
	// Guard the empty case explicitly; otherwise the comparison below would
	// ask for rows 0 and -1 of a chunk that has no rows.
	if numRows == 0 {
		return true
	}
	return chunk.Compare(hg.Bounds.GetRow(0), 0, &val) > 0 ||
		chunk.Compare(hg.Bounds.GetRow(numRows-1), 0, &val) < 0
}

// Column represents a column histogram.
type Column struct {
Histogram
Expand All @@ -631,19 +640,22 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (f
if val.IsNull() {
return float64(c.NullCount), nil
}
if c.CMSketch != nil {
count, err := c.CMSketch.queryValue(sc, val)
return float64(count), errors.Trace(err)
}
// All the values are null.
if c.Histogram.Bounds == nil {
return 0.0, nil
}
if c.NDV > 0 && c.outOfRange(val) {
return c.totalRowCount() / (float64(c.NDV)), nil
}
if c.CMSketch != nil {
count, err := c.CMSketch.queryValue(sc, val)
return float64(count), errors.Trace(err)
}
return c.Histogram.equalRowCount(val), nil
}

// getColumnRowCount estimates the row count by a slice of NewRange.
func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.NewRange) (float64, error) {
// getColumnRowCount estimates the row count by a slice of Range.
func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.NewRange, modifyCount int64) (float64, error) {
var rowCount float64
for _, rg := range ranges {
cmp, err := rg.LowVal[0].CompareDatum(sc, &rg.HighVal[0])
Expand All @@ -664,6 +676,9 @@ func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
}
// the interval case.
cnt := c.betweenRowCount(rg.LowVal[0], rg.HighVal[0])
if c.outOfRange(rg.LowVal[0]) || c.outOfRange(rg.HighVal[0]) {
cnt += float64(modifyCount) / outOfRangeBetweenRate
}
if rg.LowExclude {
lowCnt, err := c.equalRowCount(sc, rg.LowVal[0])
if err != nil {
Expand Down Expand Up @@ -700,13 +715,17 @@ func (idx *Index) String() string {
}

func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte) float64 {
val := types.NewBytesDatum(b)
if idx.NDV > 0 && idx.outOfRange(val) {
return idx.totalRowCount() / (float64(idx.NDV))
}
if idx.CMSketch != nil {
return float64(idx.CMSketch.queryBytes(b))
}
return idx.Histogram.equalRowCount(types.NewBytesDatum(b))
return idx.Histogram.equalRowCount(val)
}

func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.NewRange) (float64, error) {
func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.NewRange, modifyCount int64) (float64, error) {
totalCount := float64(0)
for _, indexRange := range indexRanges {
lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...)
Expand All @@ -733,6 +752,9 @@ func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*range
l := types.NewBytesDatum(lb)
r := types.NewBytesDatum(rb)
totalCount += idx.betweenRowCount(l, r)
if idx.outOfRange(l) || idx.outOfRange(r) {
totalCount += float64(modifyCount) / outOfRangeBetweenRate
}
}
if totalCount > idx.totalRowCount() {
totalCount = idx.totalRowCount()
Expand Down
59 changes: 57 additions & 2 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
package statistics_test

import (
"fmt"
"math"
"os"
"runtime/pprof"
Expand All @@ -32,6 +33,7 @@ import (
"github.com/pingcap/tidb/statistics"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/codec"
"github.com/pingcap/tidb/util/ranger"
"github.com/pingcap/tidb/util/testkit"
)

Expand Down Expand Up @@ -152,7 +154,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
},
{
exprs: "a >= 1 and b > 1 and a < 2",
selectivity: 0.01783264746,
selectivity: 0.01817558299,
},
{
exprs: "a >= 1 and c > 1 and a < 2",
Expand All @@ -168,7 +170,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
},
{
exprs: "b > 1",
selectivity: 0.96296296296,
selectivity: 0.98148148148,
},
{
exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5",
Expand Down Expand Up @@ -214,6 +216,59 @@ func (s *testSelectivitySuite) TestPseudoSelectivity(c *C) {
"Projection_4 TableReader_6 root test.t1.b 1.00"))
}

// getRange returns a one-element slice holding a single NewRange whose only
// low value is the int datum for start and whose only high value is the int
// datum for end.
func getRange(start, end int64) []*ranger.NewRange {
	low := types.NewIntDatum(start)
	high := types.NewIntDatum(end)
	return []*ranger.NewRange{{
		LowVal:  []types.Datum{low},
		HighVal: []types.Datum{high},
	}}
}

// TestEstimationForUnknownValues checks the row count estimates returned for
// ranges that reach beyond the values seen by the last analyze.
//
// The table is analyzed after rows 0..9 are inserted, then rows 10..19 are
// inserted without re-analyzing, and the estimates for column a and index idx
// are compared against fixed expected values.
func (s *testSelectivitySuite) TestEstimationForUnknownValues(c *C) {
	tk := testkit.NewTestKit(c, s.store)
	tk.MustExec("use test")
	tk.MustExec("drop table if exists t")
	tk.MustExec("create table t(a int, b int, key idx(a, b))")
	tk.MustExec("analyze table t")

	// insertTen inserts the rows (v, v) for v in [base, base+10).
	insertTen := func(base int) {
		for i := 0; i < 10; i++ {
			v := i + base
			tk.MustExec(fmt.Sprintf("insert into t values (%d, %d)", v, v))
		}
	}

	handle := s.dom.StatsHandle()

	// First batch: covered by the analyze that follows it.
	insertTen(0)
	handle.DumpStatsDeltaToKV()
	tk.MustExec("analyze table t")

	// Second batch: inserted after the analyze, only the delta is dumped.
	insertTen(10)
	handle.DumpStatsDeltaToKV()
	c.Assert(handle.Update(s.dom.InfoSchema()), IsNil)

	tbl, err := s.dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	c.Assert(err, IsNil)
	statsTbl := handle.GetTableStats(tbl.Meta())
	sc := &stmtctx.StatementContext{}

	type rangeCase struct {
		low, high int64
		expected  float64
	}

	// Estimates through the column statistics of a.
	colID := tbl.Meta().Columns[0].ID
	for _, tc := range []rangeCase{
		{30, 30, 2.0},
		{9, 30, 4.2},
		{9, math.MaxInt64, 4.2},
	} {
		got, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(tc.low, tc.high))
		c.Assert(err, IsNil)
		c.Assert(got, Equals, tc.expected)
	}

	// Estimates through the index statistics of idx.
	idxID := tbl.Meta().Indices[0].ID
	for _, tc := range []rangeCase{
		{30, 30, 0.2},
		{9, 30, 2.2},
	} {
		got, err := statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(tc.low, tc.high))
		c.Assert(err, IsNil)
		c.Assert(got, Equals, tc.expected)
	}
}

func BenchmarkSelectivity(b *testing.B) {
c := &C{}
s := &testSelectivitySuite{}
Expand Down
8 changes: 5 additions & 3 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ const (
pseudoEqualRate = 1000
pseudoLessRate = 3
pseudoBetweenRate = 40

outOfRangeBetweenRate = 100
)

// Table represents statistics for a table.
Expand Down Expand Up @@ -327,7 +329,7 @@ func (t *Table) GetRowCountByIntColumnRanges(sc *stmtctx.StatementContext, colID
return getPseudoRowCountByUnsignedIntRanges(intRanges, float64(t.Count)), nil
}
c := t.Columns[colID]
result, err := c.getColumnRowCount(sc, intRanges)
result, err := c.getColumnRowCount(sc, intRanges, t.ModifyCount)
result *= c.getIncreaseFactor(t.Count)
return result, errors.Trace(err)
}
Expand All @@ -338,7 +340,7 @@ func (t *Table) GetRowCountByColumnRanges(sc *stmtctx.StatementContext, colID in
return getPseudoRowCountByColumnRanges(sc, float64(t.Count), colRanges, 0)
}
c := t.Columns[colID]
result, err := c.getColumnRowCount(sc, colRanges)
result, err := c.getColumnRowCount(sc, colRanges, t.ModifyCount)
result *= c.getIncreaseFactor(t.Count)
return result, errors.Trace(err)
}
Expand All @@ -353,7 +355,7 @@ func (t *Table) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idxID int
}
return getPseudoRowCountByIndexRanges(sc, indexRanges, float64(t.Count), colsLen)
}
result, err := idx.getRowCount(sc, indexRanges)
result, err := idx.getRowCount(sc, indexRanges, t.ModifyCount)
result *= idx.getIncreaseFactor(t.Count)
return result, errors.Trace(err)
}
Expand Down
2 changes: 1 addition & 1 deletion statistics/update_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ func (s *testStatsUpdateSuite) TestQueryFeedback(c *C) {
}

// Feedback from limit executor may not be accurate.
testKit.MustQuery("select * from t where t.a <= 2 limit 1")
testKit.MustQuery("select * from t where t.a <= 5 limit 1")
h.DumpStatsDeltaToKV()
feedback := h.GetQueryFeedback()
c.Assert(len(feedback), Equals, 0)
Expand Down

0 comments on commit ff08c15

Please sign in to comment.