Skip to content

Commit

Permalink
statistics: fix some problem related to stats async load (#57723)
Browse files Browse the repository at this point in the history
close #57722, close #57735
  • Loading branch information
winoros authored Nov 27, 2024
1 parent ca395fa commit 2b03447
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 32 deletions.
2 changes: 1 addition & 1 deletion pkg/statistics/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ go_test(
data = glob(["testdata/**"]),
embed = [":statistics"],
flaky = True,
shard_count = 37,
shard_count = 38,
deps = [
"//pkg/config",
"//pkg/meta/model",
Expand Down
10 changes: 10 additions & 0 deletions pkg/statistics/column.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,3 +260,13 @@ func (c *Column) StatsAvailable() bool {
// StatsVer, so we check NDV > 0 || NullCount > 0 for the case.
return c.IsAnalyzed() || c.NDV > 0 || c.NullCount > 0
}

// EmptyColumn returns a placeholder Column for the given table and column info.
// The histogram is constructed with all-zero arguments, so the result carries no
// real statistics. It may be used for pseudo estimation or to stop repeated
// attempts to load stats that do not exist.
//
// tid is stored as the column's PhysicalID. The column is flagged as the handle
// only when pkIsHandle is true and colInfo carries the primary-key flag.
func EmptyColumn(tid int64, pkIsHandle bool, colInfo *model.ColumnInfo) *Column {
	hist := NewHistogram(colInfo.ID, 0, 0, 0, &colInfo.FieldType, 0, 0)
	isHandle := pkIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag())

	col := new(Column)
	col.PhysicalID = tid
	col.Info = colInfo
	col.Histogram = *hist
	col.IsHandle = isHandle
	return col
}
46 changes: 27 additions & 19 deletions pkg/statistics/handle/storage/read.go
Original file line number Diff line number Diff line change
Expand Up @@ -632,30 +632,38 @@ func CleanFakeItemsForShowHistInFlights(statsCache statstypes.StatsCache) int {
}

func loadNeededColumnHistograms(sctx sessionctx.Context, statsHandle statstypes.StatsHandle, col model.TableItemID, loadFMSketch bool, fullLoad bool) (err error) {
tbl, ok := statsHandle.Get(col.TableID)
statsTbl, ok := statsHandle.Get(col.TableID)
if !ok {
return nil
}

var colInfo *model.ColumnInfo
_, loadNeeded, analyzed := tbl.ColumnIsLoadNeeded(col.ID, true)
if !loadNeeded || !analyzed {
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
return nil
}

// Now, we cannot init the column info in the ColAndIdxExistenceMap when to disable lite-init-stats.
// so we have to get the column info from the domain.
is := sctx.GetDomainInfoSchema().(infoschema.InfoSchema)
tblInfo, ok := statsHandle.TableInfoByID(is, col.TableID)
tbl, ok := statsHandle.TableInfoByID(is, col.TableID)
if !ok {
return nil
}
colInfo = tblInfo.Meta().GetColumnByID(col.ID)
tblInfo := tbl.Meta()
colInfo := tblInfo.GetColumnByID(col.ID)
if colInfo == nil {
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
return nil
}

_, loadNeeded, analyzed := statsTbl.ColumnIsLoadNeeded(col.ID, true)
if !loadNeeded || !analyzed {
// If this column is not analyzed yet and we don't have it in memory.
// We create a fake one for the pseudo estimation.
// Otherwise, it will trigger the sync/async load again, even if the column has not been analyzed.
if loadNeeded && !analyzed {
fakeCol := statistics.EmptyColumn(tblInfo.ID, tblInfo.PKIsHandle, colInfo)
statsTbl.SetCol(col.ID, fakeCol)
statsHandle.UpdateStatsCache([]*statistics.Table{statsTbl}, nil)
}
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
return nil
}

hg, _, statsVer, _, err := HistMetaFromStorageWithHighPriority(sctx, &col, colInfo)
if hg == nil || err != nil {
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
Expand Down Expand Up @@ -690,29 +698,29 @@ func loadNeededColumnHistograms(sctx sessionctx.Context, statsHandle statstypes.
CMSketch: cms,
TopN: topN,
FMSketch: fms,
IsHandle: tblInfo.Meta().PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
IsHandle: tblInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
StatsVer: statsVer,
}
// Reload the latest stats cache, otherwise the `updateStatsCache` may fail with high probability, because functions
// like `GetPartitionStats` called in `fmSketchFromStorage` would have modified the stats cache already.
tbl, ok = statsHandle.Get(col.TableID)
statsTbl, ok = statsHandle.Get(col.TableID)
if !ok {
return nil
}
tbl = tbl.Copy()
statsTbl = statsTbl.Copy()
if colHist.StatsAvailable() {
if fullLoad {
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
} else {
colHist.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus()
}
tbl.LastAnalyzeVersion = max(tbl.LastAnalyzeVersion, colHist.LastUpdateVersion)
if statsVer != statistics.Version0 {
tbl.StatsVer = int(statsVer)
statsTbl.LastAnalyzeVersion = max(statsTbl.LastAnalyzeVersion, colHist.LastUpdateVersion)
statsTbl.StatsVer = int(statsVer)
}
}
tbl.SetCol(col.ID, colHist)
statsHandle.UpdateStatsCache([]*statistics.Table{tbl}, nil)
statsTbl.SetCol(col.ID, colHist)
statsHandle.UpdateStatsCache([]*statistics.Table{statsTbl}, nil)
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
if col.IsSyncLoadFailed {
logutil.BgLogger().Warn("Hist for column should already be loaded as sync but not found.",
Expand Down Expand Up @@ -771,9 +779,9 @@ func loadNeededIndexHistograms(sctx sessionctx.Context, is infoschema.InfoSchema
tbl = tbl.Copy()
if idxHist.StatsVer != statistics.Version0 {
tbl.StatsVer = int(idxHist.StatsVer)
tbl.LastAnalyzeVersion = max(tbl.LastAnalyzeVersion, idxHist.LastUpdateVersion)
}
tbl.SetIdx(idx.ID, idxHist)
tbl.LastAnalyzeVersion = max(tbl.LastAnalyzeVersion, idxHist.LastUpdateVersion)
statsHandle.UpdateStatsCache([]*statistics.Table{tbl}, nil)
if idx.IsSyncLoadFailed {
logutil.BgLogger().Warn("Hist for index should already be loaded as sync but not found.",
Expand Down
8 changes: 2 additions & 6 deletions pkg/statistics/handle/syncload/stats_syncload.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,13 +357,9 @@ func (s *statsSyncLoad) handleOneItemTask(task *statstypes.NeededItemTask) (err

// If this column is not analyzed yet and we don't have it in memory.
// We create a fake one for the pseudo estimation.
// Otherwise, it will trigger the sync/async load again, even if the column has not been analyzed.
if loadNeeded && !analyzed {
wrapper.col = &statistics.Column{
PhysicalID: item.TableID,
Info: wrapper.colInfo,
Histogram: *statistics.NewHistogram(item.ID, 0, 0, 0, &wrapper.colInfo.FieldType, 0, 0),
IsHandle: isPkIsHandle && mysql.HasPriKeyFlag(wrapper.colInfo.GetFlag()),
}
wrapper.col = statistics.EmptyColumn(item.TableID, isPkIsHandle, wrapper.colInfo)
s.updateCachedItem(tblInfo, item, wrapper.col, wrapper.idx, task.Item.FullLoad)
return nil
}
Expand Down
22 changes: 22 additions & 0 deletions pkg/statistics/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -590,3 +590,25 @@ func TestGlobalIndexWithAnalyzeVersion1AndHistoricalStats(t *testing.T) {
// Each analyze will only generate one record
tk.MustQuery(fmt.Sprintf("select count(*) from mysql.stats_history where table_id=%d", tblID)).Equal(testkit.Rows("10"))
}

// TestLastAnalyzeVersionNotChangedWithAsyncStatsLoad checks that running
// LoadNeededHistograms on a table that has never been analyzed does not give
// the table a last analyze time in `show stats_meta`.
func TestLastAnalyzeVersionNotChangedWithAsyncStatsLoad(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	tk := testkit.NewTestKit(t, store)
	h := dom.StatsHandle()

	// A zero sync-wait presumably keeps the query below from loading stats
	// synchronously, leaving the work to LoadNeededHistograms.
	tk.MustExec("set @@tidb_stats_load_sync_wait = 0;")
	tk.MustExec("use test")
	tk.MustExec("create table t(a int, b int);")
	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
	require.NoError(t, h.Update(context.Background(), dom.InfoSchema()))
	tk.MustExec("insert into t values (1, 1);")
	require.NoError(t, h.DumpStatsDeltaToKV(true))
	tk.MustExec("alter table t add column c int default 1;")
	// Check the DDL event error here as well, consistent with the create-table
	// event above, so a failure is reported at its source.
	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
	tk.MustExec("select * from t where a = 1 or b = 1 or c = 1;")
	require.NoError(t, h.LoadNeededHistograms(dom.InfoSchema()))
	result := tk.MustQuery("show stats_meta where table_name = 't'")
	require.Len(t, result.Rows(), 1)
	// The last analyze time.
	require.Equal(t, "<nil>", result.Rows()[0][6])
}
13 changes: 7 additions & 6 deletions pkg/statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ func (t *Table) GetStatsHealthy() (int64, bool) {
}

// ColumnIsLoadNeeded checks whether the column needs trigger the async/sync load.
// The Column should be visible in the table and really has analyzed statistics in the stroage.
// The Column should be visible in the table and really has analyzed statistics in the storage.
// Also, if the stats has been loaded into the memory, we also don't need to load it.
// We return the Column together with the checking result, to avoid accessing the map multiple times.
// The first bool is whether we need to load it into memory. The second bool is whether this column has stats in the system table or not.
Expand All @@ -820,23 +820,24 @@ func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool, bool
return nil, false, false
}
// when we use non-lite init stats, it cannot init the stats for common columns.
// so we need to foce to load the stats.
// so we need to force to load the stats.
col, ok := t.columns[id]
if !ok {
return nil, true, true
}
hasAnalyzed := t.ColAndIdxExistenceMap.HasAnalyzed(id, false)

// If it's not analyzed yet.
// The real check condition: !ok && !hasAnalyzed.
// After this check, we will always have ok && hasAnalyzed.
if !hasAnalyzed {
return nil, false, false
}

// Restore the condition from the simplified form:
// 1. !ok && hasAnalyzed => need load
// 2. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load
// 3. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load
if !ok || (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) {
// 1. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load
// 2. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load
if (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) {
return col, true, true
}

Expand Down

0 comments on commit 2b03447

Please sign in to comment.