diff --git a/statistics/handle/dump.go b/statistics/handle/dump.go index 71038cd9a74c5..72273b87dd60a 100644 --- a/statistics/handle/dump.go +++ b/statistics/handle/dump.go @@ -300,18 +300,21 @@ func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *J } hist := statistics.HistogramFromProto(jsonCol.Histogram) sc := &stmtctx.StatementContext{TimeZone: time.UTC} - // Deal with sortKey, the length of sortKey maybe longer than the column's length. - orgLen := colInfo.FieldType.GetFlen() - if types.IsString(colInfo.FieldType.GetType()) { - colInfo.SetFlen(types.UnspecifiedLength) + tmpFT := colInfo.FieldType + // For new collation data, when storing the bounds of the histogram, we store the collate key instead of the + // original value. + // But there's additional conversion logic for new collation data, and the collate key might be longer than + // the FieldType.flen. + // If we use the original FieldType here, there might be errors like "Invalid utf8mb4 character string" + // or "Data too long". + // So we change it to TypeBlob to bypass those logics here. + if colInfo.FieldType.EvalType() == types.ETString && colInfo.FieldType.GetType() != mysql.TypeEnum && colInfo.FieldType.GetType() != mysql.TypeSet { + tmpFT = *types.NewFieldType(mysql.TypeBlob) } - hist, err := hist.ConvertTo(sc, &colInfo.FieldType) + hist, err := hist.ConvertTo(sc, &tmpFT) if err != nil { return nil, errors.Trace(err) } - if types.IsString(colInfo.FieldType.GetType()) { - colInfo.SetFlen(orgLen) - } cm, topN := statistics.CMSketchAndTopNFromProto(jsonCol.CMSketch) fms := statistics.FMSketchFromProto(jsonCol.FMSketch) hist.ID, hist.NullCount, hist.LastUpdateVersion, hist.TotColSize, hist.Correlation = colInfo.ID, jsonCol.NullCount, jsonCol.LastUpdateVersion, jsonCol.TotColSize, jsonCol.Correlation diff --git a/statistics/handle/dump_test.go b/statistics/handle/dump_test.go index d2c67d49dcdbe..f43fe927f00aa 100644 --- a/statistics/handle/dump_test.go +++ b/statistics/handle/dump_test.go @@ -400,7 +400,7 @@ func TestLoadStatsForNewCollation(t *testing.T) { tk.MustExec("use test") tk.MustExec("drop table if exists t") tk.MustExec("create table t(a int, b varchar(3) collate utf8mb4_unicode_ci)") - tk.MustExec("insert into t value(1, 'aaa'), (3, 'aab'), (5, 'bba'), (2, 'bbb'), (4, 'cca'), (6, 'ccc')") + tk.MustExec("insert into t value(1, 'aaa'), (1, 'aaa'), (3, 'aab'), (3, 'aab'), (5, 'bba'), (2, 'bbb'), (4, 'cca'), (6, 'ccc'), (7, 'Ste')") // mark column stats as needed tk.MustExec("select * from t where a = 3") tk.MustExec("select * from t where b = 'bbb'") diff --git a/statistics/handle/handle.go b/statistics/handle/handle.go index c213338d44b8e..37bed98516de2 100644 --- a/statistics/handle/handle.go +++ b/statistics/handle/handle.go @@ -1355,9 +1355,13 @@ func (h *Handle) histogramFromStorage(reader *statsReader, tableID int64, colID } else { sc := &stmtctx.StatementContext{TimeZone: time.UTC} d := rows[i].GetDatum(2, &fields[2].Column.FieldType) - // When there's new collation data, the length of bounds of histogram(the collate key) might be - // longer than the FieldType.flen of this column. - // We change it to TypeBlob to bypass the length check here. + // For new collation data, when storing the bounds of the histogram, we store the collate key instead of the + // original value. + // But there's additional conversion logic for new collation data, and the collate key might be longer than + // the FieldType.flen. + // If we use the original FieldType here, there might be errors like "Invalid utf8mb4 character string" + // or "Data too long". + // So we change it to TypeBlob to bypass those logics here. if tp.EvalType() == types.ETString && tp.GetType() != mysql.TypeEnum && tp.GetType() != mysql.TypeSet { tp = types.NewFieldType(mysql.TypeBlob) }