From c578d19c9fd7a7d8c1e34c00babc61e02947c6ee Mon Sep 17 00:00:00 2001 From: tangenta Date: Thu, 30 Jun 2022 15:36:39 +0800 Subject: [PATCH] parser, ddl: support decoding binary literal in set/enum (#35822) close pingcap/tidb#31338 --- .../r/new_character_set_invalid.result | 32 +++++++++++++ .../t/new_character_set_invalid.test | 15 +++++++ ddl/ddl_api.go | 21 +++++++++ ddl/integration_test.go | 14 +++--- parser/ast/misc.go | 22 --------- parser/ast/util_test.go | 2 + parser/parser.go | 22 ++++----- parser/parser.y | 22 ++++----- parser/parser_test.go | 2 + parser/types/field_type.go | 45 +++++++++++++++---- 10 files changed, 140 insertions(+), 57 deletions(-) diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result index 92e43d6c747fa..bb9ae4aa6db2b 100644 --- a/cmd/explaintest/r/new_character_set_invalid.result +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -21,3 +21,35 @@ a b c ? ? ? 中文?中文 asdf?fdsa 字符集?字符集 @@ @@ @@ +set @@sql_mode = default; +drop table t; +create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)); +show create table t; +Table Create Table +t CREATE TABLE `t` ( + `f` set('һ','??') DEFAULT NULL, + `e` enum('??','jY') DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin +drop table t; +create table t( e enum(0xBAEC,0x6A59)); +show create table t; +Table Create Table +t CREATE TABLE `t` ( + `e` enum('??','jY') DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin +drop table t; +create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)) collate gbk_bin; +show create table t; +Table Create Table +t CREATE TABLE `t` ( + `f` set('一','三') COLLATE gbk_bin DEFAULT NULL, + `e` enum('红','jY') COLLATE gbk_bin DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=gbk COLLATE=gbk_bin +drop table t; +create table t( e enum(0xBAEC,0x6A59)) collate gbk_bin; +show create table t; +Table Create Table +t CREATE TABLE `t` ( + `e` enum('红','jY') COLLATE gbk_bin DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=gbk COLLATE=gbk_bin +set @@sql_mode = ''; diff --git a/cmd/explaintest/t/new_character_set_invalid.test b/cmd/explaintest/t/new_character_set_invalid.test index eaed9ba78c518..369f72362cfc7 100644 --- a/cmd/explaintest/t/new_character_set_invalid.test +++ b/cmd/explaintest/t/new_character_set_invalid.test @@ -15,3 +15,18 @@ insert into t values ('À', 'ø', '😂'); insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); select * from t; + +set @@sql_mode = default; +drop table t; +create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)); +show create table t; +drop table t; +create table t( e enum(0xBAEC,0x6A59)); +show create table t; +drop table t; +create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)) collate gbk_bin; +show create table t; +drop table t; +create table t( e enum(0xBAEC,0x6A59)) collate gbk_bin; +show create table t; +set @@sql_mode = ''; diff --git a/ddl/ddl_api.go b/ddl/ddl_api.go index 2dcde0a82f03b..bb3f4ba4c28d0 100644 --- a/ddl/ddl_api.go +++ b/ddl/ddl_api.go @@ -56,6 +56,7 @@ import ( "github.com/pingcap/tidb/util/collate" "github.com/pingcap/tidb/util/dbterror" "github.com/pingcap/tidb/util/domainutil" + "github.com/pingcap/tidb/util/hack" "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tidb/util/mathutil" "github.com/pingcap/tidb/util/mock" @@ -820,6 +821,24 @@ func setCharsetCollationFlenDecimal(tp *types.FieldType, colName, colCharset, co return checkTooBigFieldLengthAndTryAutoConvert(tp, colName, sessVars) } +func decodeEnumSetBinaryLiteralToUTF8(tp *types.FieldType, chs string) { + if tp.GetType() != mysql.TypeEnum && tp.GetType() != mysql.TypeSet { + return + } + enc := charset.FindEncoding(chs) + for i, elem := range tp.GetElems() { + if !tp.GetElemIsBinaryLit(i) { + continue + } + s, err := enc.Transform(nil, hack.Slice(elem), charset.OpDecodeReplace) + if err != nil { + logutil.BgLogger().Warn("decode enum binary literal to utf-8 failed", zap.Error(err)) + } + tp.SetElem(i, string(hack.String(s))) + } + tp.CleanElemIsBinaryLit() +} + // buildColumnAndConstraint builds table.Column and ast.Constraint from the parameters. // outPriKeyConstraint is the primary key constraint out of column definition. For example: // `create table t1 (id int , age int, primary key(id));` @@ -852,6 +871,7 @@ func buildColumnAndConstraint( if err := setCharsetCollationFlenDecimal(colDef.Tp, colDef.Name.Name.O, chs, coll, ctx.GetSessionVars()); err != nil { return nil, nil, errors.Trace(err) } + decodeEnumSetBinaryLiteralToUTF8(colDef.Tp, chs) col, cts, err := columnDefToCol(ctx, offset, colDef, outPriKeyConstraint) if err != nil { return nil, nil, errors.Trace(err) @@ -4523,6 +4543,7 @@ func (d *ddl) getModifiableColumnJob(ctx context.Context, sctx sessionctx.Contex if err = setCharsetCollationFlenDecimal(&newCol.FieldType, newCol.Name.O, chs, coll, sctx.GetSessionVars()); err != nil { return nil, errors.Trace(err) } + decodeEnumSetBinaryLiteralToUTF8(&newCol.FieldType, chs) // Check the column with foreign key, waiting for the default flen and decimal. if fkInfo := getColumnForeignKeyInfo(originalColName.L, t.Meta().ForeignKeys); fkInfo != nil { diff --git a/ddl/integration_test.go b/ddl/integration_test.go index 34edd8dfe34d6..99ba587322f88 100644 --- a/ddl/integration_test.go +++ b/ddl/integration_test.go @@ -68,19 +68,19 @@ func TestDefaultValueInEnum(t *testing.T) { tk.MustExec("use test;") // The value 0x91 should not cause panic. tk.MustExec("create table t(a enum('a', 0x91) charset gbk);") - tk.MustExec("insert into t values (1), (2);") // Use 1-base index to locate the value. - tk.MustQuery("select a from t;").Check(testkit.Rows("a", "")) // 0x91 is truncate. + tk.MustExec("insert into t values (1), (2);") // Use 1-base index to locate the value. + tk.MustQuery("select a from t;").Check(testkit.Rows("a", "?")) // 0x91 is replaced to '?'. tk.MustExec("drop table t;") tk.MustExec("create table t (a enum('a', 0x91)) charset gbk;") // Test for table charset. tk.MustExec("insert into t values (1), (2);") - tk.MustQuery("select a from t;").Check(testkit.Rows("a", "")) + tk.MustQuery("select a from t;").Check(testkit.Rows("a", "?")) tk.MustExec("drop table t;") - tk.MustGetErrMsg("create table t(a set('a', 0x91, '') charset gbk);", - "[types:1291]Column 'a' has duplicated value '' in SET") - // Test valid utf-8 string value in enum. Note that the binary literal only can be decoded to utf-8. + tk.MustGetErrMsg("create table t(a set('a', 0x91, '?') charset gbk);", + "[types:1291]Column 'a' has duplicated value '?' in SET") + // Test valid utf-8 string value in enum. tk.MustExec("create table t (a enum('a', 0xE4BDA0E5A5BD) charset gbk);") tk.MustExec("insert into t values (1), (2);") - tk.MustQuery("select a from t;").Check(testkit.Rows("a", "你好")) + tk.MustQuery("select a from t;").Check(testkit.Rows("a", "浣犲ソ")) } func TestDDLStatementsBackFill(t *testing.T) { diff --git a/parser/ast/misc.go b/parser/ast/misc.go index 38c465568634a..08e14575c53eb 100644 --- a/parser/ast/misc.go +++ b/parser/ast/misc.go @@ -22,7 +22,6 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/tidb/parser/auth" - "github.com/pingcap/tidb/parser/charset" "github.com/pingcap/tidb/parser/format" "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/parser/mysql" @@ -3619,27 +3618,6 @@ type TextString struct { IsBinaryLiteral bool } -// TransformTextStrings converts a slice of TextString to strings. -// This is only used by enum/set strings. -func TransformTextStrings(ts []*TextString, _ string) []string { - // The UTF-8 encoding rather than other encoding is used - // because parser is not possible to determine the "real" - // charset that a binary literal string should be converted to. - enc := charset.EncodingUTF8Impl - ret := make([]string, 0, len(ts)) - for _, t := range ts { - if !t.IsBinaryLiteral { - ret = append(ret, t.Value) - } else { - // Validate the binary literal string. - // See https://github.com/pingcap/tidb/issues/30740. - r, _ := enc.Transform(nil, charset.HackSlice(t.Value), charset.OpDecodeNoErr) - ret = append(ret, charset.HackString(r)) - } - } - return ret -} - type BinaryLiteral interface { ToString() string } diff --git a/parser/ast/util_test.go b/parser/ast/util_test.go index 015f5dc5cc4eb..b43fac39ae481 100644 --- a/parser/ast/util_test.go +++ b/parser/ast/util_test.go @@ -147,6 +147,8 @@ func (checker *nodeTextCleaner) Enter(in Node) (out Node, skipChildren bool) { } case *Join: node.ExplicitParens = false + case *ColumnDef: + node.Tp.CleanElemIsBinaryLit() } return in, false } diff --git a/parser/parser.go b/parser/parser.go index 051b5550596c1..9f9112dc76731 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -19881,12 +19881,13 @@ yynewstate: tp := types.NewFieldType(mysql.TypeEnum) elems := yyS[yypt-2].item.([]*ast.TextString) opt := yyS[yypt-0].item.(*ast.OptBinary) - tp.SetElems(ast.TransformTextStrings(elems, opt.Charset)) + tp.SetElems(make([]string, len(elems))) fieldLen := -1 // enum_flen = max(ele_flen) - for i := range tp.GetElems() { - tp.SetElem(i, strings.TrimRight(tp.GetElem(i), " ")) - if len(tp.GetElem(i)) > fieldLen { - fieldLen = len(tp.GetElem(i)) + for i, e := range elems { + trimmed := strings.TrimRight(e.Value, " ") + tp.SetElemWithIsBinaryLit(i, trimmed, e.IsBinaryLiteral) + if len(trimmed) > fieldLen { + fieldLen = len(trimmed) } } tp.SetFlen(fieldLen) @@ -19901,11 +19902,12 @@ yynewstate: tp := types.NewFieldType(mysql.TypeSet) elems := yyS[yypt-2].item.([]*ast.TextString) opt := yyS[yypt-0].item.(*ast.OptBinary) - tp.SetElems(ast.TransformTextStrings(elems, opt.Charset)) - fieldLen := len(tp.GetElems()) - 1 // set_flen = sum(ele_flen) + number_of_ele - 1 - for i := range tp.GetElems() { - tp.SetElem(i, strings.TrimRight(tp.GetElem(i), " ")) - fieldLen += len(tp.GetElem(i)) + tp.SetElems(make([]string, len(elems))) + fieldLen := len(elems) - 1 // set_flen = sum(ele_flen) + number_of_ele - 1 + for i, e := range elems { + trimmed := strings.TrimRight(e.Value, " ") + tp.SetElemWithIsBinaryLit(i, trimmed, e.IsBinaryLiteral) + fieldLen += len(trimmed) } tp.SetFlen(fieldLen) tp.SetCharset(opt.Charset) diff --git a/parser/parser.y b/parser/parser.y index 131fa55aaaebd..0cda00670a6eb 100644 --- a/parser/parser.y +++ b/parser/parser.y @@ -11896,12 +11896,13 @@ StringType: tp := types.NewFieldType(mysql.TypeEnum) elems := $3.([]*ast.TextString) opt := $5.(*ast.OptBinary) - tp.SetElems(ast.TransformTextStrings(elems, opt.Charset)) + tp.SetElems(make([]string, len(elems))) fieldLen := -1 // enum_flen = max(ele_flen) - for i := range tp.GetElems() { - tp.SetElem(i, strings.TrimRight(tp.GetElem(i), " ")) - if len(tp.GetElem(i)) > fieldLen { - fieldLen = len(tp.GetElem(i)) + for i, e := range elems { + trimmed := strings.TrimRight(e.Value, " ") + tp.SetElemWithIsBinaryLit(i, trimmed, e.IsBinaryLiteral) + if len(trimmed) > fieldLen { + fieldLen = len(trimmed) } } tp.SetFlen(fieldLen) @@ -11916,11 +11917,12 @@ StringType: tp := types.NewFieldType(mysql.TypeSet) elems := $3.([]*ast.TextString) opt := $5.(*ast.OptBinary) - tp.SetElems(ast.TransformTextStrings(elems, opt.Charset)) - fieldLen := len(tp.GetElems()) - 1 // set_flen = sum(ele_flen) + number_of_ele - 1 - for i := range tp.GetElems() { - tp.SetElem(i, strings.TrimRight(tp.GetElem(i), " ")) - fieldLen += len(tp.GetElem(i)) + tp.SetElems(make([]string, len(elems))) + fieldLen := len(elems) - 1 // set_flen = sum(ele_flen) + number_of_ele - 1 + for i, e := range elems { + trimmed := strings.TrimRight(e.Value, " ") + tp.SetElemWithIsBinaryLit(i, trimmed, e.IsBinaryLiteral) + fieldLen += len(trimmed) } tp.SetFlen(fieldLen) tp.SetCharset(opt.Charset) diff --git a/parser/parser_test.go b/parser/parser_test.go index 3ab44e1a232c0..f97ca11feada5 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -6192,6 +6192,8 @@ func (checker *nodeTextCleaner) Enter(in ast.Node) (out ast.Node, skipChildren b node.Specs = specs case *ast.Join: node.ExplicitParens = false + case *ast.ColumnDef: + node.Tp.CleanElemIsBinaryLit() } return in, false } diff --git a/parser/types/field_type.go b/parser/types/field_type.go index aa984e2a945b6..429896d6d790e 100644 --- a/parser/types/field_type.go +++ b/parser/types/field_type.go @@ -53,7 +53,9 @@ type FieldType struct { // collate represent collate rules of the charset collate string // elems is the element list for enum and set type. - elems []string + elems []string + elemsIsBinaryLit []bool + // Please keep in mind that jsonFieldType should be updated if you add a new field here. } // NewFieldType returns a FieldType, @@ -180,10 +182,34 @@ func (ft *FieldType) SetElem(idx int, element string) { ft.elems[idx] = element } +func (ft *FieldType) SetElemWithIsBinaryLit(idx int, element string, isBinaryLit bool) { + ft.elems[idx] = element + if isBinaryLit { + // Create the binary literal flags lazily. + if ft.elemsIsBinaryLit == nil { + ft.elemsIsBinaryLit = make([]bool, len(ft.elems)) + } + ft.elemsIsBinaryLit[idx] = true + } +} + func (ft *FieldType) GetElem(idx int) string { return ft.elems[idx] } +func (ft *FieldType) GetElemIsBinaryLit(idx int) bool { + if len(ft.elemsIsBinaryLit) == 0 { + return false + } + return ft.elemsIsBinaryLit[idx] +} + +func (ft *FieldType) CleanElemIsBinaryLit() { + if ft != nil && ft.elemsIsBinaryLit != nil { + ft.elemsIsBinaryLit = nil + } +} + // Clone returns a copy of itself. func (ft *FieldType) Clone() *FieldType { ret := *ft @@ -506,13 +532,14 @@ func HasCharset(ft *FieldType) bool { // for json type jsonFieldType struct { - Tp byte - Flag uint - Flen int - Decimal int - Charset string - Collate string - Elems []string + Tp byte + Flag uint + Flen int + Decimal int + Charset string + Collate string + Elems []string + ElemsIsBinaryLit []bool } func (ft *FieldType) UnmarshalJSON(data []byte) error { @@ -526,6 +553,7 @@ func (ft *FieldType) UnmarshalJSON(data []byte) error { ft.charset = r.Charset ft.collate = r.Collate ft.elems = r.Elems + ft.elemsIsBinaryLit = r.ElemsIsBinaryLit } return err } @@ -539,5 +567,6 @@ func (ft *FieldType) MarshalJSON() ([]byte, error) { r.Charset = ft.charset r.Collate = ft.collate r.Elems = ft.elems + r.ElemsIsBinaryLit = ft.elemsIsBinaryLit return json.Marshal(r) }