Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

charset: implement utf8_unicode_ci and utf8mb4_unicode_ci collation (#18776) #22558

Merged
merged 9 commits into from
Jan 28, 2021
29 changes: 23 additions & 6 deletions ddl/db_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import (
"github.com/pingcap/tidb/tablecodec"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/admin"
"github.com/pingcap/tidb/util/collate"
"github.com/pingcap/tidb/util/domainutil"
"github.com/pingcap/tidb/util/israce"
"github.com/pingcap/tidb/util/mock"
Expand Down Expand Up @@ -2205,19 +2206,35 @@ func (s *testDBSuite1) TestCreateTable(c *C) {
s.tk.MustExec("use test")
failSQL := "create table t_enum (a enum('e','e'));"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a enum('e','E'));"
}

func (s *testSerialDBSuite) TestCreateTableWithCollation(c *C) {
collate.SetNewCollationEnabledForTest(true)
AilinKid marked this conversation as resolved.
Show resolved Hide resolved
defer collate.SetNewCollationEnabledForTest(false)
s.tk.MustExec("use test")
failSQL := "create table t_enum (a enum('e','E')) charset=utf8 collate=utf8_general_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a enum('abc','Abc')) charset=utf8 collate=utf8_general_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a enum('abc','Abc'));"
failSQL = "create table t_enum (a enum('e','E')) charset=utf8 collate=utf8_unicode_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a enum('ss','ß')) charset=utf8 collate=utf8_unicode_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
// test for set column
failSQL = "create table t_enum (a set('e','e'));"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a set('e','E'));"
failSQL = "create table t_enum (a set('e','E')) charset=utf8 collate=utf8_general_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a set('abc','Abc')) charset=utf8 collate=utf8_general_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
_, err := s.tk.Exec("create table t_enum (a enum('B','b')) charset=utf8 collate=utf8_general_ci;")
c.Assert(err.Error(), Equals, "[types:1291]Column 'a' has duplicated value 'b' in ENUM")
failSQL = "create table t_enum (a set('e','E')) charset=utf8 collate=utf8_unicode_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a set('abc','Abc'));"
failSQL = "create table t_enum (a set('ss','ß')) charset=utf8 collate=utf8_unicode_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
_, err = s.tk.Exec("create table t_enum (a enum('B','b'));")
c.Assert(err.Error(), Equals, "[types:1291]Column 'a' has duplicated value 'B' in ENUM")
_, err = s.tk.Exec("create table t_enum (a enum('ss','ß')) charset=utf8 collate=utf8_unicode_ci;")
c.Assert(err.Error(), Equals, "[types:1291]Column 'a' has duplicated value 'ß' in ENUM")
}

func (s *testDBSuite5) TestRepairTable(c *C) {
Expand Down
11 changes: 11 additions & 0 deletions executor/collation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,17 @@ func (s *testCollationSuite) TestVecGroupChecker(c *C) {
}
c.Assert(groupChecker.isExhausted(), IsTrue)

tp.Collate = "utf8_unicode_ci"
groupChecker.reset()
_, err = groupChecker.splitIntoGroups(chk)
c.Assert(err, IsNil)
for i := 0; i < 3; i++ {
b, e := groupChecker.getNextGroup()
c.Assert(b, Equals, i*2)
c.Assert(e, Equals, i*2+2)
}
c.Assert(groupChecker.isExhausted(), IsTrue)

// test padding
tp.Collate = "utf8_bin"
tp.Flen = 6
Expand Down
63 changes: 43 additions & 20 deletions expression/builtin_like_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,26 +59,37 @@ func (s *testEvaluatorSerialSuites) TestCILike(c *C) {
collate.SetNewCollationEnabledForTest(true)
defer collate.SetNewCollationEnabledForTest(false)
tests := []struct {
input string
pattern string
match int
input string
pattern string
generalMatch int
unicodeMatch int
}{
{"a", "", 0},
{"a", "a", 1},
{"a", "á", 1},
{"a", "b", 0},
{"aA", "Aa", 1},
{"áAb", `Aa%`, 1},
{"áAb", `%ab%`, 1},
{"áAb", `%ab`, 1},
{"ÀAb", "aA_", 1},
{"áééá", "a_%a", 1},
{"áééá", "a%_a", 1},
{"áéá", "a_%a", 1},
{"áéá", "a%_a", 1},
{"áá", "a_%a", 0},
{"áá", "a%_a", 0},
{"áééáííí", "a_%a%", 1},
{"a", "", 0, 0},
{"a", "a", 1, 1},
{"a", "á", 1, 1},
{"a", "b", 0, 0},
{"aA", "Aa", 1, 1},
{"áAb", `Aa%`, 1, 1},
{"áAb", `%ab%`, 1, 1},
{"áAb", `%ab`, 1, 1},
{"ÀAb", "aA_", 1, 1},
{"áééá", "a_%a", 1, 1},
{"áééá", "a%_a", 1, 1},
{"áéá", "a_%a", 1, 1},
{"áéá", "a%_a", 1, 1},
{"áá", "a_%a", 0, 0},
{"áá", "a%_a", 0, 0},
{"áééáííí", "a_%a%", 1, 1},

// performs matching on a per-character basis
// https://dev.mysql.com/doc/refman/5.7/en/string-comparison-functions.html#operator_like
{"ß", "s%", 1, 0},
{"ß", "%s", 1, 0},
{"ß", "ss", 0, 0},
{"ß", "s", 1, 0},
{"ss", "%ß%", 1, 0},
{"ß", "_", 1, 1},
{"ß", "__", 0, 0},
}
for _, tt := range tests {
commentf := Commentf(`for input = "%s", pattern = "%s"`, tt.input, tt.pattern)
Expand All @@ -88,8 +99,20 @@ func (s *testEvaluatorSerialSuites) TestCILike(c *C) {
c.Assert(err, IsNil, commentf)
f.setCollator(collate.GetCollator("utf8mb4_general_ci"))
r, err := evalBuiltinFunc(f, chunk.Row{})
c.Assert(err, IsNil)
c.Assert(r, testutil.DatumEquals, types.NewDatum(tt.generalMatch))
}

for _, tt := range tests {
commentf := Commentf(`for input = "%s", pattern = "%s"`, tt.input, tt.pattern)
fc := funcs[ast.Like]
inputs := s.datumsToConstants(types.MakeDatums(tt.input, tt.pattern, 0))
f, err := fc.getFunction(s.ctx, inputs)
c.Assert(err, IsNil, commentf)
f.setCollator(collate.GetCollator("utf8mb4_unicode_ci"))
r, err := evalBuiltinFunc(f, chunk.Row{})
c.Assert(err, IsNil, commentf)
c.Assert(r, testutil.DatumEquals, types.NewDatum(tt.match), commentf)
c.Assert(r, testutil.DatumEquals, types.NewDatum(tt.unicodeMatch), commentf)
}
}

Expand Down
72 changes: 48 additions & 24 deletions expression/builtin_string_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2375,13 +2375,40 @@ func (s *testEvaluatorSerialSuites) TestCIWeightString(c *C) {
collate.SetNewCollationEnabledForTest(true)
defer collate.SetNewCollationEnabledForTest(false)

fc := funcs[ast.WeightString]
tests := []struct {
type weightStringTest struct {
str string
padding string
length int
expect interface{}
}{
}

checkResult := func(collation string, tests []weightStringTest) {
fc := funcs[ast.WeightString]
for _, test := range tests {
str := types.NewCollationStringDatum(test.str, collation, utf8.RuneCountInString(test.str))
var f builtinFunc
var err error
if test.padding == "NONE" {
f, err = fc.getFunction(s.ctx, s.datumsToConstants([]types.Datum{str}))
} else {
padding := types.NewDatum(test.padding)
length := types.NewDatum(test.length)
f, err = fc.getFunction(s.ctx, s.datumsToConstants([]types.Datum{str, padding, length}))
}
c.Assert(err, IsNil)
result, err := evalBuiltinFunc(f, chunk.Row{})
c.Assert(err, IsNil)
if result.IsNull() {
c.Assert(test.expect, IsNil)
continue
}
res, err := result.ToString()
c.Assert(err, IsNil)
c.Assert(res, Equals, test.expect)
}
}

generalTests := []weightStringTest{
{"aAÁàãăâ", "NONE", 0, "\x00A\x00A\x00A\x00A\x00A\x00A\x00A"},
{"中", "NONE", 0, "\x4E\x2D"},
{"a", "CHAR", 5, "\x00A"},
Expand All @@ -2398,26 +2425,23 @@ func (s *testEvaluatorSerialSuites) TestCIWeightString(c *C) {
{"中", "BINARY", 5, "中\x00\x00"},
}

for _, test := range tests {
str := types.NewCollationStringDatum(test.str, "utf8mb4_general_ci", utf8.RuneCountInString(test.str))
var f builtinFunc
var err error
if test.padding == "NONE" {
f, err = fc.getFunction(s.ctx, s.datumsToConstants([]types.Datum{str}))
} else {
padding := types.NewDatum(test.padding)
length := types.NewDatum(test.length)
f, err = fc.getFunction(s.ctx, s.datumsToConstants([]types.Datum{str, padding, length}))
}
c.Assert(err, IsNil)
result, err := evalBuiltinFunc(f, chunk.Row{})
c.Assert(err, IsNil)
if result.IsNull() {
c.Assert(test.expect, IsNil)
continue
}
res, err := result.ToString()
c.Assert(err, IsNil)
c.Assert(res, Equals, test.expect)
unicodeTests := []weightStringTest{
{"aAÁàãăâ", "NONE", 0, "\x0e3\x0e3\x0e3\x0e3\x0e3\x0e3\x0e3"},
{"中", "NONE", 0, "\xfb\x40\xce\x2d"},
{"a", "CHAR", 5, "\x0e3"},
{"a ", "CHAR", 5, "\x0e3"},
{"中", "CHAR", 5, "\xfb\x40\xce\x2d"},
{"中 ", "CHAR", 5, "\xfb\x40\xce\x2d"},
{"a", "BINARY", 1, "a"},
{"ab", "BINARY", 1, "a"},
{"a", "BINARY", 5, "a\x00\x00\x00\x00"},
{"a ", "BINARY", 5, "a \x00\x00\x00"},
{"中", "BINARY", 1, "\xe4"},
{"中", "BINARY", 2, "\xe4\xb8"},
{"中", "BINARY", 3, "中"},
{"中", "BINARY", 5, "中\x00\x00"},
}

checkResult("utf8mb4_general_ci", generalTests)
checkResult("utf8mb4_unicode_ci", unicodeTests)
}
10 changes: 10 additions & 0 deletions expression/collation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ func (s *testCollationSuites) TestCompareString(c *C) {
c.Assert(types.CompareString("À", "A", "utf8_general_ci"), Equals, 0)
c.Assert(types.CompareString("😜", "😃", "utf8_general_ci"), Equals, 0)
c.Assert(types.CompareString("a ", "a ", "utf8_general_ci"), Equals, 0)
c.Assert(types.CompareString("ß", "s", "utf8_general_ci"), Equals, 0)
c.Assert(types.CompareString("ß", "ss", "utf8_general_ci"), Not(Equals), 0)

c.Assert(types.CompareString("a", "A", "utf8_unicode_ci"), Equals, 0)
c.Assert(types.CompareString("À", "A", "utf8_unicode_ci"), Equals, 0)
c.Assert(types.CompareString("😜", "😃", "utf8_unicode_ci"), Equals, 0)
c.Assert(types.CompareString("a ", "a ", "utf8_unicode_ci"), Equals, 0)
c.Assert(types.CompareString("ß", "s", "utf8_unicode_ci"), Not(Equals), 0)
c.Assert(types.CompareString("ß", "ss", "utf8_unicode_ci"), Equals, 0)

c.Assert(types.CompareString("a", "A", "binary"), Not(Equals), 0)
c.Assert(types.CompareString("À", "A", "binary"), Not(Equals), 0)
c.Assert(types.CompareString("😜", "😃", "binary"), Not(Equals), 0)
Expand Down
29 changes: 27 additions & 2 deletions expression/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6136,6 +6136,7 @@ func (s *testIntegrationSerialSuite) TestWeightString(c *C) {
// test explicit collation
c.Assert(tk.MustQuery("select weight_string('中 ' collate utf8mb4_general_ci);").Rows()[0][0], Equals, "\x4E\x2D")
c.Assert(tk.MustQuery("select weight_string('中 ' collate utf8mb4_bin);").Rows()[0][0], Equals, "中")
c.Assert(tk.MustQuery("select weight_string('中 ' collate utf8mb4_unicode_ci);").Rows()[0][0], Equals, "\xFB\x40\xCE\x2D")
c.Assert(tk.MustQuery("select collation(a collate utf8mb4_general_ci) from t order by id").Rows()[0][0], Equals, "utf8mb4_general_ci")
c.Assert(tk.MustQuery("select collation('中 ' collate utf8mb4_general_ci);").Rows()[0][0], Equals, "utf8mb4_general_ci")
rows = tk.MustQuery("select weight_string(a collate utf8mb4_bin) from t order by id").Rows()
Expand All @@ -6159,8 +6160,23 @@ func (s *testIntegrationSerialSuite) TestCollationCreateIndex(c *C) {
tk.MustExec("insert into t values ('B');")
tk.MustExec("insert into t values ('a');")
tk.MustExec("insert into t values ('A');")
tk.MustExec("insert into t values ('ß');")
tk.MustExec("insert into t values ('sa');")
tk.MustExec("create index idx on t(a);")
tk.MustQuery("select * from t order by a").Check(testkit.Rows("a", "A", "a", "A", "b", "B"))
tk.MustQuery("select * from t order by a").Check(testkit.Rows("a", "A", "a", "A", "b", "B", "ß", "sa"))

tk.MustExec("drop table if exists t")
tk.MustExec("create table t (a varchar(10) collate utf8mb4_unicode_ci);")
tk.MustExec("insert into t values ('a');")
tk.MustExec("insert into t values ('A');")
tk.MustExec("insert into t values ('b');")
tk.MustExec("insert into t values ('B');")
tk.MustExec("insert into t values ('a');")
tk.MustExec("insert into t values ('A');")
tk.MustExec("insert into t values ('ß');")
tk.MustExec("insert into t values ('sa');")
tk.MustExec("create index idx on t(a);")
tk.MustQuery("select * from t order by a").Check(testkit.Rows("a", "A", "a", "A", "b", "B", "sa", "ß"))
}

func (s *testIntegrationSerialSuite) TestCollateConstantPropagation(c *C) {
Expand Down Expand Up @@ -6328,6 +6344,8 @@ func (s *testIntegrationSerialSuite) TestCollateSort(c *C) {
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustQuery("select * from t order by a collate utf8mb4_bin").Check(testkit.Rows("A", "A", "A", "a", "a", "a", "b", "b", "b"))
tk.MustQuery("select * from t order by a collate utf8mb4_general_ci").Check(testkit.Rows("a", "A", "a", "A", "a", "A", "b", "b", "b"))
tk.MustQuery("select * from t order by a collate utf8mb4_unicode_ci").Check(testkit.Rows("a", "A", "a", "A", "a", "A", "b", "b", "b"))
}

func (s *testIntegrationSerialSuite) TestCollateHashAgg(c *C) {
Expand All @@ -6348,7 +6366,10 @@ func (s *testIntegrationSerialSuite) TestCollateHashAgg(c *C) {
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustQuery("select count(1) from t group by a collate utf8mb4_bin").Check(testkit.Rows("3", "3", "3"))
tk.MustExec("insert into t values ('s'), ('ss'), ('ß')")
tk.MustQuery("select count(1) from t group by a collate utf8mb4_bin order by a collate utf8mb4_bin").Check(testkit.Rows("3", "3", "3", "1", "1", "1"))
tk.MustQuery("select count(1) from t group by a collate utf8mb4_unicode_ci order by a collate utf8mb4_unicode_ci").Check(testkit.Rows("6", "3", "1", "2"))
tk.MustQuery("select count(1) from t group by a collate utf8mb4_general_ci order by a collate utf8mb4_general_ci").Check(testkit.Rows("6", "3", "2", "1"))
}

func (s *testIntegrationSerialSuite) TestCollateStreamAgg(c *C) {
Expand Down Expand Up @@ -6414,6 +6435,8 @@ func (s *testIntegrationSerialSuite) TestCollateStringFunction(c *C) {
tk.MustQuery("select field('a', 'b', 'A');").Check(testkit.Rows("0"))
tk.MustQuery("select field('a', 'b', 'A' collate utf8mb4_bin);").Check(testkit.Rows("0"))
tk.MustQuery("select field('a', 'b', 'a ' collate utf8mb4_bin);").Check(testkit.Rows("2"))
tk.MustQuery("select field('a', 'b', 'A' collate utf8mb4_unicode_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select field('a', 'b', 'a ' collate utf8mb4_unicode_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select field('a', 'b', 'A' collate utf8mb4_general_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select field('a', 'b', 'a ' collate utf8mb4_general_ci);").Check(testkit.Rows("2"))

Expand All @@ -6429,6 +6452,8 @@ func (s *testIntegrationSerialSuite) TestCollateStringFunction(c *C) {
tk.MustQuery("select FIND_IN_SET('a','b,a ,c,d' collate utf8mb4_bin);").Check(testkit.Rows("2"))
tk.MustQuery("select FIND_IN_SET('a','b,A,c,d' collate utf8mb4_general_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select FIND_IN_SET('a','b,a ,c,d' collate utf8mb4_general_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select FIND_IN_SET('a','b,A,c,d' collate utf8mb4_unicode_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select FIND_IN_SET('a','b,a ,c,d' collate utf8mb4_unicode_ci);").Check(testkit.Rows("2"))

tk.MustExec("select concat('a' collate utf8mb4_bin, 'b' collate utf8mb4_bin);")
tk.MustGetErrMsg("select concat('a' collate utf8mb4_bin, 'b' collate utf8mb4_general_ci);", "[expression:1267]Illegal mix of collations (utf8mb4_bin,EXPLICIT) and (utf8mb4_general_ci,EXPLICIT) for operation 'concat'")
Expand Down
15 changes: 15 additions & 0 deletions types/enum_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,21 @@ func (s *testEnumSuite) TestEnum(c *C) {
c.Assert(e.String(), Equals, t.Elems[t.Expected-1])
c.Assert(e.ToNumber(), Equals, float64(t.Expected))
}

for _, t := range tbl {
e, err := ParseEnumName(t.Elems, t.Name, "utf8_unicode_ci")
if t.Expected == 0 {
c.Assert(err, NotNil)
c.Assert(e.ToNumber(), Equals, float64(0))
c.Assert(e.String(), Equals, "")
continue
}

c.Assert(err, IsNil)
c.Assert(e.String(), Equals, t.Elems[t.Expected-1])
c.Assert(e.ToNumber(), Equals, float64(t.Expected))
}

for _, t := range citbl {
e, err := ParseEnumName(t.Elems, t.Name, "utf8_general_ci")
if t.Expected == 0 {
Expand Down
8 changes: 8 additions & 0 deletions types/set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ func (s *testSetSuite) TestSet(c *C) {
c.Assert(e.ToNumber(), Equals, float64(t.ExpectedValue))
c.Assert(e.String(), Equals, t.ExpectedName)
}

for _, t := range tbl {
e, err := ParseSetName(elems, t.Name, "utf8_unicode_ci")
c.Assert(err, IsNil)
c.Assert(e.ToNumber(), Equals, float64(t.ExpectedValue))
c.Assert(e.String(), Equals, t.ExpectedName)
}

for _, t := range citbl {
e, err := ParseSetName(elems, t.Name, "utf8_general_ci")
c.Assert(err, IsNil)
Expand Down
Loading