Skip to content
This repository has been archived by the owner on Dec 8, 2021. It is now read-only.

Commit

Permalink
mydump: added character-set config to control schema decoding behavior
Browse files Browse the repository at this point in the history
  • Loading branch information
kennytm committed Nov 15, 2018
1 parent ba9350e commit 3ca4246
Show file tree
Hide file tree
Showing 28 changed files with 473 additions and 30 deletions.
6 changes: 5 additions & 1 deletion lightning/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ import (
"runtime"

"github.com/BurntSushi/toml"
"github.com/pkg/errors"
"github.com/pingcap/tidb-lightning/lightning/common"
"github.com/pkg/errors"
)

const (
Expand Down Expand Up @@ -79,6 +79,7 @@ type MydumperRuntime struct {
MinRegionSize int64 `toml:"region-min-size" json:"region-min-size"`
SourceDir string `toml:"data-source-dir" json:"data-source-dir"`
NoSchema bool `toml:"no-schema" json:"no-schema"`
CharacterSet string `toml:"character-set" json:"character-set"`
}

type TikvImporter struct {
Expand Down Expand Up @@ -153,6 +154,9 @@ func (cfg *Config) Load() error {
if cfg.Mydumper.ReadBlockSize <= 0 {
cfg.Mydumper.ReadBlockSize = ReadBlockSize
}
if len(cfg.Mydumper.CharacterSet) == 0 {
cfg.Mydumper.CharacterSet = "auto"
}

// hendle kv import
if cfg.TikvImporter.BatchSize <= 0 {
Expand Down
16 changes: 12 additions & 4 deletions lightning/mydump/loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ type MDDatabaseMeta struct {
Name string
SchemaFile string
Tables map[string]*MDTableMeta
charSet string
}

func (m *MDDatabaseMeta) String() string {
Expand All @@ -31,7 +32,7 @@ func (m *MDDatabaseMeta) String() string {
}

func (m *MDDatabaseMeta) GetSchema() string {
schema, err := ExportStatement(m.SchemaFile)
schema, err := ExportStatement(m.SchemaFile, m.charSet)
if err != nil {
common.AppLogger.Errorf("failed to extract database schema (%s) : %s", m.SchemaFile, err.Error())
return ""
Expand All @@ -44,10 +45,11 @@ type MDTableMeta struct {
Name string
SchemaFile string
DataFiles []string
charSet string
}

func (m *MDTableMeta) GetSchema() string {
schema, err := ExportStatement(m.SchemaFile)
schema, err := ExportStatement(m.SchemaFile, m.charSet)
if err != nil {
common.AppLogger.Errorf("failed to extract table schema (%s) : %s", m.SchemaFile, err.Error())
return ""
Expand All @@ -62,13 +64,15 @@ type MDLoader struct {
dir string
noSchema bool
dbs map[string]*MDDatabaseMeta
charSet string
}

func NewMyDumpLoader(cfg *config.Config) (*MDLoader, error) {
mdl := &MDLoader{
dir: cfg.Mydumper.SourceDir,
noSchema: cfg.Mydumper.NoSchema,
dbs: make(map[string]*MDDatabaseMeta),
charSet: cfg.Mydumper.CharacterSet,
}

if err := mdl.setup(mdl.dir); err != nil {
Expand Down Expand Up @@ -122,6 +126,7 @@ func (l *MDLoader) setupDBs(files map[string]string) error {
Name: dbname,
SchemaFile: fpath,
Tables: make(map[string]*MDTableMeta),
charSet: l.charSet,
}
}

Expand Down Expand Up @@ -159,6 +164,7 @@ func (l *MDLoader) setupTables(files map[string]string) error {
Name: table,
SchemaFile: fpath,
DataFiles: make([]string, 0, 16),
charSet: l.charSet,
}
}
}
Expand Down Expand Up @@ -199,8 +205,9 @@ func (l *MDLoader) setupTablesData(files map[string]string) error {
return errors.Errorf("invalid data sql file, miss host db - %s", fpath)
}
dbMeta = &MDDatabaseMeta{
Name: db,
Tables: make(map[string]*MDTableMeta),
Name: db,
Tables: make(map[string]*MDTableMeta),
charSet: l.charSet,
}
l.dbs[db] = dbMeta
}
Expand All @@ -213,6 +220,7 @@ func (l *MDLoader) setupTablesData(files map[string]string) error {
DB: db,
Name: table,
DataFiles: make([]string, 0, 16),
charSet: l.charSet,
}
dbMeta.Tables[table] = tableMeta
}
Expand Down
58 changes: 37 additions & 21 deletions lightning/mydump/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ import (
"strings"
"unicode/utf8"

"github.com/pkg/errors"
"github.com/pingcap/tidb-lightning/lightning/common"
"github.com/pkg/errors"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/simplifiedchinese"
)
Expand All @@ -21,17 +21,45 @@ var (

var (
ErrInsertStatementNotFound = errors.New("insert statement not found")
errInvalidSchemaEncoding = errors.New("invalid schema encoding")
)

var (
// if there are too many encodings involved, consider switching strategy to
// perform `chardet` first.
supportedSchemaEncodings = []encoding.Encoding{
simplifiedchinese.GB18030,
}
)

func ExportStatement(sqlFile string) ([]byte, error) {
func decodeCharacterSet(data []byte, characterSet string) ([]byte, error) {
switch characterSet {
case "binary":
default:
if utf8.Valid(data) {
break
}
if characterSet == "utf8mb4" {
return nil, errInvalidSchemaEncoding
}
// try gb18030 next if the encoding is "auto"
// if we support too many encodings, consider switching strategy to
// perform `chardet` first.
fallthrough
case "gb18030":
decoded, err := simplifiedchinese.GB18030.NewDecoder().Bytes(data)
if err != nil {
return nil, errors.Trace(err)
}
// check for U+FFFD to see if decoding contains errors.
// https://groups.google.com/d/msg/golang-nuts/pENT3i4zJYk/v2X3yyiICwAJ
if bytes.ContainsRune(decoded, '\ufffd') {
return nil, errInvalidSchemaEncoding
}
data = decoded
}
return data, nil
}

func ExportStatement(sqlFile string, characterSet string) ([]byte, error) {
fd, err := os.Open(sqlFile)
if err != nil {
return nil, errors.Trace(err)
Expand Down Expand Up @@ -72,24 +100,12 @@ func ExportStatement(sqlFile string) ([]byte, error) {
}
}

if utf8.Valid(data) {
return data, nil
}

for _, encoding := range supportedSchemaEncodings {
decoded, err := encoding.NewDecoder().Bytes(data)
if err != nil {
return nil, errors.Trace(err)
}
// check for U+FFFD to see if decoding contains errors.
// https://groups.google.com/d/msg/golang-nuts/pENT3i4zJYk/v2X3yyiICwAJ
if !bytes.ContainsRune(decoded, '\ufffd') {
return decoded, nil
}
data, err = decodeCharacterSet(data, characterSet)
if err != nil {
common.AppLogger.Errorf("cannot guess encoding for input file, please convert to UTF-8 manually: %s", sqlFile)
return nil, errors.Annotatef(err, "failed to decode %s", sqlFile)
}

common.AppLogger.Errorf("cannot guess encoding for input file, please convert to UTF-8 manually: %s", sqlFile)
return nil, errors.New("invalid schema encoding")
return data, nil
}

type MDDataReader struct {
Expand Down
6 changes: 3 additions & 3 deletions lightning/mydump/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ func (s *testMydumpReaderSuite) TestExportStatementNoTrailingNewLine(c *C) {
err = file.Close()
c.Assert(err, IsNil)

data, err := ExportStatement(file.Name())
data, err := ExportStatement(file.Name(), "auto")
c.Assert(err, IsNil)
c.Assert(data, DeepEquals, []byte("CREATE DATABASE whatever;"))
}
Expand All @@ -47,7 +47,7 @@ func (s *testMydumpReaderSuite) TestExportStatementGBK(c *C) {
err = file.Close()
c.Assert(err, IsNil)

data, err := ExportStatement(file.Name())
data, err := ExportStatement(file.Name(), "auto")
c.Assert(err, IsNil)
c.Assert(data, DeepEquals, []byte("CREATE TABLE a (b int(11) COMMENT '总案例');"))
}
Expand All @@ -62,7 +62,7 @@ func (s *testMydumpReaderSuite) TestExportStatementGibberishError(c *C) {
err = file.Close()
c.Assert(err, IsNil)

data, err := ExportStatement(file.Name())
data, err := ExportStatement(file.Name(), "auto")
c.Assert(data, IsNil)
c.Assert(err, NotNil)
}
2 changes: 1 addition & 1 deletion lightning/sql/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ func (s *testParserSuite) testParseRealFile(c *C) {
c.Assert(err, IsNil)
dbMeta := loader.GetDatabases()["mocker_test"]
for _, tblMeta := range dbMeta.Tables {
sqlCreteTable, _ := ExportStatement(tblMeta.SchemaFile)
sqlCreteTable, _ := ExportStatement(tblMeta.SchemaFile, "auto")
store.init(string(sqlCreteTable))

// read from file
Expand Down
28 changes: 28 additions & 0 deletions tests/character_sets/gb18030-auto.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[lightning]
table-concurrency = 1
check-requirements = false
file = "/tmp/lightning_test_result/lightning.log"
level = "error"

[checkpoint]
enable = false

[tikv-importer]
addr = "127.0.0.1:8808"

[mydumper]
data-source-dir = "tests/character_sets/gb18030"
character-set = "auto"

[tidb]
host = "127.0.0.1"
port = 4000
user = "root"
status-port = 10080
pd-addr = "127.0.0.1:2379"
log-level = "error"

[post-restore]
checksum = true
compact = false
analyze = false
28 changes: 28 additions & 0 deletions tests/character_sets/gb18030-binary.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[lightning]
table-concurrency = 1
check-requirements = false
file = "/tmp/lightning_test_result/lightning.log"
level = "error"

[checkpoint]
enable = false

[tikv-importer]
addr = "127.0.0.1:8808"

[mydumper]
data-source-dir = "tests/character_sets/gb18030"
character-set = "binary"

[tidb]
host = "127.0.0.1"
port = 4000
user = "root"
status-port = 10080
pd-addr = "127.0.0.1:2379"
log-level = "error"

[post-restore]
checksum = true
compact = false
analyze = false
28 changes: 28 additions & 0 deletions tests/character_sets/gb18030-gb18030.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[lightning]
table-concurrency = 1
check-requirements = false
file = "/tmp/lightning_test_result/lightning.log"
level = "error"

[checkpoint]
enable = false

[tikv-importer]
addr = "127.0.0.1:8808"

[mydumper]
data-source-dir = "tests/character_sets/gb18030"
character-set = "gb18030"

[tidb]
host = "127.0.0.1"
port = 4000
user = "root"
status-port = 10080
pd-addr = "127.0.0.1:2379"
log-level = "error"

[post-restore]
checksum = true
compact = false
analyze = false
28 changes: 28 additions & 0 deletions tests/character_sets/gb18030-utf8mb4.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[lightning]
table-concurrency = 1
check-requirements = false
file = "/tmp/lightning_test_result/lightning.log"
level = "error"

[checkpoint]
enable = false

[tikv-importer]
addr = "127.0.0.1:8808"

[mydumper]
data-source-dir = "tests/character_sets/gb18030"
character-set = "utf8mb4"

[tidb]
host = "127.0.0.1"
port = 4000
user = "root"
status-port = 10080
pd-addr = "127.0.0.1:2379"
log-level = "error"

[post-restore]
checksum = true
compact = false
analyze = false
1 change: 1 addition & 0 deletions tests/character_sets/gb18030/charsets-schema-create.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
create database charsets;
1 change: 1 addition & 0 deletions tests/character_sets/gb18030/charsets.gb18030-schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
create table gb18030 (`Ö÷¼ü` int primary key comment '×¢ÊÍ');
1 change: 1 addition & 0 deletions tests/character_sets/gb18030/charsets.gb18030.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
insert into gb18030 values (35), (58), (4), (24), (14), (27), (39), (5), (0), (61);
28 changes: 28 additions & 0 deletions tests/character_sets/mixed-auto.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[lightning]
table-concurrency = 1
check-requirements = false
file = "/tmp/lightning_test_result/lightning.log"
level = "error"

[checkpoint]
enable = false

[tikv-importer]
addr = "127.0.0.1:8808"

[mydumper]
data-source-dir = "tests/character_sets/mixed"
character-set = "auto"

[tidb]
host = "127.0.0.1"
port = 4000
user = "root"
status-port = 10080
pd-addr = "127.0.0.1:2379"
log-level = "error"

[post-restore]
checksum = true
compact = false
analyze = false
Loading

0 comments on commit 3ca4246

Please sign in to comment.