From d66d914b5730191a988f9195a522819030c399c9 Mon Sep 17 00:00:00 2001 From: Justin Jaffray Date: Wed, 27 Sep 2017 14:36:09 +0000 Subject: [PATCH] sql: modify BYTES->STRING casts to match postgres Fixes #17969. Also introduce the hex form of ENCODE/DECODE to replace this functionality. Detailed explanation of why this was necessary: https://github.com/cockroachdb/cockroach/issues/17969#issuecomment-331317893 This is a backwards-incompatible (but important) change and I think it should be cherry-picked to 1.1. --- docs/generated/sql/functions.md | 4 ++ pkg/cli/dump_test.go | 6 +-- pkg/sql/logictest/testdata/logic_test/array | 2 +- .../testdata/logic_test/builtin_function | 21 ++++++---- pkg/sql/logictest/testdata/logic_test/bytes | 41 +++++++++++++++++++ pkg/sql/parser/builtins.go | 36 ++++++++++++++++ pkg/sql/parser/constant.go | 2 +- pkg/sql/parser/datum.go | 14 +++++-- pkg/sql/parser/datum_test.go | 4 +- pkg/sql/parser/encode.go | 35 +++++++++++++++- pkg/sql/parser/eval.go | 12 +++--- pkg/sql/parser/eval_test.go | 19 ++++----- pkg/sql/parser/format_test.go | 2 +- 13 files changed, 160 insertions(+), 38 deletions(-) create mode 100644 pkg/sql/logictest/testdata/logic_test/bytes diff --git a/docs/generated/sql/functions.md b/docs/generated/sql/functions.md index 0d1525c8ef32..0d6e362f07ec 100644 --- a/docs/generated/sql/functions.md +++ b/docs/generated/sql/functions.md @@ -473,6 +473,10 @@ Compatible elements: hour, minute, second, millisecond, microsecond.

concat_ws(string...) → string

Uses the first argument as a separator between the concatenation of the subsequent arguments.

For example concat_ws('!','wow','great') returns wow!great.

+decode(text: string, format: string) → string

Decodes data as the format specified by format (only “hex” is supported).

+
+encode(data: bytes, format: string) → string

Encodes data in the text format specified by format (only “hex” is supported).

+
from_ip(val: bytes) → string

Converts the byte string representation of an IP to its character string representation.

from_uuid(val: bytes) → string

Converts the byte string representation of a UUID to its character string representation.

diff --git a/pkg/cli/dump_test.go b/pkg/cli/dump_test.go index 03af427f9ff5..2c82598d9a25 100644 --- a/pkg/cli/dump_test.go +++ b/pkg/cli/dump_test.go @@ -72,7 +72,7 @@ func TestDumpRow(t *testing.T) { 1, 2.3, 'striiing', - b'\141\061\142\062\143\063', '2016-03-26', '2016-01-25 10:10:10' , + '\x613162326333', '2016-03-26', '2016-01-25 10:10:10' , '2h30m30s', true, 1.2345, @@ -124,7 +124,7 @@ CREATE TABLE t ( ); INSERT INTO t (i, f, s, b, d, t, n, o, e, u, ip, tz, e1, e2, s1) VALUES - (1, 2.3, 'striiing', b'a1b2c3', '2016-03-26', '2016-01-25 10:10:10+00:00', '2h30m30s', true, 1.2345, 'e9716c74-2638-443d-90ed-ffde7bea7d1d', '192.168.0.1', '2016-01-25 10:10:10+00:00', 3, 4.5, 's'), + (1, 2.3, 'striiing', '\x613162326333', '2016-03-26', '2016-01-25 10:10:10+00:00', '2h30m30s', true, 1.2345, 'e9716c74-2638-443d-90ed-ffde7bea7d1d', '192.168.0.1', '2016-01-25 10:10:10+00:00', 3, 4.5, 's'), (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), (NULL, '+Inf', NULL, NULL, NULL, NULL, NULL, NULL, 'Infinity', NULL, NULL, NULL, NULL, NULL, NULL), (NULL, '-Inf', NULL, NULL, NULL, NULL, NULL, NULL, '-Infinity', NULL, NULL, NULL, NULL, NULL, NULL), @@ -335,8 +335,6 @@ func init() { func TestDumpRandom(t *testing.T) { defer leaktest.AfterTest(t)() - t.Skip("https://github.com/cockroachdb/cockroach/issues/17969") - c := newCLITest(cliTestParams{t: t}) defer c.cleanup() diff --git a/pkg/sql/logictest/testdata/logic_test/array b/pkg/sql/logictest/testdata/logic_test/array index 6869e4f69031..91435c60a548 100644 --- a/pkg/sql/logictest/testdata/logic_test/array +++ b/pkg/sql/logictest/testdata/logic_test/array @@ -565,7 +565,7 @@ INSERT INTO a VALUES (ARRAY['foo','bar','baz']) query T SELECT b FROM a ---- -{b'foo',b'bar',b'baz'} +{'\x666f6f','\x626172','\x62617a'} statement ok DROP TABLE a diff --git a/pkg/sql/logictest/testdata/logic_test/builtin_function b/pkg/sql/logictest/testdata/logic_test/builtin_function index 95866872e4e9..f85d3360124c 100644 --- a/pkg/sql/logictest/testdata/logic_test/builtin_function +++ b/pkg/sql/logictest/testdata/logic_test/builtin_function @@ -114,11 +114,6 @@ SELECT SUBSTR('RoacH', -2, 4) ---- R -# See #6601 and #8210. We use three arguments to guarantee a very high -# probability that at least one of the UUIDs is invalid UTF8. -query error invalid UTF-8 -select cast(uuid_v4() as string), cast(uuid_v4() as string), cast(uuid_v4() as string) - query T SELECT SUBSTR('12345', 2, 77) ---- @@ -1519,6 +1514,20 @@ SELECT array_upper(ARRAY[ARRAY[1, 2]], 2) ---- 2 +query error only 'hex' format is supported for ENCODE +SELECT ENCODE('abc', 'escape') + +query error only 'hex' format is supported for DECODE +SELECT DECODE('abc', 'escape') + +query error invalid UTF-8 +SELECT ENCODE('\xa7', 'hex') + +query TT +SELECT ENCODE('\x616263', 'hex'), DECODE('abc', 'hex') +---- +abc 616263 + query T SELECT FROM_IP(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x01\x02\x03\x04') ---- @@ -1788,7 +1797,6 @@ SELECT i FROM (VALUES (sha512('3m'::interval::string)), (sha512('2016-11-12'::date::string)), (sha512('2015-08-24 23:45:45.53453'::timestamptz::string)), - (sha512(b'bar':::bytes::string)), (sha512(b'bar'::bytes)) ) AS a(i) ---- @@ -1802,7 +1810,6 @@ f7fbba6e0636f890e56fbbf3283e524c6fa3204ae298382d624741d0dc6638326e282c41be5e4254 b2d173023893f71caadf7cb2f9557355462570de2c9c971b9cfa5494936e28df8e13d0db4d550aab66d5e7a002f678ddb02def092c069ce473cf5fb293953986 960b0fed9378be1e9adefd91e1be6ac9c1de7208008dfec438ff845135727bebea0f7458a5181079f61288176e0168cfea501b900c3e495b3ab9bbe4d372486d d82c4eb5261cb9c8aa9855edd67d1bd10482f41529858d925094d173fa662aa91ff39bc5b188615273484021dfb16fd8284cf684ccf0fc795be3aa2fc1e6c181 -d82c4eb5261cb9c8aa9855edd67d1bd10482f41529858d925094d173fa662aa91ff39bc5b188615273484021dfb16fd8284cf684ccf0fc795be3aa2fc1e6c181 # We only support one encoding, UTF8, which is hardcoded to id 6 just like in # Postgres. diff --git a/pkg/sql/logictest/testdata/logic_test/bytes b/pkg/sql/logictest/testdata/logic_test/bytes new file mode 100644 index 000000000000..414ff9677d86 --- /dev/null +++ b/pkg/sql/logictest/testdata/logic_test/bytes @@ -0,0 +1,41 @@ +# LogicTest: default + +query T +SELECT 'non-escaped-string':::BYTES::STRING +---- +\x6e6f6e2d657363617065642d737472696e67 + +query T +SELECT '\Xabcd':::BYTES::STRING +---- +\xabcd + +query T +SELECT b'\x5c\x78':::BYTES +---- +\x + +query T +SELECT b'\x5c\x78':::BYTES::STRING +---- +\x5c78 + +query T +SELECT b'\x5c\x58':::BYTES::STRING +---- +\x5c58 + +query T +SELECT e'\x5c\x78'::STRING +---- +\x + +query T +SELECT '\X':::BYTES::STRING +---- +\x + +query T +SELECT '日本語':::STRING::BYTES::STRING +---- +\xe697a5e69cace8aa9e diff --git a/pkg/sql/parser/builtins.go b/pkg/sql/parser/builtins.go index af53c1cfe148..4297991947f5 100644 --- a/pkg/sql/parser/builtins.go +++ b/pkg/sql/parser/builtins.go @@ -672,6 +672,42 @@ var Builtins = map[string][]Builtin{ }, }, + // https://www.postgresql.org/docs/10/static/functions-binarystring.html + "encode": { + Builtin{ + Types: ArgTypes{{"data", TypeBytes}, {"format", TypeString}}, + ReturnType: fixedReturnType(TypeString), + fn: func(evalCtx *EvalContext, args Datums) (_ Datum, err error) { + data, format := string(*args[0].(*DBytes)), string(MustBeDString(args[1])) + if format != "hex" { + return nil, pgerror.NewError(pgerror.CodeInvalidParameterValueError, "only 'hex' format is supported for ENCODE") + } + if !utf8.ValidString(data) { + return nil, pgerror.NewError(pgerror.CodeCharacterNotInRepertoireError, "invalid UTF-8 sequence") + } + return NewDString(data), nil + }, + Info: "Encodes `data` in the text format specified by `format` (only \"hex\" is supported).", + }, + }, + + "decode": { + Builtin{ + Types: ArgTypes{{"text", TypeString}, {"format", TypeString}}, + ReturnType: fixedReturnType(TypeString), + fn: func(evalCtx *EvalContext, args Datums) (_ Datum, err error) { + data, format := string(MustBeDString(args[0])), string(MustBeDString(args[1])) + if format != "hex" { + return nil, pgerror.NewError(pgerror.CodeInvalidParameterValueError, "only 'hex' format is supported for DECODE") + } + var buf bytes.Buffer + hexEncodeString(&buf, data) + return NewDString(buf.String()), nil + }, + Info: "Decodes `data` as the format specified by `format` (only \"hex\" is supported).", + }, + }, + "ascii": {stringBuiltin1(func(_ *EvalContext, s string) (Datum, error) { for _, ch := range s { return NewDInt(DInt(ch)), nil diff --git a/pkg/sql/parser/constant.go b/pkg/sql/parser/constant.go index 2505c14aa750..eab18517e854 100644 --- a/pkg/sql/parser/constant.go +++ b/pkg/sql/parser/constant.go @@ -410,7 +410,7 @@ func (expr *StrVal) ResolveAsType(ctx *SemaContext, typ Type) (Datum, error) { expr.resString = DString(expr.s) return NewDNameFromDString(&expr.resString), nil case TypeBytes: - s, err := ParseDByte(expr.s) + s, err := ParseDByte(expr.s, !expr.bytesEsc) if err == nil { expr.resBytes = *s } diff --git a/pkg/sql/parser/datum.go b/pkg/sql/parser/datum.go index 692915ae89a6..7992d67cc6e7 100644 --- a/pkg/sql/parser/datum.go +++ b/pkg/sql/parser/datum.go @@ -206,8 +206,9 @@ func ParseDBool(s string) (*DBool, error) { } // ParseDByte parses a string representation of hex encoded binary data. -func ParseDByte(s string) (*DBytes, error) { - if len(s) > 2 && (s[0] == '\\' && (s[1] == 'x' || s[1] == 'X')) { +// allowBackslashXFormat determines if the `\x` format of inputting a hex string should be allowed. +func ParseDByte(s string, allowBackslashXFormat bool) (*DBytes, error) { + if allowBackslashXFormat && len(s) >= 2 && (s[0] == '\\' && (s[1] == 'x' || s[1] == 'X')) { hexstr, err := hex.DecodeString(s[2:]) if err != nil { return nil, makeParseError(s, TypeBytes, err) @@ -1011,11 +1012,16 @@ func (d *DBytes) max() (Datum, bool) { } // AmbiguousFormat implements the Datum interface. -func (*DBytes) AmbiguousFormat() bool { return false } +func (*DBytes) AmbiguousFormat() bool { return true } // Format implements the NodeFormatter interface. func (d *DBytes) Format(buf *bytes.Buffer, f FmtFlags) { - encodeSQLBytes(buf, string(*d)) + buf.WriteString("'\\x") + b := string(*d) + for i := 0; i < len(b); i++ { + buf.Write(rawHexMap[b[i]]) + } + buf.WriteByte('\'') } // Size implements the Datum interface. diff --git a/pkg/sql/parser/datum_test.go b/pkg/sql/parser/datum_test.go index 64cd20524eb4..ca11a4ad5257 100644 --- a/pkg/sql/parser/datum_test.go +++ b/pkg/sql/parser/datum_test.go @@ -82,8 +82,8 @@ func TestDatumOrdering(t *testing.T) { {`'':::string`, valIsMin, `e'\x00'`, `''`, noMax}, {`e'\x00'`, noPrev, `e'\x00\x00'`, `''`, noMax}, {`'abc':::string`, noPrev, `e'abc\x00'`, `''`, noMax}, - {`'':::bytes`, valIsMin, `b'\x00'`, `b''`, noMax}, - {`'abc':::bytes`, noPrev, `b'abc\x00'`, `b''`, noMax}, + {`'':::bytes`, valIsMin, `'\x00'`, `'\x'`, noMax}, + {`'abc':::bytes`, noPrev, `'\x61626300'`, `'\x'`, noMax}, // Dates {`'2006-01-02':::date`, `'2006-01-01'`, `'2006-01-03'`, noMin, noMax}, diff --git a/pkg/sql/parser/encode.go b/pkg/sql/parser/encode.go index f6a5d862d9f8..9fc76eef5b10 100644 --- a/pkg/sql/parser/encode.go +++ b/pkg/sql/parser/encode.go @@ -41,7 +41,10 @@ var ( '{': true, '}': true, } + // hexMap is a mapping from each byte to the `\x%%` hex form as a []byte. hexMap [256][]byte + // rawHexMap is a mapping from each byte to the `%%` hex form as a []byte. + rawHexMap [256][]byte ) // encodeSQLString writes a string literal to buf. All unicode and @@ -59,6 +62,12 @@ func EscapeSQLString(in string) string { return buf.String() } +func hexEncodeString(buf *bytes.Buffer, in string) { + for i := 0; i < len(in); i++ { + buf.Write(rawHexMap[in[i]]) + } +} + // encodeEscapedChar is used internally to write out a character from a larger // string that needs to be escaped to a buffer. func encodeEscapedChar( @@ -239,7 +248,29 @@ func init() { } } - for i := range hexMap { - hexMap[i] = []byte(fmt.Sprintf("\\x%02x", i)) + // underlyingHexMap contains the string "\x00\x01\x02..." which hexMap and + // rawHexMap then index into. + var underlyingHexMap bytes.Buffer + underlyingHexMap.Grow(1024) + + for i := 0; i < 256; i++ { + underlyingHexMap.WriteString("\\x") + writeHexDigit(&underlyingHexMap, i/16) + writeHexDigit(&underlyingHexMap, i%16) + } + + underlyingHexBytes := underlyingHexMap.Bytes() + + for i := 0; i < 256; i++ { + hexMap[i] = underlyingHexBytes[i*4 : i*4+4] + rawHexMap[i] = underlyingHexBytes[i*4+2 : i*4+4] + } +} + +func writeHexDigit(buf *bytes.Buffer, v int) { + if v < 10 { + buf.WriteByte('0' + byte(v)) + } else { + buf.WriteByte('a' + byte(v-10)) } } diff --git a/pkg/sql/parser/eval.go b/pkg/sql/parser/eval.go index 8782a24ec10e..2099a8e04fa3 100644 --- a/pkg/sql/parser/eval.go +++ b/pkg/sql/parser/eval.go @@ -15,6 +15,7 @@ package parser import ( + "bytes" "fmt" "math" "math/big" @@ -2440,11 +2441,10 @@ func performCast(ctx *EvalContext, d Datum, t CastTargetType) (Datum, error) { case *DCollatedString: s = t.Contents case *DBytes: - if !utf8.ValidString(string(*t)) { - return nil, pgerror.NewErrorf( - pgerror.CodeCharacterNotInRepertoireError, "invalid UTF-8: %q", string(*t)) - } - s = string(*t) + var buf bytes.Buffer + buf.WriteString("\\x") + hexEncodeString(&buf, string(*t)) + s = buf.String() case *DOid: s = t.name } @@ -2470,7 +2470,7 @@ func performCast(ctx *EvalContext, d Datum, t CastTargetType) (Datum, error) { case *BytesColType: switch t := d.(type) { case *DString: - return ParseDByte(string(*t)) + return ParseDByte(string(*t), true) case *DCollatedString: return NewDBytes(DBytes(t.Contents)), nil case *DUuid: diff --git a/pkg/sql/parser/eval_test.go b/pkg/sql/parser/eval_test.go index 9b1b421741a5..d299039b0005 100644 --- a/pkg/sql/parser/eval_test.go +++ b/pkg/sql/parser/eval_test.go @@ -100,17 +100,17 @@ func TestEval(t *testing.T) { {`0xa`, `10`}, {`0xcafe1111`, `3405648145`}, // Hexadecimal bytes literals. - {`x'636174'`, `b'cat'`}, - {`X'636174'`, `b'cat'`}, + {`x'636174'`, `'\x636174'`}, + {`X'636174'`, `'\x636174'`}, {`x'636174'::string`, `'cat'`}, - {`e'\\x636174'::BYTES`, `b'cat'`}, - {`e'\\X636174'::BYTES`, `b'cat'`}, - {`e'\\x636174'::STRING::BYTES`, `b'cat'`}, + {`e'\\x636174'::BYTES`, `'\x636174'`}, + {`e'\\X636174'::BYTES`, `'\x636174'`}, + {`e'\\x636174'::STRING::BYTES`, `'\x636174'`}, {`e'\\x636174'::STRING`, `e'\\x636174'`}, // String concatenation. {`'a' || 'b'`, `'ab'`}, {`'a' || (1 + 2)::char`, `'a3'`}, - {`b'hello' || 'world'`, `b'helloworld'`}, + {`b'hello' || 'world'`, `'\x68656c6c6f776f726c64'`}, // Bit shift operators. {`1 << 2`, `4`}, {`4 >> 2`, `1`}, @@ -606,9 +606,9 @@ func TestEval(t *testing.T) { {`CAST('123' AS int) + 1`, `124`}, {`CAST(NULL AS int)`, `NULL`}, {`'hello'::char(2)`, `'he'`}, - {`'hello'::bytes`, `b'hello'`}, + {`'hello'::bytes`, `'\x68656c6c6f'`}, {`b'hello'::string`, `'hello'`}, - {`b'\xff'`, `b'\xff'`}, + {`b'\xff'`, `'\xff'`}, {`123::text`, `'123'`}, {`date '2010-09-28'`, `'2010-09-28'`}, {`CAST('2010-09-28' AS date)`, `'2010-09-28'`}, @@ -673,7 +673,7 @@ func TestEval(t *testing.T) { {`10123456::int::interval`, `'10s123ms456µs'`}, // Type annotation expressions. {`ANNOTATE_TYPE('s', string)`, `'s'`}, - {`ANNOTATE_TYPE('s', bytes)`, `b's'`}, + {`ANNOTATE_TYPE('s', bytes)`, `'\x73'`}, {`ANNOTATE_TYPE('2010-09-28', date)`, `'2010-09-28'`}, {`ANNOTATE_TYPE('PT12H2M', interval)`, `'12h2m'`}, {`ANNOTATE_TYPE('2 02:12', interval)`, `'2d2h12m'`}, @@ -1063,7 +1063,6 @@ func TestEvalError(t *testing.T) { `could not parse "abcd" as type interval: interval: missing unit`}, {`'1- 2:3:4 9'::interval`, `could not parse "1- 2:3:4 9" as type interval: invalid input syntax for type interval 1- 2:3:4 9`}, - {`b'\xff\xfe\xfd'::string`, `invalid UTF-8: "\xff\xfe\xfd"`}, {`e'\\x6sdfsd36174'::BYTES`, `could not parse "\\x6sdfsd36174" as type bytes: encoding/hex: odd length hex string`}, {`ARRAY[NULL, ARRAY[1, 2]]`, `multidimensional arrays must have array expressions with matching dimensions`}, {`ARRAY[ARRAY[1, 2], NULL]`, `multidimensional arrays must have array expressions with matching dimensions`}, diff --git a/pkg/sql/parser/format_test.go b/pkg/sql/parser/format_test.go index 3ed6203859a6..a8369cb0eba8 100644 --- a/pkg/sql/parser/format_test.go +++ b/pkg/sql/parser/format_test.go @@ -131,7 +131,7 @@ func TestFormatExpr(t *testing.T) { {`'abc'`, FmtShowTypes, `('abc')[string]`}, {`b'abc'`, FmtShowTypes, - `(b'abc')[bytes]`}, + `('\x616263')[bytes]`}, {`interval '3s'`, FmtShowTypes, `('3s')[interval]`}, {`date '2003-01-01'`, FmtShowTypes,