Skip to content

Commit

Permalink
[SPARK-40924][SQL] Fix for Unhex when input has odd number of symbols
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

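Fix a bug in the `Unhex` function when the input string has an odd number of hex digits. For odd-length input the leading digit is decoded into `out(0)`, but the loop over the remaining digit pairs then wrote to `out(i / 2)`, which overwrote `out(0)` and left the last byte as zero (e.g. `unhex('123')` produced `0x23 0x00` instead of `0x01 0x23`). The loop now writes to `out(i / 2 + oddShift)`, where `oddShift` is 1 for odd-length input and 0 otherwise.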

### Why are the changes needed?

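The `Unhex` function and the functions that depend on it (e.g. `ToBinary`) produce incorrect output when the input has an odd number of hex digits; for example, `unhex('123')` should return the bytes `0x01 0x23`.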

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

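Unit tests: added odd-length inputs (`'123'` and `'12345'`) to the `Unhex` checks in `MathExpressionsSuite` and to the SQL query tests for `lpad`/`rpad` over `unhex`, `to_binary`, and `try_to_binary`, together with their expected outputs.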

Closes #38402 from vitaliili-db/unhex.

Authored-by: Vitalii Li <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
  • Loading branch information
vitaliili-db authored and MaxGekk committed Oct 27, 2022
1 parent 5b5eb23 commit 276abe3
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,7 @@ object Hex {
def unhex(bytes: Array[Byte]): Array[Byte] = {
val out = new Array[Byte]((bytes.length + 1) >> 1)
var i = 0
var oddShift = 0
if ((bytes.length & 0x01) != 0) {
// padding with '0'
if (bytes(0) < 0) {
Expand All @@ -1047,6 +1048,7 @@ object Hex {
}
out(0) = v
i += 1
oddShift = 1
}
// two characters form the hex value.
while (i < bytes.length) {
Expand All @@ -1058,7 +1060,7 @@ object Hex {
if (first == -1 || second == -1) {
return null
}
out(i / 2) = (((first << 4) | second) & 0xFF).toByte
out(i / 2 + oddShift) = (((first << 4) | second) & 0xFF).toByte
i += 2
}
out
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,8 @@ class MathExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(Unhex(Literal("F")), Array[Byte](15))
checkEvaluation(Unhex(Literal("ff")), Array[Byte](-1))
checkEvaluation(Unhex(Literal("GG")), null)
checkEvaluation(Unhex(Literal("123")), Array[Byte](1, 35))
checkEvaluation(Unhex(Literal("12345")), Array[Byte](1, 35, 69))
// scalastyle:off
// Turn off scala style for non-ascii chars
checkEvaluation(Unhex(Literal("E4B889E9878DE79A84")), "三重的".getBytes(StandardCharsets.UTF_8))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ SELECT rpad('hi', 'invalid_length');
SELECT hex(lpad(unhex(''), 5));
SELECT hex(lpad(unhex('aabb'), 5));
SELECT hex(lpad(unhex('aabbcc'), 2));
SELECT hex(lpad(unhex('123'), 2));
SELECT hex(lpad(unhex('12345'), 2));
SELECT hex(lpad(unhex(''), 5, unhex('1f')));
SELECT hex(lpad(unhex('aa'), 5, unhex('1f')));
SELECT hex(lpad(unhex('aa'), 6, unhex('1f')));
Expand All @@ -97,6 +99,8 @@ SELECT hex(lpad(unhex('aabbcc'), 2, unhex('ff')));
SELECT hex(rpad(unhex(''), 5));
SELECT hex(rpad(unhex('aabb'), 5));
SELECT hex(rpad(unhex('aabbcc'), 2));
SELECT hex(rpad(unhex('123'), 2));
SELECT hex(rpad(unhex('12345'), 2));
SELECT hex(rpad(unhex(''), 5, unhex('1f')));
SELECT hex(rpad(unhex('aa'), 5, unhex('1f')));
SELECT hex(rpad(unhex('aa'), 6, unhex('1f')));
Expand Down Expand Up @@ -202,6 +206,8 @@ select to_binary('737472696E67', 'hex');
select to_binary('');
select to_binary('1', 'hex');
select to_binary('FF');
select to_binary('123', 'hex');
select to_binary('12345', 'hex');
-- hex invalid
select to_binary('GG');
select to_binary('01 AF', 'hex');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ select try_to_binary('737472696E67', 'hex');
select try_to_binary('');
select try_to_binary('1', 'hex');
select try_to_binary('FF');
select try_to_binary('123');
select try_to_binary('12345');
-- hex invalid
select try_to_binary('GG');
select try_to_binary('01 AF', 'hex');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,22 @@ struct<hex(lpad(unhex(aabbcc), 2, X'00')):string>
AABB


-- !query
SELECT hex(lpad(unhex('123'), 2))
-- !query schema
struct<hex(lpad(unhex(123), 2, X'00')):string>
-- !query output
0123


-- !query
SELECT hex(lpad(unhex('12345'), 2))
-- !query schema
struct<hex(lpad(unhex(12345), 2, X'00')):string>
-- !query output
0123


-- !query
SELECT hex(lpad(unhex(''), 5, unhex('1f')))
-- !query schema
Expand Down Expand Up @@ -648,6 +664,22 @@ struct<hex(rpad(unhex(aabbcc), 2, X'00')):string>
AABB


-- !query
SELECT hex(rpad(unhex('123'), 2))
-- !query schema
struct<hex(rpad(unhex(123), 2, X'00')):string>
-- !query output
0123


-- !query
SELECT hex(rpad(unhex('12345'), 2))
-- !query schema
struct<hex(rpad(unhex(12345), 2, X'00')):string>
-- !query output
0123


-- !query
SELECT hex(rpad(unhex(''), 5, unhex('1f')))
-- !query schema
Expand Down Expand Up @@ -1408,6 +1440,22 @@ struct<to_binary(FF):binary>


-- !query
select to_binary('123', 'hex')
-- !query schema
struct<to_binary(123, hex):binary>
-- !query output
#


-- !query
select to_binary('12345', 'hex')
-- !query schema
struct<to_binary(12345, hex):binary>
-- !query output
#E


-- !query
select to_binary('GG')
-- !query schema
Expand Down Expand Up @@ -1489,7 +1537,8 @@ select to_binary('abc', 'Hex')
-- !query schema
struct<to_binary(abc, Hex):binary>
-- !query output



-- !query
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,22 @@ struct<hex(lpad(unhex(aabbcc), 2, X'00')):string>
AABB


-- !query
SELECT hex(lpad(unhex('123'), 2))
-- !query schema
struct<hex(lpad(unhex(123), 2, X'00')):string>
-- !query output
0123


-- !query
SELECT hex(lpad(unhex('12345'), 2))
-- !query schema
struct<hex(lpad(unhex(12345), 2, X'00')):string>
-- !query output
0123


-- !query
SELECT hex(lpad(unhex(''), 5, unhex('1f')))
-- !query schema
Expand Down Expand Up @@ -580,6 +596,22 @@ struct<hex(rpad(unhex(aabbcc), 2, X'00')):string>
AABB


-- !query
SELECT hex(rpad(unhex('123'), 2))
-- !query schema
struct<hex(rpad(unhex(123), 2, X'00')):string>
-- !query output
0123


-- !query
SELECT hex(rpad(unhex('12345'), 2))
-- !query schema
struct<hex(rpad(unhex(12345), 2, X'00')):string>
-- !query output
0123


-- !query
SELECT hex(rpad(unhex(''), 5, unhex('1f')))
-- !query schema
Expand Down Expand Up @@ -1340,6 +1372,22 @@ struct<to_binary(FF):binary>


-- !query
select to_binary('123', 'hex')
-- !query schema
struct<to_binary(123, hex):binary>
-- !query output
#


-- !query
select to_binary('12345', 'hex')
-- !query schema
struct<to_binary(12345, hex):binary>
-- !query output
#E


-- !query
select to_binary('GG')
-- !query schema
Expand Down Expand Up @@ -1421,7 +1469,8 @@ select to_binary('abc', 'Hex')
-- !query schema
struct<to_binary(abc, Hex):binary>
-- !query output



-- !query
Expand Down
Binary file not shown.

0 comments on commit 276abe3

Please sign in to comment.