diff --git a/ion/integration_test.go b/ion/integration_test.go index ceb7a6b7..60f506dd 100644 --- a/ion/integration_test.go +++ b/ion/integration_test.go @@ -194,7 +194,6 @@ var malformedIonsSkipList = []string{ "localSymbolTableWithMultipleSymbolsAndImportsFields.ion", "localSymbolTableWithMultipleSymbolsFields.10n", "localSymbolTableWithMultipleSymbolsFields.ion", - "longStringRawControlCharacter.ion", "minLongWithLenTooLarge.10n", "minLongWithLenTooSmall.10n", "negativeIntZero.10n", @@ -203,8 +202,6 @@ var malformedIonsSkipList = []string{ "nopPadWithAnnotations.10n", "nullDotCommentInt.ion", "sexpOperatorAnnotation.ion", - "stringLenTooLarge.10n", - "stringRawControlCharacter.ion", "stringWithLatinEncoding.10n", "structOrderedEmpty.10n", "surrogate_1.ion", @@ -225,7 +222,6 @@ var malformedIonsSkipList = []string{ var equivsSkipList = []string{ "annotatedIvms.ion", - "clobs.ion", "localSymbolTableAppend.ion", "localSymbolTableNullSlots.ion", "localSymbolTableWithAnnotations.ion", @@ -234,7 +230,6 @@ var equivsSkipList = []string{ "nonIVMNoOps.ion", "sexps.ion", "stringUtf8.ion", - "strings.ion", "structsFieldsDiffOrder.ion", "structsFieldsRepeatedNames.ion", "systemSymbols.ion", diff --git a/ion/tokenizer.go b/ion/tokenizer.go index 3413de7f..6b5888c4 100644 --- a/ion/tokenizer.go +++ b/ion/tokenizer.go @@ -542,11 +542,12 @@ func (t *tokenizer) readString() (string, error) { if err != nil { return "", err } - - switch c { - case -1, '\n': + // -1 denotes EOF; newlines and prohibited control characters are not allowed in short strings + if c == -1 || c == '\n' || isProhibitedControlChar(c) { return "", t.invalidChar(c) + } + switch c { case '"': return ret.String(), nil @@ -582,12 +583,14 @@ func (t *tokenizer) readLongString() (string, error) { if err != nil { return "", err } - - switch c { - case -1: + // -1 denotes EOF + if c == -1 || isProhibitedControlChar(c) { return "", t.invalidChar(c) + } + switch c { case '\'': + startPosition := t.pos ok, err := 
t.skipEndOfLongString(t.skipCommentsHandler) if err != nil { return "", err @@ -595,7 +598,10 @@ func (t *tokenizer) readLongString() (string, error) { if ok { return ret.String(), nil } - + if startPosition == t.pos { + // No character has been consumed, so this is a lone single quote. + ret.WriteByte(byte(c)) + } case '\\': c, err = t.peek() if err != nil { @@ -1263,3 +1269,25 @@ func (t *tokenizer) unread(c int) { t.pos-- t.buffer = append(t.buffer, c) } + +func isProhibitedControlChar(c int) bool { + // Values 0x00 through 0x1F are non-displayable ASCII control characters; all are prohibited except the whitespace and newline characters. + if c < 0x00 || c > 0x1F { + return false + } + if isStringWhitespace(c) || isNewLineChar(c) { + return false + } + return true +} + +func isStringWhitespace(c int) bool { + return c == 0x09 || //horizontal tab + c == 0x0B || //vertical tab + c == 0x0C // form feed +} + +func isNewLineChar(c int) bool { + return c == 0x0A || //new line + c == 0x0D //carriage return +}