diff --git a/src/lazy/binary/immutable_buffer.rs b/src/lazy/binary/immutable_buffer.rs index 3778318d..59c2bb60 100644 --- a/src/lazy/binary/immutable_buffer.rs +++ b/src/lazy/binary/immutable_buffer.rs @@ -552,96 +552,142 @@ impl<'a> ImmutableBuffer<'a> { /// Reads a field ID and a value from the buffer. pub(crate) fn peek_field(self) -> IonResult>> { - self.peek_value(true) - } + let mut input = self; + if self.is_empty() { + // We're at the end of the struct + return Ok(None); + } + // Read the field ID + let (mut field_id_var_uint, mut input_after_field_id) = input.read_var_uint()?; + if input_after_field_id.is_empty() { + return IonResult::incomplete( + "found field name but no value", + input_after_field_id.offset(), + ); + } + + let mut type_descriptor = input_after_field_id.peek_type_descriptor()?; + if type_descriptor.is_nop() { + // Read past NOP fields until we find the first one that's an actual value + // or we run out of struct bytes. Note that we read the NOP field(s) from `self` (the + // initial input) rather than `input_after_field_id` because it simplifies + // the logic of `read_struct_field_nop_pad()`, which is very rarely called. + (field_id_var_uint, input_after_field_id) = match input.read_struct_field_nop_pad()? { + None => { + // There are no more fields, we're at the end of the struct. + return Ok(None); + } + Some((nop_length, field_id_var_uint, input_after_field_id)) => { + // Advance `input` beyond the NOP so that when we store it in the value it begins + // with the field ID. + input = input.consume(nop_length); + type_descriptor = input_after_field_id.peek_type_descriptor()?; + (field_id_var_uint, input_after_field_id) + } + }; + } + + let field_id_length = field_id_var_uint.size_in_bytes() as u8; + let field_id = field_id_var_uint.value(); - /// Reads a value from the buffer. - pub(crate) fn peek_value_without_field_id(self) -> IonResult>> { - self.peek_value(false) + let mut value = input_after_field_id.read_value(type_descriptor)?; + value.encoded_value.field_id = Some(field_id); + value.encoded_value.field_id_length = field_id_length; + value.encoded_value.total_length += field_id_length as usize; + value.input = input; + Ok(Some(value)) } - /// Reads a value from the buffer. If `has_field` is true, it will read a field ID first. - // This method consumes leading NOP bytes, but leaves the header representation in the buffer. - // The resulting LazyRawValue's buffer slice always starts with the first non-NOP byte in the - // header, which can be either a field ID, an annotations wrapper, or a type descriptor. - fn peek_value(self, has_field: bool) -> IonResult>> { - let initial_input = self; - if initial_input.is_empty() { - return Ok(None); - } - let (field_id, field_id_length, mut input) = if has_field { - let (field_id_var_uint, input_after_field_id) = initial_input.read_var_uint()?; + #[cold] + /// Consumes (field ID, NOP pad) pairs until a non-NOP value is encountered in field position or + /// the buffer is empty. Returns a buffer starting at the field ID before the non-NOP value. + fn read_struct_field_nop_pad(self) -> IonResult)>> { + let mut input_before_field_id = self; + loop { + if input_before_field_id.is_empty() { + return Ok(None); + } + let (field_id_var_uint, input_after_field_id) = + input_before_field_id.read_var_uint()?; + // If we're out of data (i.e. there's no field value) the struct is incomplete. 
if input_after_field_id.is_empty() { return IonResult::incomplete( - "found field name but no value", + "found a field name but no value", input_after_field_id.offset(), ); } - let field_id_length = - u8::try_from(field_id_var_uint.size_in_bytes()).map_err(|_| { - IonError::decoding_error("found a field id with length over 255 bytes") - })?; - ( - Some(field_id_var_uint.value()), - field_id_length, - input_after_field_id, - ) - } else { - (None, 0, initial_input) - }; - - let mut annotations_header_length = 0u8; - let mut annotations_sequence_length = 0u8; - let mut expected_value_length = None; + // Peek at the next value header. If it's a NOP, we need to repeat the process. + if input_after_field_id.peek_type_descriptor()?.is_nop() { + // Consume the NOP to position the buffer at the beginning of the next field ID. + (_, input_before_field_id) = input_after_field_id.read_nop_pad()?; + } else { + // If it isn't a NOP, return the field ID and the buffer slice containing the field + // value. + let nop_length = input_before_field_id.offset() - self.offset(); + return Ok(Some((nop_length, field_id_var_uint, input_after_field_id))); + } + } + } + /// Reads a value without a field name from the buffer. This is applicable in lists, s-expressions, + /// and at the top level. + pub(crate) fn peek_sequence_value(self) -> IonResult>> { + if self.is_empty() { + return Ok(None); + } + let mut input = self; let mut type_descriptor = input.peek_type_descriptor()?; - if type_descriptor.is_annotation_wrapper() { - let (wrapper, input_after_annotations) = - input.read_annotations_wrapper(type_descriptor)?; - annotations_header_length = wrapper.header_length; - annotations_sequence_length = wrapper.sequence_length; - expected_value_length = Some(wrapper.expected_value_length); - input = input_after_annotations; - type_descriptor = input.peek_type_descriptor()?; - if type_descriptor.is_annotation_wrapper() { - return IonResult::decoding_error("found an annotations wrapper "); + // If we find a NOP... + if type_descriptor.is_nop() { + // ...skip through NOPs until we found the next non-NOP byte. + (_, input) = self.consume_nop_padding(type_descriptor)?; + // If there is no next byte, we're out of values. + if input.is_empty() { + return Ok(None); } - } else if type_descriptor.is_nop() { - (_, input) = input.consume_nop_padding(type_descriptor)?; + // Otherwise, there's a value. + type_descriptor = input.peek_type_descriptor()?; + } + Ok(Some(input.read_value(type_descriptor)?)) + } + + /// Reads a value from the buffer. The caller must confirm that the buffer is not empty and that + /// the next byte (`type_descriptor`) is not a NOP. + fn read_value(self, type_descriptor: TypeDescriptor) -> IonResult> { + if type_descriptor.is_annotation_wrapper() { + self.read_annotated_value(type_descriptor) + } else { + self.read_value_without_annotations(type_descriptor) } + } + /// Reads a value from the buffer. The caller must confirm that the buffer is not empty and that + /// the next byte (`type_descriptor`) is neither a NOP nor an annotations wrapper. 
+ fn read_value_without_annotations( + self, + type_descriptor: TypeDescriptor, + ) -> IonResult> { + let input = self; let header = type_descriptor .to_header() .ok_or_else(|| IonError::decoding_error("found a non-value in value position"))?; let header_offset = input.offset(); let (length, _) = input.consume(1).read_value_length(header)?; - let length_length = u8::try_from(length.size_in_bytes()).map_err(|_e| { - IonError::decoding_error("found a value with a header length field over 255 bytes long") - })?; + let length_length = length.size_in_bytes() as u8; let value_length = length.value(); // ha - let total_length = field_id_length as usize - + annotations_header_length as usize - + 1 // Header byte - + length_length as usize - + value_length; - - if let Some(expected_value_length) = expected_value_length { - let actual_value_length = 1 + length_length as usize + value_length; - if expected_value_length != actual_value_length { - println!("{} != {}", expected_value_length, actual_value_length); - return IonResult::decoding_error( - "value length did not match length declared by annotations wrapper", - ); - } - } + let total_length = 1 // Header byte + + length_length as usize + + value_length; let encoded_value = EncodedValue { header, - field_id_length, - field_id, - annotations_header_length, - annotations_sequence_length, + // If applicable, these are populated by the caller: `peek_field()` + field_id_length: 0, + field_id: None, + // If applicable, these are populated by the caller: `read_annotated_value()` + annotations_header_length: 0, + annotations_sequence_length: 0, header_offset, length_length, value_length, @@ -649,9 +695,46 @@ impl<'a> ImmutableBuffer<'a> { }; let lazy_value = LazyRawBinaryValue { encoded_value, - input: initial_input, + // If this value has a field ID or annotations, this will be replaced by the caller. + input: self, }; - Ok(Some(lazy_value)) + Ok(lazy_value) + } + + /// Reads an annotations wrapper and its associated value from the buffer. The caller must confirm + /// that the next byte in the buffer (`type_descriptor`) begins an annotations wrapper. + fn read_annotated_value( + self, + mut type_descriptor: TypeDescriptor, + ) -> IonResult> { + let input = self; + let (wrapper, input_after_annotations) = input.read_annotations_wrapper(type_descriptor)?; + type_descriptor = input_after_annotations.peek_type_descriptor()?; + + // Confirm that the next byte begins a value, not a NOP or another annotations wrapper. 
+ if type_descriptor.is_annotation_wrapper() { + return IonResult::decoding_error( + "found an annotations wrapper inside an annotations wrapper", + ); + } else if type_descriptor.is_nop() { + return IonResult::decoding_error("found a NOP inside an annotations wrapper"); + } + + let mut lazy_value = + input_after_annotations.read_value_without_annotations(type_descriptor)?; + if wrapper.expected_value_length != lazy_value.encoded_value.total_length() { + return IonResult::decoding_error( + "value length did not match length declared by annotations wrapper", + ); + } + + lazy_value.encoded_value.annotations_header_length = wrapper.header_length; + lazy_value.encoded_value.annotations_sequence_length = wrapper.sequence_length; + lazy_value.encoded_value.total_length += wrapper.header_length as usize; + // Modify the input to include the annotations + lazy_value.input = input; + + Ok(lazy_value) } } diff --git a/src/lazy/binary/raw/reader.rs b/src/lazy/binary/raw/reader.rs index 99c28d2f..f56c65ce 100644 --- a/src/lazy/binary/raw/reader.rs +++ b/src/lazy/binary/raw/reader.rs @@ -49,7 +49,7 @@ impl<'data> LazyRawBinaryReader<'data> { &mut self, buffer: ImmutableBuffer<'data>, ) -> IonResult> { - let lazy_value = match ImmutableBuffer::peek_value_without_field_id(buffer)? { + let lazy_value = match ImmutableBuffer::peek_sequence_value(buffer)? { Some(lazy_value) => lazy_value, None => return Ok(RawStreamItem::EndOfStream), }; @@ -142,7 +142,9 @@ impl<'data> DataSource<'data> { Err(e) => return Err(e), }; - self.buffer = buffer; + // If the value we read doesn't start where we began reading, there was a NOP. + let num_nop_bytes = lazy_value.input.offset() - buffer.offset(); + self.buffer = buffer.consume(num_nop_bytes); self.bytes_to_skip = lazy_value.encoded_value.total_length(); Ok(Some(lazy_value)) } diff --git a/src/lazy/binary/raw/sequence.rs b/src/lazy/binary/raw/sequence.rs index 7bb86d45..0014e288 100644 --- a/src/lazy/binary/raw/sequence.rs +++ b/src/lazy/binary/raw/sequence.rs @@ -157,7 +157,7 @@ impl<'data> Iterator for RawBinarySequenceIterator<'data> { fn next(&mut self) -> Option { self.source - .try_parse_next(ImmutableBuffer::peek_value_without_field_id) + .try_parse_next(ImmutableBuffer::peek_sequence_value) .transpose() } } diff --git a/src/lazy/binary/raw/value.rs b/src/lazy/binary/raw/value.rs index 245f4706..5bdc8641 100644 --- a/src/lazy/binary/raw/value.rs +++ b/src/lazy/binary/raw/value.rs @@ -257,8 +257,8 @@ impl<'data> LazyRawBinaryValue<'data> { return Ok(RawValueRef::Decimal(Decimal::new(0i32, 0i64))); } - // Skip the type descriptor - let input = self.input.consume(1); + // Skip the type descriptor and length bytes + let input = ImmutableBuffer::new(self.value_body()?); let (exponent_var_int, remaining) = input.read_var_int()?; let coefficient_size_in_bytes = diff --git a/src/lazy/reader.rs b/src/lazy/reader.rs index 2f3cfbb4..a8624193 100644 --- a/src/lazy/reader.rs +++ b/src/lazy/reader.rs @@ -1,16 +1,17 @@ use crate::binary::constants::v1_0::IVM; use crate::element::reader::ElementReader; use crate::element::Element; +use crate::lazy::any_encoding::AnyEncoding; use crate::lazy::decoder::LazyDecoder; -use crate::lazy::encoding::BinaryEncoding; -use crate::lazy::system_reader::LazySystemReader; +use crate::lazy::encoding::{BinaryEncoding, TextEncoding}; +use crate::lazy::system_reader::{LazySystemAnyReader, LazySystemBinaryReader, LazySystemReader}; use crate::lazy::value::LazyValue; use crate::result::IonFailure; use crate::{IonError, IonResult}; /// A binary 
reader that only reads each value that it visits upon request (that is: lazily). /// -/// Each time [`LazyReader::next`] is called, the reader will advance to the next top-level value +/// Each time [`LazyApplicationReader::next`] is called, the reader will advance to the next top-level value /// in the input stream. Once positioned on a top-level value, users may visit nested values by /// calling [`LazyValue::read`] and working with the resulting [`crate::lazy::value_ref::ValueRef`], /// which may contain either a scalar value or a lazy container that may itself be traversed. @@ -18,7 +19,7 @@ use crate::{IonError, IonResult}; /// The values that the reader yields ([`LazyValue`], /// [`LazyBinarySequence`](crate::lazy::sequence::LazyBinarySequence), and /// [`LazyBinaryStruct`](crate::lazy::struct::LazyStruct)) are -/// immutable references to the data stream, and remain valid until [`LazyReader::next`] is called +/// immutable references to the data stream, and remain valid until [`LazyApplicationReader::next`] is called /// again to advance the reader to the next top level value. This means that these references can /// be stored, read, and re-read as long as the reader remains on the same top-level value. /// ``` @@ -55,11 +56,11 @@ use crate::{IonError, IonResult}; ///# Ok(()) ///# } /// ``` -pub struct LazyReader<'data, D: LazyDecoder<'data>> { +pub struct LazyApplicationReader<'data, D: LazyDecoder<'data>> { system_reader: LazySystemReader<'data, D>, } -impl<'data, D: LazyDecoder<'data>> LazyReader<'data, D> { +impl<'data, D: LazyDecoder<'data>> LazyApplicationReader<'data, D> { /// Returns the next top-level value in the input stream as `Ok(Some(lazy_value))`. /// If there are no more top-level values in the stream, returns `Ok(None)`. /// If the next value is incomplete (that is: only part of it is in the input buffer) or if the @@ -75,7 +76,16 @@ impl<'data, D: LazyDecoder<'data>> LazyReader<'data, D> { } } -pub type LazyBinaryReader<'data> = LazyReader<'data, BinaryEncoding>; +pub type LazyBinaryReader<'data> = LazyApplicationReader<'data, BinaryEncoding>; +pub type LazyTextReader<'data> = LazyApplicationReader<'data, TextEncoding>; +pub type LazyReader<'data> = LazyApplicationReader<'data, AnyEncoding>; + +impl<'data> LazyReader<'data> { + pub fn new(ion_data: &'data [u8]) -> LazyReader<'data> { + let system_reader = LazySystemAnyReader::new(ion_data); + LazyApplicationReader { system_reader } + } +} impl<'data> LazyBinaryReader<'data> { pub fn new(ion_data: &'data [u8]) -> IonResult> { @@ -85,13 +95,13 @@ impl<'data> LazyBinaryReader<'data> { return IonResult::decoding_error("input does not begin with an Ion version marker"); } - let system_reader = LazySystemReader::new(ion_data); - Ok(LazyReader { system_reader }) + let system_reader = LazySystemBinaryReader::new(ion_data); + Ok(LazyApplicationReader { system_reader }) } } pub struct LazyElementIterator<'iter, 'data, D: LazyDecoder<'data>> { - lazy_reader: &'iter mut LazyReader<'data, D>, + lazy_reader: &'iter mut LazyApplicationReader<'data, D>, } impl<'iter, 'data, D: LazyDecoder<'data>> Iterator for LazyElementIterator<'iter, 'data, D> { @@ -106,7 +116,7 @@ impl<'iter, 'data, D: LazyDecoder<'data>> Iterator for LazyElementIterator<'iter } } -impl<'data, D: LazyDecoder<'data>> ElementReader for LazyReader<'data, D> { +impl<'data, D: LazyDecoder<'data>> ElementReader for LazyApplicationReader<'data, D> { type ElementIterator<'a> = LazyElementIterator<'a, 'data, D> where Self: 'a,; fn read_next_element(&mut self) -> 
IonResult> { diff --git a/src/lazy/str_ref.rs b/src/lazy/str_ref.rs index 850d0556..d8afc4df 100644 --- a/src/lazy/str_ref.rs +++ b/src/lazy/str_ref.rs @@ -1,6 +1,6 @@ use crate::lazy::bytes_ref::BytesRef; use crate::text::text_formatter::IonValueFormatter; -use crate::Str; +use crate::{RawSymbolTokenRef, Str}; use std::borrow::Cow; use std::fmt::{Display, Formatter}; use std::ops::Deref; @@ -90,3 +90,9 @@ impl<'data> From> for BytesRef<'data> { } } } + +impl<'data> From> for RawSymbolTokenRef<'data> { + fn from(value: StrRef<'data>) -> Self { + RawSymbolTokenRef::Text(value.text) + } +} diff --git a/src/lazy/struct.rs b/src/lazy/struct.rs index 4956728f..b02f86bb 100644 --- a/src/lazy/struct.rs +++ b/src/lazy/struct.rs @@ -185,7 +185,7 @@ impl<'top, 'data, D: LazyDecoder<'data>> LazyStruct<'top, 'data, D> { /// let lazy_struct = reader.expect_next()?.read()?.expect_struct()?; /// /// assert_eq!(lazy_struct.get_expected("foo")?, ValueRef::String("hello".into())); - /// assert!(dbg!(lazy_struct.get_expected("Ontario")).is_err()); + /// assert!(lazy_struct.get_expected("Ontario").is_err()); ///# Ok(()) ///# } /// ``` diff --git a/src/lazy/system_reader.rs b/src/lazy/system_reader.rs index e36e4e38..5f5c3837 100644 --- a/src/lazy/system_reader.rs +++ b/src/lazy/system_reader.rs @@ -1,4 +1,5 @@ -use crate::lazy::encoding::BinaryEncoding; +use crate::lazy::any_encoding::{AnyEncoding, LazyRawAnyReader}; +use crate::lazy::encoding::{BinaryEncoding, TextEncoding}; use crate::result::IonFailure; use crate::{IonResult, IonType, RawSymbolTokenRef, SymbolTable}; @@ -21,7 +22,7 @@ const SYMBOLS: RawSymbolTokenRef = RawSymbolTokenRef::SymbolId(7); /// A binary reader that only reads each value that it visits upon request (that is: lazily). /// -/// Unlike [`crate::lazy::reader::LazyReader`], which only exposes values that are part +/// Unlike [`crate::lazy::reader::LazyApplicationReader`], which only exposes values that are part /// of the application data model, [`LazySystemReader`] also yields Ion version markers /// (as [`SystemStreamItem::VersionMarker`]) and structs representing a symbol table (as /// [`SystemStreamItem::SymbolTable`]). @@ -76,7 +77,9 @@ pub struct LazySystemReader<'data, D: LazyDecoder<'data>> { pending_lst: PendingLst, } -pub type LazyBinarySystemReader<'data> = LazySystemReader<'data, BinaryEncoding>; +pub type LazySystemBinaryReader<'data> = LazySystemReader<'data, BinaryEncoding>; +pub type LazySystemTextReader<'data> = LazySystemReader<'data, TextEncoding>; +pub type LazySystemAnyReader<'data> = LazySystemReader<'data, AnyEncoding>; // If the reader encounters a symbol table in the stream, it will store all of the symbols that // the table defines in this structure so that they may be applied when the reader next advances. 
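Note: the three system-reader aliases introduced above are all driven the same way as the updated tests later in this file: call `next_item()` in a loop and branch on the resulting `SystemStreamItem`. A minimal in-crate sketch of that pattern follows; the `system_stream_item` module path, the `EndOfStream` variant, and the `count_application_values` helper are assumptions made for illustration, not part of this change.

    use crate::lazy::system_reader::LazySystemBinaryReader;
    use crate::lazy::system_stream_item::SystemStreamItem;
    use crate::IonResult;

    fn count_application_values(ion_data: &[u8]) -> IonResult<usize> {
        // LazySystemAnyReader::new(ion_data) could be substituted here to let the reader
        // choose between the binary and text encodings itself.
        let mut system_reader = LazySystemBinaryReader::new(ion_data);
        let mut count = 0;
        loop {
            match system_reader.next_item()? {
                // Application data is what LazyApplicationReader ultimately surfaces...
                SystemStreamItem::Value(_value) => count += 1,
                SystemStreamItem::EndOfStream => return Ok(count),
                // ...while IVMs and symbol tables remain visible only at the system level.
                _ => {}
            }
        }
    }
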
@@ -85,8 +88,22 @@ struct PendingLst { symbols: Vec>, } -impl<'data> LazyBinarySystemReader<'data> { - pub(crate) fn new(ion_data: &'data [u8]) -> LazyBinarySystemReader<'data> { +impl<'data> LazySystemAnyReader<'data> { + pub fn new(ion_data: &'data [u8]) -> LazySystemAnyReader<'data> { + let raw_reader = LazyRawAnyReader::new(ion_data); + LazySystemReader { + raw_reader, + symbol_table: SymbolTable::new(), + pending_lst: PendingLst { + is_lst_append: false, + symbols: Vec::new(), + }, + } + } +} + +impl<'data> LazySystemBinaryReader<'data> { + pub(crate) fn new(ion_data: &'data [u8]) -> LazySystemBinaryReader<'data> { let raw_reader = LazyRawBinaryReader::new(ion_data); LazySystemReader { raw_reader, @@ -107,7 +124,7 @@ impl<'data, D: LazyDecoder<'data>> LazySystemReader<'data, D> { return Ok(false); } if let Some(symbol_ref) = lazy_value.annotations().next() { - return Ok(symbol_ref? == ION_SYMBOL_TABLE); + return Ok(symbol_ref?.matches_sid_or_text(3, "$ion_symbol_table")); }; Ok(false) } @@ -208,7 +225,7 @@ impl<'data, D: LazyDecoder<'data>> LazySystemReader<'data, D> { for field_result in symbol_table.iter() { let field = field_result?; - if field.name() == SYMBOLS { + if field.name().matches_sid_or_text(7, "symbols") { if found_symbols_field { return IonResult::decoding_error( "found symbol table with multiple 'symbols' fields", @@ -217,7 +234,7 @@ impl<'data, D: LazyDecoder<'data>> LazySystemReader<'data, D> { found_symbols_field = true; Self::process_symbols(pending_lst, &field.value())?; } - if field.name() == IMPORTS { + if field.name().matches_sid_or_text(6, "imports") { if found_imports_field { return IonResult::decoding_error( "found symbol table with multiple 'imports' fields", @@ -250,7 +267,7 @@ impl<'data, D: LazyDecoder<'data>> LazySystemReader<'data, D> { fn process_imports(pending_lst: &mut PendingLst, imports: &D::Value) -> IonResult<()> { match imports.read()? { RawValueRef::Symbol(symbol_ref) => { - if symbol_ref == RawSymbolTokenRef::SymbolId(3) { + if symbol_ref.matches_sid_or_text(3, "$ion_symbol_table") { pending_lst.is_lst_append = true; } // Any other symbol is ignored @@ -291,7 +308,7 @@ mod tests { hello "#, )?; - let mut system_reader = LazySystemReader::new(&ion_data); + let mut system_reader = LazySystemBinaryReader::new(&ion_data); loop { match system_reader.next_item()? { SystemStreamItem::VersionMarker(major, minor) => { @@ -316,7 +333,7 @@ mod tests { ) "#, )?; - let mut system_reader = LazySystemReader::new(&ion_data); + let mut system_reader = LazySystemBinaryReader::new(&ion_data); loop { match system_reader.next_item()? { SystemStreamItem::Value(value) => { @@ -343,7 +360,7 @@ mod tests { } "#, )?; - let mut system_reader = LazySystemReader::new(&ion_data); + let mut system_reader = LazySystemBinaryReader::new(&ion_data); loop { match system_reader.next_item()? 
{ SystemStreamItem::Value(value) => { diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index 00306eef..f0bb3f6d 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -5,20 +5,28 @@ use std::slice::Iter; use std::str::FromStr; use nom::branch::alt; -use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1, take_while_m_n}; +use nom::bytes::complete::{ + is_a as complete_is_a, is_not as complete_is_not, tag as complete_tag, + take_while as complete_take_while, take_while1 as complete_take_while1, +}; +use nom::bytes::streaming::{is_a, tag, take_until, take_while_m_n}; +use nom::character::complete::{ + char as complete_char, digit1 as complete_digit1, one_of as complete_one_of, +}; use nom::character::streaming::{alphanumeric1, char, digit1, one_of, satisfy}; -use nom::combinator::{consumed, map, not, opt, peek, recognize, success, value}; +use nom::combinator::{consumed, eof, map, not, opt, peek, recognize, success, value}; use nom::error::{ErrorKind, ParseError}; -use nom::multi::{fold_many1, many0_count, many1_count}; +use nom::multi::{fold_many1, fold_many_m_n, many0_count, many1_count}; use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; -use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser}; +use nom::{AsBytes, CompareResult, IResult, InputLength, InputTake, Needed, Parser}; use crate::lazy::encoding::TextEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::{ - MatchedBlob, MatchedClob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, - MatchedString, MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue, + MatchedBlob, MatchedClob, MatchedDecimal, MatchedFieldName, MatchedFloat, + MatchedHoursAndMinutes, MatchedInt, MatchedString, MatchedSymbol, MatchedTimestamp, + MatchedTimestampOffset, MatchedValue, }; use crate::lazy::text::parse_result::{InvalidInputError, IonParseError}; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; @@ -67,7 +75,23 @@ const WHITESPACE_CHARACTERS: &[char] = &[ ]; /// Same as [WHITESPACE_CHARACTERS], but formatted as a string for use in some `nom` APIs -const WHITESPACE_CHARACTERS_AS_STR: &str = " \t\r\n\x09\x0B\x0C"; +pub(crate) const WHITESPACE_CHARACTERS_AS_STR: &str = " \t\r\n\x09\x0B\x0C"; + +/// This helper function takes a parser and returns a closure that performs the same parsing +/// but prints the Result before returning the output. This is handy for debugging. +// A better implementation would use a macro to auto-generate the label from the file name and +// line number. +fn dbg_parse>( + label: &'static str, + mut parser: P, +) -> impl Parser { + move |input: I| { + let result = parser.parse(input); + #[cfg(debug_assertions)] + println!("{}: {:?}", label, result); + result + } +} /// A slice of unsigned bytes that can be cheaply copied and which defines methods for parsing /// the various encoding elements of a text Ion stream. @@ -165,7 +189,7 @@ impl<'data> TextBufferView<'data> { } pub fn match_whitespace(self) -> IonMatchResult<'data> { - is_a(WHITESPACE_CHARACTERS_AS_STR)(self) + complete_is_a(WHITESPACE_CHARACTERS_AS_STR)(self) } /// Always succeeds and consumes none of the input. Returns an empty slice of the buffer. @@ -209,13 +233,13 @@ impl<'data> TextBufferView<'data> { fn match_rest_of_line_comment(self) -> IonMatchResult<'data> { preceded( // Matches a leading "//"... 
- tag("//"), + complete_tag("//"), // ...followed by either... alt(( // ...one or more non-EOL characters... - is_not("\r\n"), + complete_is_not("\r\n"), // ...or any EOL character. - peek(recognize(one_of("\r\n"))), + peek(recognize(complete_one_of("\r\n"))), // In either case, the line ending will not be consumed. )), )(self) @@ -225,18 +249,25 @@ impl<'data> TextBufferView<'data> { fn match_multiline_comment(self) -> IonMatchResult<'data> { recognize(delimited( // Matches a leading "/*"... - tag("/*"), + complete_tag("/*"), // ...any number of non-"*/" characters... take_until("*/"), // ...and then a closing "*/" - tag("*/"), + complete_tag("*/"), ))(self) } /// Matches an Ion version marker (e.g. `$ion_1_0` or `$ion_1_1`.) pub fn match_ivm(self) -> IonParseResult<'data, RawStreamItem<'data, TextEncoding>> { - let (remaining, (major, minor)) = - preceded(tag("$ion_"), separated_pair(digit1, tag("_"), digit1))(self)?; + let (remaining, (major, minor)) = terminated( + preceded( + complete_tag("$ion_"), + separated_pair(complete_digit1, complete_tag("_"), complete_digit1), + ), + // Look ahead to make sure the IVM isn't followed by a '::'. If it is, then it's not + // an IVM, it's an annotation. + peek(whitespace_and_then(not(complete_tag("::")))), + )(self)?; // `major` and `minor` are base 10 digits. Turning them into `&str`s is guaranteed to succeed. let major_version = u8::from_str(major.as_text().unwrap()).map_err(|_| { let error = InvalidInputError::new(major) @@ -266,7 +297,7 @@ impl<'data> TextBufferView<'data> { pub fn match_annotation(self) -> IonParseResult<'data, (MatchedSymbol, Range)> { terminated( whitespace_and_then(match_and_span(Self::match_symbol)), - whitespace_and_then(tag("::")), + whitespace_and_then(complete_tag("::")), )(self) } @@ -280,7 +311,7 @@ impl<'data> TextBufferView<'data> { // int `3` while recognizing the input `-3` as the int `-3`. If `match_operator` runs before // `match_value`, it will consume the sign (`-`) of negative number values, treating // `-3` as an operator (`-`) and an int (`3`). Thus, we run `match_value` first. - alt((Self::match_value, Self::match_operator)), + whitespace_and_then(alt((Self::match_value, Self::match_operator))), ) .map(|(maybe_annotations, mut value)| { if let Some(annotations) = maybe_annotations { @@ -296,28 +327,6 @@ impl<'data> TextBufferView<'data> { .parse(self) } - /// Matches a single value in a list OR the end of the list, allowing for leading whitespace - /// and comments in either case. - /// - /// If a value is found, returns `Ok(Some(value))`. If the end of the list is found, returns - /// `Ok(None)`. - pub fn match_list_value(self) -> IonParseResult<'data, Option>> { - preceded( - // Some amount of whitespace/comments... - Self::match_optional_comments_and_whitespace, - // ...followed by either the end of the list... - alt(( - value(None, tag("]")), - // ...or a value... - terminated( - Self::match_annotated_value.map(Some), - // ...followed by a comma or end-of-list - Self::match_delimiter_after_list_value, - ), - )), - )(self) - } - /// Matches a struct field name/value pair. /// /// If a pair is found, returns `Some(field)` and consumes the following comma if present. @@ -357,7 +366,7 @@ impl<'data> TextBufferView<'data> { /// input bytes where the field name is found, and the value. 
pub fn match_struct_field_name_and_value( self, - ) -> IonParseResult<'data, ((MatchedSymbol, Range), LazyRawTextValue<'data>)> { + ) -> IonParseResult<'data, ((MatchedFieldName, Range), LazyRawTextValue<'data>)> { terminated( separated_pair( whitespace_and_then(match_and_span(Self::match_struct_field_name)), @@ -392,32 +401,13 @@ impl<'data> TextBufferView<'data> { /// * An identifier /// * A symbol ID /// * A short-form string - pub fn match_struct_field_name(self) -> IonParseResult<'data, MatchedSymbol> { + pub fn match_struct_field_name(self) -> IonParseResult<'data, MatchedFieldName> { alt(( - Self::match_symbol, - Self::match_short_string.map(|s| { - // NOTE: We're "casting" the matched short string to a matched symbol here. - // This relies on the fact that the MatchedSymbol logic ignores - // the first and last matched byte, which are usually single - // quotes but in this case are double quotes. - match s { - MatchedString::ShortWithoutEscapes => MatchedSymbol::QuotedWithoutEscapes, - MatchedString::ShortWithEscapes => MatchedSymbol::QuotedWithEscapes, - _ => unreachable!("field name parser matched long string"), - } - }), + Self::match_string.map(MatchedFieldName::String), + Self::match_symbol.map(MatchedFieldName::Symbol), ))(self) } - /// Matches syntax that is expected to follow a value in a list: any amount of whitespace and/or - /// comments followed by either a comma (consumed) or an end-of-list `]` (not consumed). - fn match_delimiter_after_list_value(self) -> IonMatchResult<'data> { - preceded( - Self::match_optional_comments_and_whitespace, - alt((tag(","), peek(tag("]")))), - )(self) - } - /// Matches a single top-level value, an IVM, or the end of the stream. pub fn match_top_level_item(self) -> IonParseResult<'data, RawStreamItem<'data, TextEncoding>> { // If only whitespace/comments remain, we're at the end of the stream. @@ -575,6 +565,37 @@ impl<'data> TextBufferView<'data> { Ok((remaining, matched)) } + /// Matches a single value in a list OR the end of the list, allowing for leading whitespace + /// and comments in either case. + /// + /// If a value is found, returns `Ok(Some(value))`. If the end of the list is found, returns + /// `Ok(None)`. + pub fn match_list_value(self) -> IonParseResult<'data, Option>> { + preceded( + // Some amount of whitespace/comments... + Self::match_optional_comments_and_whitespace, + // ...followed by either the end of the list... + alt(( + value(None, tag("]")), + // ...or a value... + terminated( + Self::match_annotated_value.map(Some), + // ...followed by a comma or end-of-list + Self::match_delimiter_after_list_value, + ), + )), + )(self) + } + + /// Matches syntax that is expected to follow a value in a list: any amount of whitespace and/or + /// comments followed by either a comma (consumed) or an end-of-list `]` (not consumed). + fn match_delimiter_after_list_value(self) -> IonMatchResult<'data> { + preceded( + Self::match_optional_comments_and_whitespace, + alt((tag(","), peek(tag("]")))), + )(self) + } + /// Matches an s-expression (sexp). /// /// If the input does not contain the entire s-expression, returns `IonError::Incomplete(_)`. @@ -662,8 +683,8 @@ impl<'data> TextBufferView<'data> { /// Matches and returns a null value. 
pub fn read_null(self) -> IonParseResult<'data, IonType> { delimited( - tag("null"), - opt(preceded(char('.'), Self::read_ion_type)), + complete_tag("null"), + opt(preceded(complete_char('.'), Self::read_ion_type)), Self::peek_stop_character, ) .map(|explicit_ion_type| explicit_ion_type.unwrap_or(IonType::Null)) @@ -673,25 +694,25 @@ impl<'data> TextBufferView<'data> { /// Matches and returns an Ion type. fn read_ion_type(self) -> IonParseResult<'data, IonType> { alt(( - value(IonType::Null, tag("null")), - value(IonType::Bool, tag("bool")), - value(IonType::Int, tag("int")), - value(IonType::Float, tag("float")), - value(IonType::Decimal, tag("decimal")), - value(IonType::Timestamp, tag("timestamp")), - value(IonType::Symbol, tag("symbol")), - value(IonType::String, tag("string")), - value(IonType::Clob, tag("clob")), - value(IonType::Blob, tag("blob")), - value(IonType::List, tag("list")), - value(IonType::SExp, tag("sexp")), - value(IonType::Struct, tag("struct")), + value(IonType::Null, complete_tag("null")), + value(IonType::Bool, complete_tag("bool")), + value(IonType::Int, complete_tag("int")), + value(IonType::Float, complete_tag("float")), + value(IonType::Decimal, complete_tag("decimal")), + value(IonType::Timestamp, complete_tag("timestamp")), + value(IonType::Symbol, complete_tag("symbol")), + value(IonType::String, complete_tag("string")), + value(IonType::Clob, complete_tag("clob")), + value(IonType::Blob, complete_tag("blob")), + value(IonType::List, complete_tag("list")), + value(IonType::SExp, complete_tag("sexp")), + value(IonType::Struct, complete_tag("struct")), ))(self) } /// Matches any one of Ion's stop characters. fn match_stop_character(self) -> IonMatchResult<'data> { - recognize(one_of("{}[](),\"' \t\n\r\u{0b}\u{0c}")).parse(self) + alt((eof, recognize(one_of("{}[](),\"' \t\n\r\u{0b}\u{0c}"))))(self) } /// Matches--but does not consume--any one of Ion's stop characters. @@ -701,7 +722,7 @@ impl<'data> TextBufferView<'data> { /// Matches the three parts of an int--its base, its sign, and its digits--without actually /// constructing an Int from them. - fn match_int(self) -> IonParseResult<'data, MatchedInt> { + pub fn match_int(self) -> IonParseResult<'data, MatchedInt> { terminated( // We test for base 16 and base 2 so the '0x' or '0b' isn't confused for a leading zero // in a base 10 number, which would be illegal. @@ -719,7 +740,7 @@ impl<'data> TextBufferView<'data> { fn match_base_2_int(self) -> IonParseResult<'data, MatchedInt> { separated_pair( opt(char('-')), - alt((tag("0b"), tag("0B"))), + alt((complete_tag("0b"), complete_tag("0B"))), Self::match_base_2_int_digits, ) .map(|(maybe_sign, digits)| { @@ -732,9 +753,9 @@ impl<'data> TextBufferView<'data> { fn match_base_2_int_digits(self) -> IonMatchResult<'data> { recognize(terminated( // Zero or more digits-followed-by-underscores - many0_count(pair(is_a("01"), char('_'))), + many0_count(pair(complete_is_a("01"), complete_tag("_"))), // One or more digits - is_a("01"), + complete_is_a("01"), ))(self) } @@ -750,12 +771,7 @@ impl<'data> TextBufferView<'data> { /// Matches the digits of a base-10 integer. (i.e. An integer without a sign.) fn match_base_10_int_digits(self) -> IonMatchResult<'data> { - alt(( - // The number is either a zero... 
- recognize(char('0')), - // Or it's a non-zero followed by some number of '_'-separated digits - Self::match_base_10_digits_before_dot, - ))(self) + Self::match_base_10_digits_before_dot(self) } /// Matches either: @@ -763,7 +779,9 @@ impl<'data> TextBufferView<'data> { /// * a non-zero followed by some number of digits with optional underscores fn match_base_10_digits_before_dot(self) -> IonMatchResult<'data> { alt(( - tag("0"), + // The number is either a zero... + complete_tag("0"), + // Or it's a non-zero followed by some number of '_'-separated digits recognize(pair( Self::match_base_10_leading_digit, Self::match_base_10_trailing_digits, @@ -780,7 +798,7 @@ impl<'data> TextBufferView<'data> { /// This parser accepts leading zeros, which is why it cannot be used for the beginning /// of a number. fn match_base_10_trailing_digits(self) -> IonMatchResult<'data> { - recognize(many0_count(pair(opt(char('_')), digit1)))(self) + recognize(many0_count(pair(opt(complete_char('_')), complete_digit1)))(self) } /// Matches a base-10 notation integer (e.g. `0x0`, `0X20`, or `-0xCAFE`) and returns the @@ -788,7 +806,7 @@ impl<'data> TextBufferView<'data> { fn match_base_16_int(self) -> IonParseResult<'data, MatchedInt> { separated_pair( opt(char('-')), - alt((tag("0x"), tag("0X"))), + alt((complete_tag("0x"), complete_tag("0X"))), Self::match_base_16_int_trailing_digits, ) .map(|(maybe_sign, digits)| { @@ -801,7 +819,7 @@ impl<'data> TextBufferView<'data> { fn match_base_16_int_trailing_digits(self) -> IonMatchResult<'data> { recognize(terminated( // Zero or more digits-followed-by-underscores - many0_count(pair(Self::take_base_16_digits1, char('_'))), + many0_count(pair(Self::take_base_16_digits1, complete_tag("_"))), // One or more digits Self::take_base_16_digits1, ))(self) @@ -810,35 +828,53 @@ impl<'data> TextBufferView<'data> { /// Recognizes 1 or more consecutive base-16 digits. // This function's "1" suffix is a style borrowed from `nom`. fn take_base_16_digits1(self) -> IonMatchResult<'data> { - take_while1(|b: u8| b.is_ascii_hexdigit())(self) + complete_take_while1(|b: u8| b.is_ascii_hexdigit())(self) + } + + /// Matches `n` consecutive hex digits. + pub(crate) fn match_n_hex_digits( + count: usize, + ) -> impl Parser, TextBufferView<'data>, IonParseError<'data>> { + // `fold_many_m_n` allows us to repeat the same parser between 'm' and 'n' times, + // specifying an operation to perform on each match. In our case, we just need the parser + // to run 'n' times exactly so `recognize` can return the accepted slice; our operation + // is a no-op. + recognize(fold_many_m_n( + count, + count, + satisfy(|c| c.is_ascii_hexdigit()), + || 0, + // no-op + |accum, _item| accum, + )) } /// Matches an Ion float of any syntax fn match_float(self) -> IonParseResult<'data, MatchedFloat> { - alt(( - Self::match_float_special_value, - Self::match_float_numeric_value, - ))(self) + terminated( + alt(( + Self::match_float_special_value, + Self::match_float_numeric_value, + )), + Self::peek_stop_character, + )(self) } /// Matches special IEEE-754 values, including +/- infinity and NaN. 
fn match_float_special_value(self) -> IonParseResult<'data, MatchedFloat> { alt(( - value(MatchedFloat::NotANumber, tag("nan")), - value(MatchedFloat::PositiveInfinity, tag("+inf")), - value(MatchedFloat::NegativeInfinity, tag("-inf")), + value(MatchedFloat::NotANumber, complete_tag("nan")), + value(MatchedFloat::PositiveInfinity, complete_tag("+inf")), + value(MatchedFloat::NegativeInfinity, complete_tag("-inf")), ))(self) } /// Matches numeric IEEE-754 floating point values. fn match_float_numeric_value(self) -> IonParseResult<'data, MatchedFloat> { - terminated( - recognize(pair( - Self::match_number_with_optional_dot_and_digits, - Self::match_float_exponent_marker_and_digits, - )), - Self::peek_stop_character, - ) + recognize(pair( + Self::match_number_with_optional_dot_and_digits, + Self::match_float_exponent_marker_and_digits, + )) .map(|_matched| MatchedFloat::Numeric) .parse(self) } @@ -851,7 +887,7 @@ impl<'data> TextBufferView<'data> { /// -25.2 fn match_number_with_optional_dot_and_digits(self) -> IonMatchResult<'data> { recognize(tuple(( - opt(tag("-")), + opt(complete_tag("-")), Self::match_base_10_digits_before_dot, opt(Self::match_dot_followed_by_base_10_digits), )))(self) @@ -861,7 +897,7 @@ impl<'data> TextBufferView<'data> { /// This includes either a single zero, or a non-zero followed by any sequence of digits. fn match_digits_before_dot(self) -> IonMatchResult<'data> { alt(( - tag("0"), + complete_tag("0"), recognize(pair(Self::match_leading_digit, Self::match_trailing_digits)), ))(self) } @@ -873,21 +909,27 @@ impl<'data> TextBufferView<'data> { /// Matches any number of base 10 digits, allowing underscores at any position except the end. fn match_trailing_digits(self) -> IonMatchResult<'data> { - recognize(many0_count(preceded(opt(char('_')), digit1)))(self) + recognize(many0_count(preceded( + opt(complete_char('_')), + complete_digit1, + )))(self) } /// Recognizes a decimal point followed by any number of base-10 digits. fn match_dot_followed_by_base_10_digits(self) -> IonMatchResult<'data> { - recognize(preceded(tag("."), opt(Self::match_digits_after_dot)))(self) + recognize(preceded( + complete_tag("."), + opt(Self::match_digits_after_dot), + ))(self) } /// Like `match_digits_before_dot`, but allows leading zeros. fn match_digits_after_dot(self) -> IonMatchResult<'data> { recognize(terminated( // Zero or more digits-followed-by-underscores - many0_count(pair(digit1, char('_'))), + many0_count(pair(complete_digit1, complete_char('_'))), // One or more digits - digit1, + complete_digit1, ))(self) } @@ -895,7 +937,7 @@ impl<'data> TextBufferView<'data> { /// base 10 digits. fn match_float_exponent_marker_and_digits(self) -> IonMatchResult<'data> { preceded( - one_of("eE"), + complete_one_of("eE"), recognize(Self::match_exponent_sign_and_digits), )(self) } @@ -921,45 +963,50 @@ impl<'data> TextBufferView<'data> { /// /// This is used for matching exponent signs; most places in Ion do not allow `+`. pub fn match_any_sign(self) -> IonParseResult<'data, char> { - one_of("-+")(self) + complete_one_of("-+")(self) } pub fn match_decimal_exponent(self) -> IonParseResult<'data, (bool, TextBufferView<'data>)> { - preceded(one_of("dD"), Self::match_exponent_sign_and_digits)(self) + preceded(complete_one_of("dD"), Self::match_exponent_sign_and_digits)(self) } /// Match an optional sign (if present), digits before the decimal point, then digits after the /// decimal point (if present). 
pub fn match_decimal(self) -> IonParseResult<'data, MatchedDecimal> { - tuple(( - opt(tag("-")), - Self::match_digits_before_dot, - alt(( - // Either a decimal point and digits and optional d/D and exponent - preceded( - tag("."), - pair( - alt((Self::match_digits_after_dot, Self::match_nothing)), + terminated( + tuple(( + opt(complete_tag("-")), + Self::match_digits_before_dot, + alt(( + // Either a decimal point and digits and optional d/D and exponent + tuple(( + complete_tag("."), + opt(Self::match_digits_after_dot), opt(Self::match_decimal_exponent), - ), - ) - .map(|(digits_after_dot, maybe_exponent)| { - let (exp_is_negative, exp_digits) = match maybe_exponent { - Some(exponent) => exponent, - None => (false, digits_after_dot.slice(digits_after_dot.len(), 0)), - }; - (digits_after_dot, exp_is_negative, exp_digits) - }), - // or just a d/D and exponent - consumed(Self::match_decimal_exponent).map( - |(matched, (exp_is_negative, exp_digits))| { - // Make an empty slice to represent the (absent) digits after dot - let digits_after_dot = matched.slice(0, 0); + )) + .map(|(dot, maybe_digits_after_dot, maybe_exponent)| { + let digits_after_dot = match maybe_digits_after_dot { + Some(digits) => digits, + None => dot.slice(1, 0), + }; + let (exp_is_negative, exp_digits) = match maybe_exponent { + Some(exponent) => exponent, + None => (false, digits_after_dot.slice(digits_after_dot.len(), 0)), + }; (digits_after_dot, exp_is_negative, exp_digits) - }, - ), + }), + // or just a d/D and exponent + consumed(Self::match_decimal_exponent).map( + |(matched, (exp_is_negative, exp_digits))| { + // Make an empty slice to represent the (absent) digits after dot + let digits_after_dot = matched.slice(0, 0); + (digits_after_dot, exp_is_negative, exp_digits) + }, + ), + )), )), - )) + Self::peek_stop_character, + ) .map( |(maybe_sign, leading_digits, (digits_after_dot, exponent_is_negative, exp_digits))| { let is_negative = maybe_sign.is_some(); @@ -971,14 +1018,18 @@ impl<'data> TextBufferView<'data> { (leading_digits.len() + 1 + trailing_digits_length) as u16 } }; - let trailing_digits_length = digits_after_dot.len() as u16; + let num_trailing_digits = digits_after_dot + .bytes() + .iter() + .filter(|b| b.is_ascii_digit()) + .count() as u16; let exponent_digits_offset = (exp_digits.offset() - self.offset()) as u16; let exponent_digits_length = exp_digits.len() as u16; MatchedDecimal::new( is_negative, digits_offset, digits_length, - trailing_digits_length, + num_trailing_digits, exponent_is_negative, exponent_digits_offset, exponent_digits_length, @@ -1009,7 +1060,7 @@ impl<'data> TextBufferView<'data> { /// Returns a matched buffer and a boolean indicating whether any escaped characters were /// found in the short string. pub(crate) fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> { - Self::match_text_until_unescaped(self, b'\"') + Self::match_text_until_unescaped(self, b'\"', false) } /// Matches a long string comprised of any number of `'''`-enclosed segments interleaved @@ -1041,7 +1092,11 @@ impl<'data> TextBufferView<'data> { /// Matches a single long string segment enclosed by `'''` delimiters. pub fn match_long_string_segment(self) -> IonParseResult<'data, (Self, bool)> { - delimited(tag("'''"), Self::match_long_string_segment_body, tag("'''"))(self) + delimited( + complete_tag("'''"), + Self::match_long_string_segment_body, + complete_tag("'''"), + )(self) } /// Matches all input up to (but not including) the first unescaped instance of `'''`. 
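Note: most of the combinator swaps in this file replace `nom::bytes::streaming` / `nom::character::streaming` parsers with their `complete` counterparts for tokens that are matched in full or not at all. The difference is easy to see in isolation; the snippet below uses plain nom 7 with no crate types, and `main` is only a scaffold for the two assertions.

    use nom::bytes::{complete, streaming};
    use nom::error::Error;
    use nom::IResult;

    fn main() {
        // The streaming tag can't rule out that more input would complete the match,
        // so it reports `Incomplete` and asks for more data...
        let streaming_result: IResult<&str, &str, Error<&str>> = streaming::tag("null")("nul");
        assert!(matches!(streaming_result, Err(nom::Err::Incomplete(_))));

        // ...while the complete tag treats the buffer as the whole input and fails outright.
        let complete_result: IResult<&str, &str, Error<&str>> = complete::tag("null")("nul");
        assert!(matches!(complete_result, Err(nom::Err::Error(_))));
    }

This is why fixed keywords and delimiters (`null`, `nan`, `T`, `::`, `'''`, the timestamp separators) now use `complete_tag`/`complete_one_of`/`complete_char`, while genuinely open-ended matches such as `take_until("*/")` keep their streaming forms.
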
@@ -1079,7 +1134,7 @@ impl<'data> TextBufferView<'data> { // Note that symbol ID integers: // * CANNOT have underscores in them. For example: `$1_0` is considered an identifier. // * CAN have leading zeros. There's precedent for this in ion-java. - preceded(tag("$"), digit1), + preceded(tag("$"), complete_digit1), // Peek at the next character to make sure it's unrelated to the symbol ID. // The spec does not offer a formal definition of what ends a symbol ID. // This checks for either a stop_character (which performs its own `peek()`) @@ -1103,7 +1158,7 @@ impl<'data> TextBufferView<'data> { Self::identifier_initial_character, Self::identifier_trailing_characters, ), - not(Self::identifier_trailing_character), + Self::identifier_terminator, ))(self)?; // Ion defines a number of keywords that are syntactically indistinguishable from // identifiers. Keywords take precedence; we must ensure that any identifier we find @@ -1125,6 +1180,13 @@ impl<'data> TextBufferView<'data> { Ok((remaining, MatchedSymbol::Identifier)) } + fn identifier_terminator(self) -> IonMatchResult<'data> { + alt(( + eof, + recognize(peek(not(Self::identifier_trailing_character))), + ))(self) + } + /// Matches any character that can appear at the start of an identifier. fn identifier_initial_character(self) -> IonParseResult<'data, Self> { recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphabetic()))))(self) @@ -1137,7 +1199,7 @@ impl<'data> TextBufferView<'data> { /// Matches characters that are legal in an identifier, though not necessarily at the beginning. fn identifier_trailing_characters(self) -> IonParseResult<'data, Self> { - recognize(many0_count(Self::identifier_trailing_character))(self) + complete_take_while(|c: u8| c.is_ascii_alphanumeric() || b"$_".contains(&c))(self) } /// Matches a quoted symbol (`'foo'`). @@ -1156,41 +1218,76 @@ impl<'data> TextBufferView<'data> { /// Returns a matched buffer and a boolean indicating whether any escaped characters were /// found in the short string. fn match_quoted_symbol_body(self) -> IonParseResult<'data, (Self, bool)> { - Self::match_text_until_unescaped(self, b'\'') + Self::match_text_until_unescaped(self, b'\'', false) } /// A helper method for matching bytes until the specified delimiter. Ignores any byte /// (including the delimiter) that is prefaced by the escape character `\`. - fn match_text_until_unescaped(self, delimiter: u8) -> IonParseResult<'data, (Self, bool)> { - let mut is_escaped = false; + fn match_text_until_unescaped( + self, + delimiter: u8, + allow_unescaped_newlines: bool, + ) -> IonParseResult<'data, (Self, bool)> { let mut contains_escaped_chars = false; - for (index, byte) in self.bytes().iter().enumerate() { - if is_escaped { - // If we're escaped, the previous byte was a \ and we ignore this one. - is_escaped = false; - continue; - } - if *byte == b'\\' { - is_escaped = true; - contains_escaped_chars = true; - continue; - } - if *byte == b'\r' { - // If the text contains an unescaped carriage return, we may need to normalize it. - // In some narrow cases, setting this flag to true may result in a sanitization buffer - // being allocated when it isn't strictly necessary. + // This de-sugared syntax allows us to modify `iter` mid-loop. + let mut iter = self.bytes().iter().copied().enumerate(); + while let Some((index, byte)) = iter.next() { + if byte == b'\\' { + // It's an escape sequence. For the purposes of finding the end delimiter, we can + // skip the next 1 byte unless this is \r\n, in which case we need to skip two. 
+ // Other escape sequences that are followed by more than one byte (e.g. \u and \U) + // are always followed by ASCII letters, which aren't used as delimiters. contains_escaped_chars = true; + // Peek at the next two bytes to see if this is a \r\n + let next_two_bytes = self.bytes().get(index + 1..index + 3); + let bytes_to_skip = if next_two_bytes == Some(&[b'\r', b'\n']) { + 2 + } else { + 1 + }; + // Eagerly skip the next iterator values + let _ = iter.nth(bytes_to_skip - 1); continue; } - if *byte == delimiter { + if byte == delimiter { let matched = self.slice(0, index); let remaining = self.slice_to_end(index); return Ok((remaining, (matched, contains_escaped_chars))); } + // If this is a control character, make sure it's a legal one. + if byte < 0x20 { + if byte == b'\r' { + // Carriage returns are not actual escapes, but do require a substitution + // as part of newline normalization when the string is read. + contains_escaped_chars = true; + } else { + self.validate_string_control_character(byte, index, allow_unescaped_newlines)?; + } + } } Err(nom::Err::Incomplete(Needed::Unknown)) } + #[cold] + fn validate_string_control_character( + self, + byte: u8, + index: usize, + allow_unescaped_newlines: bool, + ) -> IonParseResult<'data, ()> { + if byte == b'\n' && !allow_unescaped_newlines { + let error = InvalidInputError::new(self.slice_to_end(index)) + .with_description("unescaped newlines are not allowed in short string literals"); + return Err(nom::Err::Failure(IonParseError::Invalid(error))); + } + if !WHITESPACE_CHARACTERS_AS_STR.as_bytes().contains(&byte) { + let error = InvalidInputError::new(self.slice_to_end(index)) + .with_description("unescaped control characters are not allowed in text literals"); + return Err(nom::Err::Failure(IonParseError::Invalid(error))); + } + Ok((self.slice_to_end(1), ())) + } + /// A helper method for matching bytes until the specified delimiter. Ignores any byte /// that is prefaced by the escape character `\`. /// @@ -1211,7 +1308,7 @@ impl<'data> TextBufferView<'data> { // `match_text_until_escaped` does NOT include the delimiter byte in the match, // so `remaining_after_match` starts at the delimiter byte. 
let (remaining_after_match, (_, segment_contained_escapes)) = - remaining.match_text_until_unescaped(delimiter_head)?; + remaining.match_text_until_unescaped(delimiter_head, true)?; contained_escapes |= segment_contained_escapes; remaining = remaining_after_match; @@ -1249,7 +1346,7 @@ impl<'data> TextBufferView<'data> { fn match_timestamp_y(self) -> IonParseResult<'data, MatchedTimestamp> { terminated( Self::match_timestamp_year, - pair(tag("T"), Self::peek_stop_character), + pair(complete_tag("T"), Self::peek_stop_character), ) .map(|_year| MatchedTimestamp::new(TimestampPrecision::Year)) .parse(self) @@ -1259,7 +1356,7 @@ impl<'data> TextBufferView<'data> { fn match_timestamp_ym(self) -> IonParseResult<'data, MatchedTimestamp> { terminated( pair(Self::match_timestamp_year, Self::match_timestamp_month), - pair(tag("T"), Self::peek_stop_character), + pair(complete_tag("T"), Self::peek_stop_character), ) .map(|(_year, _month)| MatchedTimestamp::new(TimestampPrecision::Month)) .parse(self) @@ -1273,7 +1370,7 @@ impl<'data> TextBufferView<'data> { Self::match_timestamp_month, Self::match_timestamp_day, )), - pair(opt(tag("T")), Self::peek_stop_character), + pair(opt(complete_tag("T")), Self::peek_stop_character), ) .map(|_| MatchedTimestamp::new(TimestampPrecision::Day)) .parse(self) @@ -1344,10 +1441,10 @@ impl<'data> TextBufferView<'data> { /// Matches the month component of a timestamp, including a leading `-`. fn match_timestamp_month(self) -> IonMatchResult<'data> { preceded( - tag("-"), + complete_tag("-"), recognize(alt(( - pair(char('0'), one_of("123456789")), - pair(char('1'), one_of("012")), + pair(complete_char('0'), complete_one_of("123456789")), + pair(complete_char('1'), complete_one_of("012")), ))), )(self) } @@ -1355,11 +1452,11 @@ impl<'data> TextBufferView<'data> { /// Matches the day component of a timestamp, including a leading `-`. fn match_timestamp_day(self) -> IonMatchResult<'data> { preceded( - tag("-"), + complete_tag("-"), recognize(alt(( - pair(char('0'), one_of("123456789")), - pair(one_of("12"), Self::match_any_digit), - pair(char('3'), one_of("01")), + pair(complete_char('0'), complete_one_of("123456789")), + pair(complete_one_of("12"), Self::match_any_digit), + pair(complete_char('3'), complete_one_of("01")), ))), )(self) } @@ -1374,13 +1471,13 @@ impl<'data> TextBufferView<'data> { separated_pair( // Hour recognize(alt(( - pair(one_of("01"), Self::match_any_digit), - pair(char('2'), one_of("0123")), + pair(complete_one_of("01"), Self::match_any_digit), + pair(complete_char('2'), complete_one_of("0123")), ))), // Delimiter - tag(":"), + complete_tag(":"), // Minutes - recognize(pair(one_of("012345"), Self::match_any_digit)), + recognize(pair(complete_one_of("012345"), Self::match_any_digit)), ), )(self) } @@ -1388,24 +1485,27 @@ impl<'data> TextBufferView<'data> { /// Matches a leading `:`, and any two-digit second component from `00` to `59` inclusive. fn match_timestamp_seconds(self) -> IonMatchResult<'data> { preceded( - tag(":"), - recognize(pair(one_of("012345"), Self::match_any_digit)), + complete_tag(":"), + recognize(pair(complete_one_of("012345"), Self::match_any_digit)), )(self) } /// Matches the fractional seconds component of a timestamp, including a leading `.`. fn match_timestamp_fractional_seconds(self) -> IonMatchResult<'data> { - preceded(tag("."), digit1)(self) + preceded(complete_tag("."), digit1)(self) } /// Matches a timestamp offset of any format. 
fn match_timestamp_offset(self) -> IonParseResult<'data, MatchedTimestampOffset> { alt(( - value(MatchedTimestampOffset::Zulu, tag("Z")), - value(MatchedTimestampOffset::Zulu, tag("+00:00")), - value(MatchedTimestampOffset::Unknown, tag("-00:00")), + value(MatchedTimestampOffset::Zulu, complete_tag("Z")), + value(MatchedTimestampOffset::Zulu, complete_tag("+00:00")), + value(MatchedTimestampOffset::Unknown, complete_tag("-00:00")), map( - pair(one_of("-+"), Self::match_timestamp_offset_hours_and_minutes), + pair( + complete_one_of("-+"), + Self::match_timestamp_offset_hours_and_minutes, + ), |(sign, (hours, _minutes))| { let is_negative = sign == '-'; let hours_offset = hours.offset(); @@ -1424,13 +1524,13 @@ impl<'data> TextBufferView<'data> { separated_pair( // Hour recognize(alt(( - pair(one_of("01"), Self::match_any_digit), - pair(char('2'), one_of("0123")), + pair(complete_one_of("01"), Self::match_any_digit), + pair(complete_char('2'), complete_one_of("0123")), ))), // Delimiter - tag(":"), + complete_tag(":"), // Minutes - recognize(pair(one_of("012345"), Self::match_any_digit)), + recognize(pair(complete_one_of("012345"), Self::match_any_digit)), )(self) } @@ -1489,7 +1589,7 @@ impl<'data> TextBufferView<'data> { fn validate_clob_text(self) -> IonMatchResult<'data> { for byte in self.bytes().iter().copied() { if !Self::byte_is_legal_clob_ascii(byte) { - let message = format!("found an illegal byte '{:0x}'in clob", byte); + let message = format!("found an illegal byte '{:0x}' in clob", byte); let error = InvalidInputError::new(self).with_description(message); return Err(nom::Err::Failure(IonParseError::Invalid(error))); } @@ -1759,7 +1859,7 @@ mod tests { /// contents of the input are considered a complete token. fn new(input: &str) -> Self { MatchTest { - input: format!("{input}\n0"), // add whitespace and a trailing value + input: input.to_string(), } } @@ -1776,11 +1876,12 @@ mod tests { P: Parser, O, IonParseError<'data>>, { let result = self.try_match(parser); - let (_remaining, match_length) = result.unwrap(); + let (_remaining, match_length) = result + .unwrap_or_else(|_| panic!("Unexpected parse fail for input '{}'", self.input)); // Inputs have a trailing newline and `0` that should _not_ be part of the match assert_eq!( match_length, - self.input.len() - 2, + self.input.len(), "\nInput: '{}'\nMatched: '{}'\n", self.input, &self.input[..match_length] @@ -1797,8 +1898,8 @@ mod tests { if let Ok((_remaining, match_length)) = result { assert_ne!( match_length, - self.input.len() - 1, - "parser unexpectedly matched the complete input: '{:?}\nResult: {:?}", + self.input.len(), + "parser unexpectedly matched the complete input: {:?}\nResult: {:?}", self.input, result ); @@ -1809,6 +1910,7 @@ mod tests { #[test] fn test_match_stop_char() { MatchTest::new(" ").expect_match(match_length(TextBufferView::match_stop_character)); + MatchTest::new("").expect_match(match_length(TextBufferView::match_stop_character)); } #[test] @@ -1961,7 +2063,7 @@ mod tests { let good_inputs = &[ "0.0e0", "0E0", "0e0", "305e1", "305e+1", "305e-1", "305e100", "305e-100", "305e+100", - "305.0e1", "0.279e3", "279e0", "279.5e0", "279.5E0", + "305.0e1", "0.279e3", "0.279e-3", "279e0", "279.5e0", "279.5E0", ]; for input in good_inputs { match_float(input); @@ -2147,7 +2249,7 @@ mod tests { match_annotated_value(input); } - let bad_inputs = &["foo", "foo:bar", "foo:::bar"]; + let bad_inputs = &["foo::", "foo:bar", "foo:::bar"]; for input in bad_inputs { mismatch_annotated_value(input); } @@ -2169,7 +2271,8 @@ 
mod tests { } let bad_inputs = &[ - "5", "5d", "05d", "-5d", "5.d", "-5.d", "5.D", "-5.D", "-5.0+0", + "123._456", "5", "5d", "05d", "-5d", "5.d", "-5.d", "5.D", "-5.D", "5.1d", "-5.1d", + "5.1D", "-5.1D", "-5.0+0", ]; for input in bad_inputs { mismatch_decimal(input); } diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index 970c0aa7..066d4ab0 100644 --- a/src/lazy/text/encoded_value.rs +++ b/src/lazy/text/encoded_value.rs @@ -1,5 +1,5 @@ use crate::lazy::text::buffer::TextBufferView; -use crate::lazy::text::matched::{MatchedSymbol, MatchedValue}; +use crate::lazy::text::matched::{MatchedFieldName, MatchedValue}; use crate::result::IonFailure; use crate::{IonResult, IonType, RawSymbolTokenRef}; use std::ops::Range; @@ -63,7 +63,7 @@ pub(crate) struct EncodedTextValue { // recognized during matching and partial information like subfield offsets can be stored here. matched_value: MatchedValue, - field_name_syntax: Option<MatchedSymbol>, + field_name_syntax: Option<MatchedFieldName>, } impl EncodedTextValue { @@ -93,7 +93,7 @@ impl EncodedTextValue { // $10 pub(crate) fn with_field_name( mut self, - field_name_syntax: MatchedSymbol, + field_name_syntax: MatchedFieldName, offset: usize, length: usize, ) -> EncodedTextValue { @@ -203,7 +203,7 @@ impl EncodedTextValue { self.data_length + u32::max(self.annotations_offset, self.field_name_offset) as usize } - pub fn field_name_syntax(&self) -> Option<MatchedSymbol> { + pub fn field_name_syntax(&self) -> Option<MatchedFieldName> { self.field_name_syntax } @@ -215,6 +215,7 @@ impl EncodedTextValue { #[cfg(test)] mod tests { use super::*; + use crate::lazy::text::matched::MatchedSymbol; #[test] fn total_length_data_only() { @@ -225,7 +226,7 @@ mod tests { #[test] fn total_length_data_with_field_name() { let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) - .with_field_name(MatchedSymbol::Identifier, 90, 4); + .with_field_name(MatchedFieldName::Symbol(MatchedSymbol::Identifier), 90, 4); assert_eq!(value.total_length(), 22); } @@ -239,13 +240,13 @@ mod tests { #[test] fn total_length_data_with_field_name_and_annotations() { let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) - .with_field_name(MatchedSymbol::Identifier, 90, 4) + .with_field_name(MatchedFieldName::Symbol(MatchedSymbol::Identifier), 90, 4) .with_annotations_sequence(94, 6); assert_eq!(value.total_length(), 22); // Same test but with extra whitespace between the components let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) - .with_field_name(MatchedSymbol::Identifier, 80, 4) + .with_field_name(MatchedFieldName::Symbol(MatchedSymbol::Identifier), 80, 4) .with_annotations_sequence(91, 6); assert_eq!(value.total_length(), 32, "{:?}", value); } diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index abd7c233..7c90df62 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -23,9 +23,11 @@ use std::borrow::Cow; use std::num::IntErrorKind; use std::str::FromStr; +use nom::branch::alt; +use nom::bytes::streaming::tag; use nom::character::is_hex_digit; use nom::sequence::preceded; -use nom::AsChar; +use nom::{AsChar, Parser}; use num_bigint::{BigInt, BigUint}; use num_traits::Num; use smallvec::SmallVec; @@ -61,6 +63,26 @@ pub(crate) enum MatchedValue { Struct, } +#[derive(Copy, Clone, Debug, PartialEq)] +pub(crate) enum MatchedFieldName { + Symbol(MatchedSymbol), + String(MatchedString), +} + +impl MatchedFieldName { + pub fn read<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult<RawSymbolTokenRef<'data>> { + match self {
MatchedFieldName::Symbol(matched_symbol) => matched_symbol.read(matched_input), + MatchedFieldName::String(matched_string) => { + matched_string.read(matched_input).map(|s| s.into()) + } + } + } +} + /// A partially parsed Ion int. #[derive(Copy, Clone, Debug, PartialEq)] pub(crate) struct MatchedInt { @@ -128,7 +150,11 @@ impl MatchedInt { } }; - Ok(int) + if self.is_negative { + Ok(-int) + } else { + Ok(int) + } } } @@ -178,7 +204,7 @@ pub(crate) struct MatchedDecimal { is_negative: bool, digits_offset: u16, digits_length: u16, - trailing_digits_length: u16, + num_trailing_digits: u16, exponent_is_negative: bool, exponent_digits_offset: u16, exponent_digits_length: u16, @@ -192,7 +218,7 @@ impl MatchedDecimal { is_negative: bool, digits_offset: u16, digits_length: u16, - trailing_digits_length: u16, + num_trailing_digits: u16, exponent_is_negative: bool, exponent_offset: u16, exponent_length: u16, @@ -201,7 +227,7 @@ impl MatchedDecimal { is_negative, digits_offset, digits_length, - trailing_digits_length, + num_trailing_digits, exponent_is_negative, exponent_digits_offset: exponent_offset, exponent_digits_length: exponent_length, @@ -273,7 +299,7 @@ impl MatchedDecimal { } }; - exponent -= self.trailing_digits_length as i64; + exponent -= self.num_trailing_digits as i64; Ok(Decimal::new(coefficient, exponent)) } @@ -365,7 +391,7 @@ impl MatchedString { // This is the same parser that matched the input initially, which means that the only // reason it wouldn't succeed here is if the input is empty, meaning we're done reading. while let Ok((remaining_after_match, (segment_body, _has_escapes))) = preceded( - TextBufferView::match_optional_whitespace, + TextBufferView::match_optional_comments_and_whitespace, TextBufferView::match_long_string_segment, )(remaining) { @@ -519,25 +545,36 @@ fn decode_escape_into_bytes<'data>( b'b' => 0x08u8, // backspace b'v' => 0x0Bu8, // vertical tab b'f' => 0x0Cu8, // form feed - // If the byte following the '\' is a real newline (that is: 0x0A), we discard it. - b'\n' => return Ok(input_after_escape), + // If the bytes following the '\' are an unescaped CR/LF, discard both. + b'\r' if input_after_escape.bytes().first() == Some(&b'\n') => { + return Ok(input_after_escape.slice_to_end(1)) + } + // If the next byte is a CR or LF, discard it. + b'\r' | b'\n' => return Ok(input_after_escape), // These cases require more sophisticated parsing, not just a 1-to-1 mapping of bytes - b'x' => return decode_hex_digits_escape(2, input_after_escape, sanitized), - // Clobs represent text of some encoding, but it may or may not be a flavor of Unicode. - // As such, clob syntax does not support Unicode escape sequences like `\u` or `\U`. 
- b'u' if support_unicode_escapes => { - return decode_hex_digits_escape(4, input_after_escape, sanitized) + b'x' => { + return decode_hex_digits_escape( + 2, + input_after_escape, + sanitized, + support_unicode_escapes, + ) } - b'U' if support_unicode_escapes => { - return decode_hex_digits_escape(8, input_after_escape, sanitized) + b'u' => { + return decode_hex_digits_escape( + 4, + input_after_escape, + sanitized, + support_unicode_escapes, + ) } - b'u' | b'U' => { - return Err(IonError::Decoding( - DecodingError::new( - "Unicode escape sequences (\\u, \\U) are not legal in this context", - ) - .with_position(input.offset()), - )) + b'U' => { + return decode_hex_digits_escape( + 8, + input_after_escape, + sanitized, + support_unicode_escapes, + ) } _ => { return Err(IonError::Decoding( @@ -557,6 +594,7 @@ fn decode_hex_digits_escape<'data>( num_digits: usize, input: TextBufferView<'data>, sanitized: &mut Vec<u8>, + support_unicode_escapes: bool, ) -> IonResult<TextBufferView<'data>> { if input.len() < num_digits { return Err(IonError::Decoding( @@ -569,6 +607,15 @@ fn decode_hex_digits_escape<'data>( )); } + // Clobs represent text of some encoding, but it may or may not be a flavor of Unicode. + // As such, clob syntax does not support Unicode escape sequences like `\u` or `\U`. + if num_digits != 2 && !support_unicode_escapes { + return Err(IonError::Decoding( + DecodingError::new("Unicode escape sequences (\\u, \\U) are not legal in this context") + .with_position(input.offset()), + )); + } + let hex_digit_bytes = &input.bytes()[..num_digits]; let all_are_hex_digits = hex_digit_bytes @@ -588,38 +635,28 @@ fn decode_hex_digits_escape<'data>( // Isolate the portion of the input that follows the hex digits so we can return it. let remaining_input = input.slice_to_end(num_digits); - // We just confirmed all of the digits are ASCII hex digits, so these steps cannot fail. - // We can unwrap() in each case. + // We just confirmed all of the digits are ASCII hex digits, so this step cannot fail. let hex_digits = std::str::from_utf8(hex_digit_bytes).unwrap(); - // If this was a '\x' escape, we cannot interpret the hex digits as a Unicode scalar. We treat - // it as a byte literal instead. - if num_digits == 2 { - let byte = u8::from_str_radix(hex_digits, 16).unwrap(); - sanitized.push(byte); + + if !support_unicode_escapes { + // Inside a clob, \x is a byte literal, not a Unicode code point. + let byte_literal = u8::from_str_radix(hex_digits, 16).unwrap(); + sanitized.push(byte_literal); return Ok(remaining_input); } - // From here on, we know that the escape was either `\u` or `\U`--a Unicode scalar. - // Note that this means we are not processing a clob (which doesn't support Unicode) and can - // further infer that we are working with UTF-8, the only supported encoding for strings/symbols. let code_point = u32::from_str_radix(hex_digits, 16).unwrap(); // Check to see if this is a high surrogate; if it is, our code point isn't complete. Another // unicode escape representing the low surrogate has to be next in the input to complete it. - // See the docs for this helper function for details. (Note: this will only ever be true for - // 4- and 8-digit escape sequences. `\x` escapes don't have enough digits to represent a - // high surrogate.) + // See the docs for the `code_point_is_a_high_surrogate` helper function for details. + // (Note: this will only ever be true for 4- and 8-digit escape sequences. `\x` escapes don't + // have enough digits to represent a high surrogate.)
if code_point_is_a_high_surrogate(code_point) { // The spec has MAY-style language around supporting high surrogates. Supporting them is - // allowed but discouraged. For the time being, we will return an error. Other implementations - // (notably ion-java) support high surrogates largely for resilience/debugging. We can consider - // adding that support if there is demand for it. - return Err(IonError::Decoding( - DecodingError::new( - "found a Unicode high surrogate; UTF-16 is not legal in Ion strings/symbols", - ) - .with_position(input.offset()), - )); + // allowed but discouraged. The ion-tests spec conformance tests include cases with UTF-16 + // surrogates, so ion-rust supports them. + return complete_surrogate_pair(sanitized, code_point, remaining_input); } // A Rust `char` can represent any Unicode scalar value--a code point that is not part of a @@ -634,6 +671,57 @@ fn decode_hex_digits_escape<'data>( Ok(remaining_input) } +/// Reads another escaped code point from the buffer, treating it as the low surrogate to be paired +/// with the specified high surrogate. Appends the UTF-8 encoding of the resulting Unicode scalar +/// to `sanitized` and returns the remaining text in the buffer. +fn complete_surrogate_pair<'data>( + sanitized: &mut Vec<u8>, + high_surrogate: u32, + input: TextBufferView<'data>, +) -> IonResult<TextBufferView<'data>> { + let mut match_next_codepoint = preceded( + tag("\\"), + alt(( + preceded(tag("x"), TextBufferView::match_n_hex_digits(2)), + preceded(tag("u"), TextBufferView::match_n_hex_digits(4)), + preceded(tag("U"), TextBufferView::match_n_hex_digits(8)), + )), + ); + let (remaining, hex_digits) = match match_next_codepoint.parse(input) { + Ok((remaining, hex_digits)) => (remaining, hex_digits), + Err(_) => { + return { + let error = + DecodingError::new("found a high surrogate not followed by a low surrogate") + .with_position(input.offset()); + Err(IonError::Decoding(error)) + } + } + }; + let high_surrogate = high_surrogate as u16; + + let hex_digits = std::str::from_utf8(hex_digits.bytes()).unwrap(); + let low_surrogate = u16::from_str_radix(hex_digits, 16).map_err(|_| { + let error = + DecodingError::new("low surrogate did not fit in a u16").with_position(input.offset()); + IonError::Decoding(error) + })?; + + let character = char::decode_utf16([high_surrogate, low_surrogate]) + .next() + .unwrap() + .map_err(|_| { + let error = DecodingError::new("encountered invalid surrogate pair") + .with_position(input.offset()); + IonError::Decoding(error) + })?; + + let utf8_buffer: &mut [u8; 4] = &mut [0; 4]; + let utf8_encoded = character.encode_utf8(utf8_buffer); + sanitized.extend_from_slice(utf8_encoded.as_bytes()); + Ok(remaining) +} + /// Returns `true` if the provided code point is a utf-16 high surrogate. /// /// Terse primer: Unicode text is made up of a stream of unsigned integers called 'code points'.
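(Aside, not part of the patch: a minimal standalone sketch of the std machinery the new `complete_surrogate_pair` helper relies on, pairing two UTF-16 surrogates back into a Unicode scalar and re-encoding it as UTF-8. The surrogate constants for U+1F600 are illustrative only.)

    // Sketch: combine a UTF-16 surrogate pair into a Unicode scalar, then re-encode
    // it as UTF-8 the way the sanitizing escape decoder appends it to its buffer.
    fn main() {
        // U+1F600 (😀) split into its UTF-16 high and low surrogates.
        let high_surrogate: u16 = 0xD83D;
        let low_surrogate: u16 = 0xDE00;

        // `char::decode_utf16` pairs the surrogates; an unpaired or reversed
        // surrogate yields an error instead of a char.
        let character = char::decode_utf16([high_surrogate, low_surrogate])
            .next()
            .unwrap()
            .expect("valid surrogate pair");
        assert_eq!(character, '😀');

        // The sanitized buffer stores UTF-8, so the scalar is re-encoded before
        // being appended.
        let mut utf8_buffer = [0u8; 4];
        let utf8_encoded = character.encode_utf8(&mut utf8_buffer);
        assert_eq!(utf8_encoded.as_bytes(), "😀".as_bytes());
    }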
@@ -1050,9 +1138,48 @@ impl MatchedClob { #[cfg(test)] mod tests { + use std::str::FromStr; + + use num_bigint::BigInt; + use crate::lazy::bytes_ref::BytesRef; use crate::lazy::text::buffer::TextBufferView; - use crate::{Decimal, IonResult, Timestamp}; + use crate::{Decimal, Int, IonResult, Timestamp}; + + #[test] + fn read_ints() -> IonResult<()> { + fn expect_int(data: &str, expected: impl Into<Int>) { + let expected: Int = expected.into(); + let buffer = TextBufferView::new(data.as_bytes()); + let (_remaining, matched) = buffer.match_int().unwrap(); + let actual = matched.read(buffer).unwrap(); + assert_eq!( + actual, expected, + "Actual didn't match expected for input '{}'.\n{:?}\n!=\n{:?}", + data, actual, expected + ); + } + + let tests = [ + ("-5", Int::from(-5)), + ("0", Int::from(0)), + ( + "1234567890_1234567890_1234567890_1234567890", + Int::from(BigInt::from_str("1234567890_1234567890_1234567890_1234567890").unwrap()), + ), + ( + "-1234567890_1234567890_1234567890_1234567890", + Int::from( + BigInt::from_str("-1234567890_1234567890_1234567890_1234567890").unwrap(), + ), + ), + ]; + + for (input, expected) in tests { + expect_int(input, expected); + } + Ok(()) + } #[test] fn read_timestamps() -> IonResult<()> { @@ -1159,10 +1286,21 @@ mod tests { #[test] fn read_decimals() -> IonResult<()> { fn expect_decimal(data: &str, expected: Decimal) { - let data = format!("{data} "); // Append a space let buffer = TextBufferView::new(data.as_bytes()); - let (_remaining, matched) = buffer.match_decimal().unwrap(); - let actual = matched.read(buffer).unwrap(); + let result = buffer.match_decimal(); + assert!( + result.is_ok(), + "Unexpected match error for input: '{data}': {:?}", + result + ); + let (_remaining, matched) = buffer.match_decimal().expect("match decimal"); + let result = matched.read(buffer); + assert!( + result.is_ok(), + "Unexpected read error for input '{data}': {:?}", + result + ); + let actual = result.unwrap(); assert_eq!( actual, expected, "Actual didn't match expected for input '{}'.\n{:?}\n!=\n{:?}", @@ -1179,8 +1317,6 @@ mod tests { ("-5.d0", Decimal::new(-5, 0)), ("5.0", Decimal::new(50, -1)), ("-5.0", Decimal::new(-50, -1)), - ("5.0d", Decimal::new(50, -1)), - ("-5.0d", Decimal::new(-50, -1)), ("500d0", Decimal::new(5, 2)), ("-500d0", Decimal::new(-5, 2)), ("0.005", Decimal::new(5, -3)), @@ -1259,14 +1395,11 @@ mod tests { (r"'''he''' '''llo'''", "hello"), (r#""😎🙂🙃""#, "😎🙂🙃"), (r"'''😎🙂''' '''🙃'''", "😎🙂🙃"), - // The below bytes are the UTF-8 encoding of Unicode code points: U+2764 U+FE0F - (r#""\xe2\x9d\xa4\xef\xb8\x8f""#, "❤️"), - (r"'''\xe2\x9d\xa4\xef\xb8\x8f'''", "❤️"), (r"'''\u2764\uFE0F'''", "❤️"), (r"'''\U00002764\U0000FE0F'''", "❤️"), - // In short strings, unescaped newlines are not normalized. - ("\"foo\rbar\r\nbaz\"", "foo\rbar\r\nbaz"), - // In long-form strings, unescaped newlines converted to `\n`. + // In short strings, carriage returns are not normalized. + ("\"foo\rbar\rbaz\"", "foo\rbar\rbaz"), + // In long-form strings, all unescaped newlines are converted to `\n`.
("'''foo\rbar\r\nbaz'''", "foo\nbar\nbaz"), ]; @@ -1297,7 +1430,13 @@ mod tests { } fn expect_clob(data: &str, expected: &str) { - let actual = read_clob(data).unwrap(); + let result = read_clob(data); + assert!( + result.is_ok(), + "Unexpected read failure for input '{data}': {:?}", + result + ); + let actual = result.unwrap(); assert_eq!( actual, expected.as_ref(), @@ -1319,7 +1458,8 @@ mod tests { (r#"{{"hello"}}"#, "hello"), (r#"{{"\x4D"}}"#, "M"), (r#"{{"\x4d \x4d \x4d"}}"#, "M M M"), - // The below bytes are the UTF-8 encoding of Unicode code points: U+2764 U+FE0F + (r"{{'''\x4d''' '''\x4d''' '''\x4d'''}}", "MMM"), + // The below byte literals are the UTF-8 encoding of Unicode code points: U+2764 U+FE0F (r#"{{"\xe2\x9d\xa4\xef\xb8\x8f"}}"#, "❤️"), (r#"{{'''hel''' '''lo'''}}"#, "hello"), ( @@ -1333,8 +1473,8 @@ mod tests { ), // In a long-form clob, unescaped `\r` and `\r\n` are normalized into unescaped `\n` ("{{'''foo\rbar\r\nbaz'''}}", "foo\nbar\nbaz"), - // In a short-form clob, newlines are not normalized. - ("{{\"foo\rbar\r\nbaz\"}}", "foo\rbar\r\nbaz"), + // In a short-form clob, carriage returns are not normalized. + ("{{\"foo\rbar\rbaz\"}}", "foo\rbar\rbaz"), ]; for (input, expected) in tests { diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 5e6254e6..b8d497df 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -55,6 +55,14 @@ impl<'data> LazyRawTextReader<'data> { let (remaining, matched) = buffer_after_whitespace .match_top_level_item() .with_context("reading a top-level value", buffer_after_whitespace)?; + + if let RawStreamItem::VersionMarker(major, minor) = matched { + if (major, minor) != (1, 0) { + return IonResult::decoding_error(format!( + "Ion version {major}.{minor} is not supported" + )); + } + } // Since we successfully matched the next value, we'll update the buffer // so a future call to `next()` will resume parsing the remaining input. self.buffer = remaining; @@ -191,8 +199,9 @@ mod tests { // Second item 2 /*comment before comma*/, // Third item - 3 + 3, // Final trailing comma ] + // S-Expression ( diff --git a/src/lazy/text/raw/sequence.rs b/src/lazy/text/raw/sequence.rs index aee5d58e..3a0269ef 100644 --- a/src/lazy/text/raw/sequence.rs +++ b/src/lazy/text/raw/sequence.rs @@ -26,8 +26,9 @@ impl<'data> LazyRawTextList<'data> { } pub fn iter(&self) -> RawTextListIterator<'data> { + let open_bracket_index = self.value.encoded_value.data_offset() - self.value.input.offset(); // Make an iterator over the input bytes that follow the initial `[` - RawTextListIterator::new(self.value.input.slice_to_end(1)) + RawTextListIterator::new(self.value.input.slice_to_end(open_bracket_index + 1)) } } @@ -114,9 +115,17 @@ impl<'data> RawTextListIterator<'data> { // ...or there aren't values, so it's just the input after the opening delimiter. 
self.input }; - let (input_after_ws, _ws) = input_after_last - .match_optional_comments_and_whitespace() - .with_context("seeking the end of a list", input_after_last)?; + let (mut input_after_ws, _ws) = + input_after_last + .match_optional_comments_and_whitespace() + .with_context("seeking the end of a list", input_after_last)?; + // Skip an optional comma and more whitespace + if input_after_ws.bytes().first() == Some(&b',') { + (input_after_ws, _) = input_after_ws + .slice_to_end(1) + .match_optional_comments_and_whitespace() + .with_context("skipping a list's trailing comma", input_after_ws)?; + } let (input_after_end, _end_delimiter) = satisfy(|c| c == ']')(input_after_ws) .with_context("seeking the closing delimiter of a list", input_after_ws)?; let end = input_after_end.offset(); @@ -136,7 +145,10 @@ impl<'data> Iterator for RawTextListIterator<'data> { self.input = remaining; Some(Ok(value)) } - Ok((_remaining, None)) => None, + Ok((_remaining, None)) => { + // Don't update `remaining` so subsequent calls will continue to return None + None + } Err(e) => { self.has_returned_error = true; e.with_context("reading the next list value", self.input) @@ -159,8 +171,9 @@ impl<'data> LazyRawTextSExp<'data> { } pub fn iter(&self) -> RawTextSExpIterator<'data> { + let open_paren_index = self.value.encoded_value.data_offset() - self.value.input.offset(); // Make an iterator over the input bytes that follow the initial `(` - RawTextSExpIterator::new(self.value.input.slice_to_end(1)) + RawTextSExpIterator::new(self.value.input.slice_to_end(open_paren_index + 1)) } } diff --git a/src/lazy/text/raw/struct.rs b/src/lazy/text/raw/struct.rs index 44ddf14b..f73030ef 100644 --- a/src/lazy/text/raw/struct.rs +++ b/src/lazy/text/raw/struct.rs @@ -39,9 +39,17 @@ impl<'data> RawTextStructIterator<'data> { // ...or there aren't fields, so it's just the input after the opening delimiter. 
self.input }; - let (input_after_ws, _ws) = input_after_last - .match_optional_comments_and_whitespace() - .with_context("seeking the end of a struct", input_after_last)?; + let (mut input_after_ws, _ws) = + input_after_last + .match_optional_comments_and_whitespace() + .with_context("seeking the end of a struct", input_after_last)?; + // Skip an optional comma and more whitespace + if input_after_ws.bytes().first() == Some(&b',') { + (input_after_ws, _) = input_after_ws + .slice_to_end(1) + .match_optional_comments_and_whitespace() + .with_context("skipping a struct's trailing comma", input_after_ws)?; + } let (input_after_end, _end_delimiter) = satisfy(|c| c == b'}' as char)(input_after_ws) .with_context("seeking the closing delimiter of a struct", input_after_ws)?; let end = input_after_end.offset(); @@ -173,8 +181,9 @@ impl<'data> LazyRawStruct<'data, TextEncoding> for LazyRawTextStruct<'data> { } fn iter(&self) -> Self::Iterator { + let open_brace_index = self.value.encoded_value.data_offset() - self.value.input.offset(); // Slice the input to skip the opening `{` - RawTextStructIterator::new(self.value.input.slice_to_end(1)) + RawTextStructIterator::new(self.value.input.slice_to_end(open_brace_index + 1)) } } diff --git a/src/lazy/value.rs b/src/lazy/value.rs index 26ed7ba5..7898bd1b 100644 --- a/src/lazy/value.rs +++ b/src/lazy/value.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + use crate::lazy::decoder::{LazyDecoder, LazyRawValue}; use crate::lazy::encoding::BinaryEncoding; use crate::lazy::r#struct::LazyStruct; @@ -9,7 +11,6 @@ use crate::{ Annotations, Element, IntoAnnotatedElement, IonError, IonResult, IonType, RawSymbolTokenRef, SymbolRef, SymbolTable, Value, }; -use std::borrow::Cow; /// A value in a binary Ion stream whose header has been parsed but whose body (i.e. its data) has /// not. A `LazyValue` is immutable; its data can be read any number of times. @@ -359,12 +360,13 @@ impl<'top, 'data, D: LazyDecoder<'data>> TryFrom IonResult<()> { diff --git a/src/lazy/value_ref.rs b/src/lazy/value_ref.rs index 2c47907d..5b6c3eb0 100644 --- a/src/lazy/value_ref.rs +++ b/src/lazy/value_ref.rs @@ -5,7 +5,7 @@ use crate::lazy::r#struct::LazyStruct; use crate::lazy::sequence::{LazyList, LazySExp}; use crate::lazy::str_ref::StrRef; use crate::result::IonFailure; -use crate::{Decimal, Int, IonError, IonResult, IonType, SymbolRef, Timestamp}; +use crate::{Decimal, Element, Int, IonError, IonResult, IonType, SymbolRef, Timestamp}; use std::fmt::{Debug, Formatter}; /// A [ValueRef] represents a value that has been read from the input stream.
Scalar variants contain @@ -97,6 +97,15 @@ impl<'top, 'data, D: LazyDecoder<'data>> TryFrom> for V } } +impl<'top, 'data, D: LazyDecoder<'data>> TryFrom<ValueRef<'top, 'data, D>> for Element { + type Error = IonError; + + fn try_from(value_ref: ValueRef<'top, 'data, D>) -> Result<Self, Self::Error> { + let value: Value = value_ref.try_into()?; + Ok(value.into()) + } +} + impl<'top, 'data, D: LazyDecoder<'data>> ValueRef<'top, 'data, D> { pub fn expect_null(self) -> IonResult<IonType> { if let ValueRef::Null(ion_type) = self { @@ -166,7 +175,7 @@ impl<'top, 'data, D: LazyDecoder<'data>> ValueRef<'top, 'data, D> { if let ValueRef::Symbol(s) = self { Ok(s) } else { - IonResult::decoding_error("expected a symbol") + IonResult::decoding_error(format!("expected a symbol, found {:?}", self)) } } diff --git a/src/raw_symbol_token_ref.rs b/src/raw_symbol_token_ref.rs index 961bab97..67b54ffe 100644 --- a/src/raw_symbol_token_ref.rs +++ b/src/raw_symbol_token_ref.rs @@ -9,6 +9,18 @@ pub enum RawSymbolTokenRef<'a> { Text(Cow<'a, str>), } +impl<'a> RawSymbolTokenRef<'a> { + /// Returns `true` if this token matches either the specified symbol ID or text value. + /// This is useful for comparing tokens that represent system symbol values of an unknown + /// encoding. + pub fn matches_sid_or_text(&self, symbol_id: SymbolId, symbol_text: &str) -> bool { + match self { + RawSymbolTokenRef::SymbolId(sid) => symbol_id == *sid, + RawSymbolTokenRef::Text(text) => symbol_text == text, + } + } +} + /// Implemented by types that can be viewed as a [RawSymbolTokenRef] without allocations. pub trait AsRawSymbolTokenRef { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef; diff --git a/src/text/parsers/clob.rs b/src/text/parsers/clob.rs index 860d8d5b..3d322913 100644 --- a/src/text/parsers/clob.rs +++ b/src/text/parsers/clob.rs @@ -208,7 +208,7 @@ mod clob_parsing_tests { // parse tests for long clob parse_equals("{{'''Hello''' '''world'''}}", "Helloworld"); parse_equals("{{'''Hello world'''}}", "Hello world"); - parse_equals("{{'''\\xe2\\x9d\\xa4\\xef\\xb8\\x8f\'''}}", "❤️"); + parse_equals(r"{{'''\xe2\x9d\xa4\xef\xb8\x8f'''}}", "❤️"); // Clobs represent text of some encoding, but it may or may not be a flavor of Unicode. // As such, clob syntax does not support Unicode escape sequences like `\u` or `\U`. diff --git a/tests/element_test_vectors.rs b/tests/element_test_vectors.rs index ca4c3de8..0be6332e 100644 --- a/tests/element_test_vectors.rs +++ b/tests/element_test_vectors.rs @@ -693,3 +693,75 @@ mod token_native_element_tests { non_equivs(TokenNativeElementApi, file_name) } } + +#[cfg(test)] +mod lazy_element_tests { + use super::*; + use ion_rs::lazy::reader::LazyReader; + + struct LazyReaderElementApi; + + impl ElementApi for LazyReaderElementApi { + type ElementReader<'a> = LazyReader<'a>; + + fn make_reader(data: &[u8]) -> IonResult> { + Ok(LazyReader::new(data)) + } + + fn global_skip_list() -> SkipList { + ELEMENT_GLOBAL_SKIP_LIST + } + + fn read_one_equivs_skip_list() -> SkipList { + &[] + } + + fn round_trip_skip_list() -> SkipList { + ELEMENT_ROUND_TRIP_SKIP_LIST + } + + fn equivs_skip_list() -> SkipList { + ELEMENT_EQUIVS_SKIP_LIST + } + + fn non_equivs_skip_list() -> SkipList { + &[] + } + } + + good_round_trip!
{ + use LazyReaderElementApi; + fn binary_compact(Format::Binary, Format::Text(TextKind::Compact)); + fn binary_lines(Format::Binary, Format::Text(TextKind::Lines)); + fn binary_pretty(Format::Binary, Format::Text(TextKind::Pretty)); + fn compact_binary(Format::Text(TextKind::Compact), Format::Binary); + fn compact_lines(Format::Text(TextKind::Compact), Format::Text(TextKind::Lines)); + fn compact_pretty(Format::Text(TextKind::Compact), Format::Text(TextKind::Pretty)); + fn lines_binary(Format::Text(TextKind::Lines), Format::Binary); + fn lines_compact(Format::Text(TextKind::Lines), Format::Text(TextKind::Compact)); + fn lines_pretty(Format::Text(TextKind::Lines), Format::Text(TextKind::Pretty)); + fn pretty_binary(Format::Text(TextKind::Pretty), Format::Binary); + fn pretty_compact(Format::Text(TextKind::Pretty), Format::Text(TextKind::Compact)); + fn pretty_lines(Format::Text(TextKind::Pretty), Format::Text(TextKind::Lines)); + } + + #[test_resources("ion-tests/iontestdata/bad/**/*.ion")] + #[test_resources("ion-tests/iontestdata/bad/**/*.10n")] + fn lazy_bad(file_name: &str) { + bad(LazyReaderElementApi, file_name) + } + + #[test_resources("ion-tests/iontestdata/good/equivs/**/*.ion")] + #[test_resources("ion-tests/iontestdata/good/equivs/**/*.10n")] + fn lazy_equivs(file_name: &str) { + equivs(LazyReaderElementApi, file_name) + } + + #[test_resources("ion-tests/iontestdata/good/non-equivs/**/*.ion")] + // no binary files exist and the macro doesn't like empty globs... + // see frehberg/test-generator#12 + //#[test_resources("ion-tests/iontestdata/good/non-equivs/**/*.10n")] + fn lazy_non_equivs(file_name: &str) { + non_equivs(LazyReaderElementApi, file_name) + } +}
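(Aside, not part of the patch: a usage sketch for the `RawSymbolTokenRef::matches_sid_or_text` helper added in src/raw_symbol_token_ref.rs above. It assumes the type is re-exported at the crate root as `ion_rs::RawSymbolTokenRef`; symbol ID 3 is the system symbol `$ion_symbol_table`.)

    use std::borrow::Cow;

    // Assumed re-export path; the type itself is defined in src/raw_symbol_token_ref.rs.
    use ion_rs::RawSymbolTokenRef;

    fn main() {
        // A binary reader surfaces the system field name as symbol ID 3, while a text
        // reader may surface its text. The helper accepts either representation.
        let from_binary = RawSymbolTokenRef::SymbolId(3);
        let from_text = RawSymbolTokenRef::Text(Cow::Borrowed("$ion_symbol_table"));

        assert!(from_binary.matches_sid_or_text(3, "$ion_symbol_table"));
        assert!(from_text.matches_sid_or_text(3, "$ion_symbol_table"));
        assert!(!from_text.matches_sid_or_text(4, "name"));
    }

As the method's doc comment notes, this lets callers check for system values such as symbol tables without knowing whether the token arrived as a symbol ID or as text.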