diff --git a/Cargo.toml b/Cargo.toml index 5f148b85..54771ff8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,7 +66,7 @@ num-bigint = "0.4.3" num-integer = "0.1.44" num-traits = "0.2" arrayvec = "0.7" -smallvec = "1.9.0" +smallvec = {version ="1.9.0", features = ["const_generics"]} digest = { version = "0.9", optional = true } sha2 = { version = "0.9", optional = true } diff --git a/src/lazy/binary/encoding.rs b/src/lazy/binary/encoding.rs deleted file mode 100644 index e26d0b51..00000000 --- a/src/lazy/binary/encoding.rs +++ /dev/null @@ -1,19 +0,0 @@ -use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; -use crate::lazy::binary::raw::lazy_raw_sequence::LazyRawBinarySequence; -use crate::lazy::binary::raw::r#struct::LazyRawBinaryStruct; -use crate::lazy::binary::raw::reader::LazyRawBinaryReader; -use crate::lazy::binary::raw::value::LazyRawBinaryValue; -use crate::lazy::decoder::LazyDecoder; - -// This type derives trait implementations in order to allow types that contain it to also derive -// trait implementations. -#[derive(Clone, Debug)] -pub struct BinaryEncoding; - -impl<'data> LazyDecoder<'data> for BinaryEncoding { - type Reader = LazyRawBinaryReader<'data>; - type Value = LazyRawBinaryValue<'data>; - type Sequence = LazyRawBinarySequence<'data>; - type Struct = LazyRawBinaryStruct<'data>; - type AnnotationsIterator = RawBinaryAnnotationsIterator<'data>; -} diff --git a/src/lazy/binary/mod.rs b/src/lazy/binary/mod.rs index cfc54e78..93017274 100644 --- a/src/lazy/binary/mod.rs +++ b/src/lazy/binary/mod.rs @@ -2,6 +2,5 @@ mod encoded_value; pub mod immutable_buffer; pub mod raw; -pub(crate) mod encoding; #[cfg(test)] pub(crate) mod test_utilities; diff --git a/src/lazy/binary/raw/mod.rs b/src/lazy/binary/raw/mod.rs index 0861993f..3df82f4d 100644 --- a/src/lazy/binary/raw/mod.rs +++ b/src/lazy/binary/raw/mod.rs @@ -1,5 +1,5 @@ pub mod annotations_iterator; -pub mod lazy_raw_sequence; pub mod reader; +pub mod sequence; pub mod r#struct; pub mod value; diff --git a/src/lazy/binary/raw/reader.rs b/src/lazy/binary/raw/reader.rs index da3a983c..77297e54 100644 --- a/src/lazy/binary/raw/reader.rs +++ b/src/lazy/binary/raw/reader.rs @@ -1,7 +1,7 @@ -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::binary::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::value::LazyRawBinaryValue; use crate::lazy::decoder::LazyRawReader; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::result::IonFailure; use crate::IonResult; diff --git a/src/lazy/binary/raw/lazy_raw_sequence.rs b/src/lazy/binary/raw/sequence.rs similarity index 98% rename from src/lazy/binary/raw/lazy_raw_sequence.rs rename to src/lazy/binary/raw/sequence.rs index 16dbb021..66d26fef 100644 --- a/src/lazy/binary/raw/lazy_raw_sequence.rs +++ b/src/lazy/binary/raw/sequence.rs @@ -1,10 +1,10 @@ -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::binary::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; use crate::lazy::binary::raw::reader::DataSource; use crate::lazy::binary::raw::value::LazyRawBinaryValue; use crate::lazy::decoder::private::LazyContainerPrivate; use crate::lazy::decoder::LazyRawSequence; +use crate::lazy::encoding::BinaryEncoding; use crate::{IonResult, IonType}; use std::fmt; use std::fmt::{Debug, Formatter}; diff --git a/src/lazy/binary/raw/struct.rs b/src/lazy/binary/raw/struct.rs index 34ca489a..3f82ed16 100644 --- 
a/src/lazy/binary/raw/struct.rs +++ b/src/lazy/binary/raw/struct.rs @@ -1,10 +1,10 @@ -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::binary::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; use crate::lazy::binary::raw::reader::DataSource; use crate::lazy::binary::raw::value::LazyRawBinaryValue; use crate::lazy::decoder::private::{LazyContainerPrivate, LazyRawFieldPrivate}; use crate::lazy::decoder::{LazyRawField, LazyRawStruct}; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::raw_value_ref::RawValueRef; use crate::raw_symbol_token_ref::AsRawSymbolTokenRef; use crate::{IonResult, RawSymbolTokenRef}; diff --git a/src/lazy/binary/raw/value.rs b/src/lazy/binary/raw/value.rs index 0ffcb028..9ed2340e 100644 --- a/src/lazy/binary/raw/value.rs +++ b/src/lazy/binary/raw/value.rs @@ -1,13 +1,13 @@ use crate::binary::int::DecodedInt; use crate::binary::uint::DecodedUInt; use crate::lazy::binary::encoded_value::EncodedValue; -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::binary::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; -use crate::lazy::binary::raw::lazy_raw_sequence::LazyRawBinarySequence; use crate::lazy::binary::raw::r#struct::LazyRawBinaryStruct; +use crate::lazy::binary::raw::sequence::LazyRawBinarySequence; use crate::lazy::decoder::private::LazyRawValuePrivate; use crate::lazy::decoder::LazyRawValue; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::raw_value_ref::RawValueRef; use crate::result::IonFailure; use crate::types::SymbolId; @@ -35,7 +35,7 @@ impl<'a> Debug for LazyRawBinaryValue<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, - "LazyRawValue {{\n val={:?},\n buf={:?}\n}}\n", + "LazyRawBinaryValue {{\n val={:?},\n buf={:?}\n}}\n", self.encoded_value, self.input ) } @@ -54,6 +54,10 @@ impl<'data> LazyRawValue<'data, BinaryEncoding> for LazyRawBinaryValue<'data> { self.ion_type() } + fn is_null(&self) -> bool { + self.is_null() + } + fn annotations(&self) -> RawBinaryAnnotationsIterator<'data> { self.annotations() } @@ -70,6 +74,10 @@ impl<'data> LazyRawBinaryValue<'data> { self.encoded_value.ion_type() } + pub fn is_null(&self) -> bool { + self.encoded_value.header().is_null() + } + /// Returns `true` if this value has a non-empty annotations sequence; otherwise, returns `false`. fn has_annotations(&self) -> bool { self.encoded_value.has_annotations() @@ -118,7 +126,7 @@ impl<'data> LazyRawBinaryValue<'data> { /// [`LazyRawBinarySequence`] or [`LazyStruct`](crate::lazy::struct::LazyStruct) /// that can be traversed to access the container's contents. 
pub fn read(&self) -> ValueParseResult<'data, BinaryEncoding> { - if self.encoded_value.header().is_null() { + if self.is_null() { let raw_value_ref = RawValueRef::Null(self.ion_type()); return Ok(raw_value_ref); } diff --git a/src/lazy/decoder.rs b/src/lazy/decoder.rs index 5f784c42..e53ad2d2 100644 --- a/src/lazy/decoder.rs +++ b/src/lazy/decoder.rs @@ -62,6 +62,7 @@ pub trait LazyRawValue<'data, D: LazyDecoder<'data>>: private::LazyRawValuePrivate<'data> + Clone + Debug { fn ion_type(&self) -> IonType; + fn is_null(&self) -> bool; fn annotations(&self) -> D::AnnotationsIterator; fn read(&self) -> IonResult>; } diff --git a/src/lazy/encoding.rs b/src/lazy/encoding.rs new file mode 100644 index 00000000..784879ad --- /dev/null +++ b/src/lazy/encoding.rs @@ -0,0 +1,133 @@ +use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; +use crate::lazy::binary::raw::r#struct::LazyRawBinaryStruct; +use crate::lazy::binary::raw::reader::LazyRawBinaryReader; +use crate::lazy::binary::raw::sequence::LazyRawBinarySequence; +use crate::lazy::binary::raw::value::LazyRawBinaryValue; +use crate::lazy::decoder::private::{LazyContainerPrivate, LazyRawFieldPrivate}; +use crate::lazy::decoder::{LazyDecoder, LazyRawField, LazyRawSequence, LazyRawStruct}; +use crate::lazy::raw_value_ref::RawValueRef; +use crate::lazy::text::raw::reader::LazyRawTextReader; +use crate::lazy::text::value::LazyRawTextValue; +use crate::{IonResult, IonType, RawSymbolTokenRef}; +use std::marker::PhantomData; + +// These types derive trait implementations in order to allow types that containing them +// to also derive trait implementations. + +/// The Ion 1.0 binary encoding. +#[derive(Clone, Debug)] +pub struct BinaryEncoding; + +/// The Ion 1.0 text encoding. +#[derive(Clone, Debug)] +pub struct TextEncoding; + +impl<'data> LazyDecoder<'data> for BinaryEncoding { + type Reader = LazyRawBinaryReader<'data>; + type Value = LazyRawBinaryValue<'data>; + type Sequence = LazyRawBinarySequence<'data>; + type Struct = LazyRawBinaryStruct<'data>; + type AnnotationsIterator = RawBinaryAnnotationsIterator<'data>; +} + +// === Placeholders === +// The types below will need to be properly defined in order for the lazy text reader to be complete. +// The exist to satisfy various trait definitions. 
+#[derive(Debug, Clone)] +pub struct ToDoTextSequence; + +impl<'data> LazyContainerPrivate<'data, TextEncoding> for ToDoTextSequence { + fn from_value(_value: LazyRawTextValue<'data>) -> Self { + todo!() + } +} + +impl<'data> LazyRawSequence<'data, TextEncoding> for ToDoTextSequence { + type Iterator = Box>>>; + + fn annotations(&self) -> ToDoTextAnnotationsIterator<'data> { + todo!() + } + + fn ion_type(&self) -> IonType { + todo!() + } + + fn iter(&self) -> Self::Iterator { + todo!() + } + + fn as_value(&self) -> &>::Value { + todo!() + } +} + +#[derive(Debug, Clone)] +pub struct ToDoTextStruct; + +#[derive(Debug, Clone)] +pub struct ToDoTextField; + +impl<'data> LazyRawFieldPrivate<'data, TextEncoding> for ToDoTextField { + fn into_value(self) -> LazyRawTextValue<'data> { + todo!() + } +} + +impl<'data> LazyRawField<'data, TextEncoding> for ToDoTextField { + fn name(&self) -> RawSymbolTokenRef<'data> { + todo!() + } + + fn value(&self) -> &LazyRawTextValue<'data> { + todo!() + } +} + +impl<'data> LazyContainerPrivate<'data, TextEncoding> for ToDoTextStruct { + fn from_value(_value: ::Value) -> Self { + todo!() + } +} + +impl<'data> LazyRawStruct<'data, TextEncoding> for ToDoTextStruct { + type Field = ToDoTextField; + type Iterator = Box>>; + + fn annotations(&self) -> ToDoTextAnnotationsIterator<'data> { + todo!() + } + + fn find(&self, _name: &str) -> IonResult>> { + todo!() + } + + fn get(&self, _name: &str) -> IonResult>> { + todo!() + } + + fn iter(&self) -> Self::Iterator { + todo!() + } +} + +#[derive(Debug, Clone)] +pub struct ToDoTextAnnotationsIterator<'data> { + spooky: &'data PhantomData<()>, +} + +impl<'data> Iterator for ToDoTextAnnotationsIterator<'data> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + todo!() + } +} + +impl<'data> LazyDecoder<'data> for TextEncoding { + type Reader = LazyRawTextReader<'data>; + type Value = LazyRawTextValue<'data>; + type Sequence = ToDoTextSequence; + type Struct = ToDoTextStruct; + type AnnotationsIterator = ToDoTextAnnotationsIterator<'data>; +} diff --git a/src/lazy/mod.rs b/src/lazy/mod.rs index c0c3c413..3f42baa8 100644 --- a/src/lazy/mod.rs +++ b/src/lazy/mod.rs @@ -3,6 +3,7 @@ pub mod binary; pub mod decoder; +pub(crate) mod encoding; pub mod raw_stream_item; pub mod raw_value_ref; pub mod reader; @@ -10,5 +11,6 @@ pub mod sequence; pub mod r#struct; pub mod system_reader; pub mod system_stream_item; +pub mod text; pub mod value; pub mod value_ref; diff --git a/src/lazy/raw_value_ref.rs b/src/lazy/raw_value_ref.rs index 11b09bd6..a0da98eb 100644 --- a/src/lazy/raw_value_ref.rs +++ b/src/lazy/raw_value_ref.rs @@ -69,6 +69,14 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> { } } + pub fn expect_i64(self) -> IonResult { + if let RawValueRef::Int(i) = self { + i.expect_i64() + } else { + IonResult::decoding_error("expected an i64 (int)") + } + } + pub fn expect_float(self) -> IonResult { if let RawValueRef::Float(f) = self { Ok(f) diff --git a/src/lazy/reader.rs b/src/lazy/reader.rs index bd20656c..2f3cfbb4 100644 --- a/src/lazy/reader.rs +++ b/src/lazy/reader.rs @@ -1,8 +1,8 @@ use crate::binary::constants::v1_0::IVM; use crate::element::reader::ElementReader; use crate::element::Element; -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::decoder::LazyDecoder; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::system_reader::LazySystemReader; use crate::lazy::value::LazyValue; use crate::result::IonFailure; diff --git a/src/lazy/sequence.rs b/src/lazy/sequence.rs index 
7f7f810f..b3c830f8 100644 --- a/src/lazy/sequence.rs +++ b/src/lazy/sequence.rs @@ -1,5 +1,5 @@ -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::decoder::{LazyDecoder, LazyRawSequence, LazyRawValue}; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::value::{AnnotationsIterator, LazyValue}; use crate::{Annotations, Element, IntoAnnotatedElement, Sequence, Value}; use crate::{IonError, IonResult, IonType, SymbolTable}; diff --git a/src/lazy/struct.rs b/src/lazy/struct.rs index f7347efd..2251b949 100644 --- a/src/lazy/struct.rs +++ b/src/lazy/struct.rs @@ -1,7 +1,7 @@ use crate::element::builders::StructBuilder; -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::decoder::private::{LazyRawFieldPrivate, LazyRawValuePrivate}; use crate::lazy::decoder::{LazyDecoder, LazyRawStruct}; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::value::{AnnotationsIterator, LazyValue}; use crate::lazy::value_ref::ValueRef; use crate::result::IonFailure; diff --git a/src/lazy/system_reader.rs b/src/lazy/system_reader.rs index 4936fa8a..bdf76de2 100644 --- a/src/lazy/system_reader.rs +++ b/src/lazy/system_reader.rs @@ -1,4 +1,4 @@ -use crate::lazy::binary::encoding::BinaryEncoding; +use crate::lazy::encoding::BinaryEncoding; use crate::result::IonFailure; use crate::{IonResult, IonType, RawSymbolTokenRef, SymbolTable}; diff --git a/src/lazy/text/as_utf8.rs b/src/lazy/text/as_utf8.rs new file mode 100644 index 00000000..9be4784c --- /dev/null +++ b/src/lazy/text/as_utf8.rs @@ -0,0 +1,33 @@ +use crate::lazy::text::buffer::TextBufferView; +use crate::position::Position; +use crate::result::DecodingError; +use crate::{IonError, IonResult}; +use smallvec::SmallVec; + +/// Attempts to validate a byte sequence as UTF-8 text. If the data is not valid UTF-8, returns +/// an [`IonError`]. +/// +/// The provided `position` is added to the `IonError` that is constructed if the data is not valid. 
+pub(crate) trait AsUtf8 { + fn as_utf8(&self, position: impl Into) -> IonResult<&str>; +} + +impl AsUtf8 for SmallVec<[u8; N]> { + fn as_utf8(&self, position: impl Into) -> IonResult<&str> { + bytes_as_utf8(self.as_ref(), position) + } +} + +impl<'data> AsUtf8 for TextBufferView<'data> { + fn as_utf8(&self, position: impl Into) -> IonResult<&str> { + bytes_as_utf8(self.bytes(), position) + } +} + +fn bytes_as_utf8(bytes: &[u8], position: impl Into) -> IonResult<&str> { + std::str::from_utf8(bytes).map_err(|_| { + let decoding_error = + DecodingError::new("encountered invalid UTF-8").with_position(position); + IonError::Decoding(decoding_error) + }) +} diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs new file mode 100644 index 00000000..c12ec11c --- /dev/null +++ b/src/lazy/text/buffer.rs @@ -0,0 +1,732 @@ +use crate::lazy::encoding::TextEncoding; +use crate::lazy::raw_stream_item::RawStreamItem; +use crate::lazy::text::encoded_value::EncodedTextValue; +use crate::lazy::text::matched::{MatchedInt, MatchedValue}; +use crate::lazy::text::parse_result::IonParseError; +use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; +use crate::lazy::text::value::LazyRawTextValue; +use crate::{IonResult, IonType}; +use nom::branch::alt; +use nom::bytes::streaming::{is_a, tag, take_while1}; +use nom::character::streaming::{char, digit1, one_of}; +use nom::combinator::{map, opt, peek, recognize, success, value}; +use nom::error::{ErrorKind, ParseError}; +use nom::multi::many0_count; +use nom::sequence::{delimited, pair, preceded, separated_pair, terminated}; +use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser}; +use std::fmt::{Debug, Formatter}; +use std::iter::{Copied, Enumerate}; +use std::ops::{RangeFrom, RangeTo}; +use std::slice::Iter; + +impl<'a> Debug for TextBufferView<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TextBufferView {{")?; + // Try to read the next several bytes from the buffer as UTF-8... + let text_result = std::str::from_utf8(self.data); + // ...if it works, print the first 32 unicode scalars... + if let Ok(text) = text_result { + write!(f, "\"{}...\"", text.chars().take(32).collect::())?; + } else { + // ...if it doesn't, print the first 32 bytes in hex. + write!(f, "Invalid UTF-8")?; + for byte in self.bytes().iter().take(32) { + write!(f, "{:x?} ", *byte)?; + } + if self.bytes().len() > 32 { + write!(f, "...{} more bytes", self.bytes().len() - 32)?; + } + } + write!(f, "}}") + } +} + +/// The Ion specification's enumeration of whitespace characters. +const WHITESPACE_CHARACTERS: &[char] = &[ + ' ', // Space + '\t', // Tab + '\r', // Carriage return + '\n', // Newline + '\x09', // Horizontal tab + '\x0B', // Vertical tab + '\x0C', // Form feed +]; + +/// Same as [WHITESPACE_CHARACTERS], but formatted as a string for use in some `nom` APIs +const WHITESPACE_CHARACTERS_AS_STR: &str = " \t\r\n\x09\x0B\x0C"; + +/// A slice of unsigned bytes that can be cheaply copied and which defines methods for parsing +/// the various encoding elements of a text Ion stream. +/// +/// Parsing methods have names that begin with `match_` and each return a `(match, remaining_input)` +/// pair. The `match` may be either the slice of the input that was matched (represented as another +/// `TextBufferView`) or a `MatchedValue` that retains information discovered during parsing that +/// will be useful if the match is later fully materialized into a value. 
+#[derive(PartialEq, Clone, Copy)] +pub(crate) struct TextBufferView<'a> { + // `data` is a slice of remaining data in the larger input stream. + // `offset` is the absolute position in the overall input stream where that slice begins. + // + // input: 00 01 02 03 04 05 06 07 08 09 + // └────┬────┘ + // data: &[u8] + // offset: 6 + data: &'a [u8], + offset: usize, +} + +pub(crate) type ParseResult<'a, T> = IonResult<(T, TextBufferView<'a>)>; + +impl<'data> TextBufferView<'data> { + /// Constructs a new `TextBufferView` that wraps `data`, setting the view's `offset` to zero. + #[inline] + pub fn new(data: &[u8]) -> TextBufferView { + Self::new_with_offset(data, 0) + } + + /// Constructs a new `TextBufferView` that wraps `data`, setting the view's `offset` to the + /// specified value. This is useful when `data` is a slice from the middle of a larger stream. + /// Note that `offset` is the index of the larger stream at which `data` begins and not an + /// offset _into_ `data`. + pub fn new_with_offset(data: &[u8], offset: usize) -> TextBufferView { + TextBufferView { data, offset } + } + + /// Returns a subslice of the [`TextBufferView`] that starts at `offset` and continues for + /// `length` bytes. + /// + /// Note that `offset` is relative to the beginning of the buffer, not the beginning of the + /// larger stream of which the buffer is a piece. + pub fn slice(&self, offset: usize, length: usize) -> TextBufferView<'data> { + TextBufferView { + data: &self.data[offset..offset + length], + offset: self.offset + offset, + } + } + + /// Returns a subslice of the [`TextBufferView`] that starts at `offset` and continues + /// to the end. + /// + /// Note that `offset` is relative to the beginning of the buffer, not the beginning of the + /// larger stream of which the buffer is a piece. + pub fn slice_to_end(&self, offset: usize) -> TextBufferView<'data> { + TextBufferView { + data: &self.data[offset..], + offset: self.offset + offset, + } + } + + /// Returns a slice containing all of the buffer's bytes. + pub fn bytes(&self) -> &[u8] { + self.data + } + + /// Returns the number of bytes between the start of the original input byte array and the + /// subslice of that byte array that this `TextBufferView` represents. + pub fn offset(&self) -> usize { + self.offset + } + + /// Returns the number of bytes in the buffer. + pub fn len(&self) -> usize { + self.data.len() + } + + /// Returns `true` if there are no bytes in the buffer. Otherwise, returns `false`. + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + pub fn match_whitespace(self) -> IonMatchResult<'data> { + is_a(WHITESPACE_CHARACTERS_AS_STR)(self) + } + + /// Always succeeds and consumes none of the input. Returns an empty slice of the buffer. + // This method is useful for parsers that need to match an optional construct but don't want + // to return an Option<_>. For an example, see its use in `match_optional_whitespace`. + fn match_nothing(self) -> IonMatchResult<'data> { + // Use nom's `success` parser to return an empty slice from the head position + success(self.slice(0, 0))(self) + } + + /// Matches zero or more whitespace characters. + pub fn match_optional_whitespace(self) -> IonMatchResult<'data> { + // Either match whitespace and return what follows or just return the input as-is. + // This will always return `Ok`, but it is packaged as an IonMatchResult for compatability + // with other parsers. 
+ alt((Self::match_whitespace, Self::match_nothing))(self) + } + + /// Matches a single top-level scalar value, the beginning of a container, or an IVM. + pub fn match_top_level(self) -> IonParseResult<'data, RawStreamItem<'data, TextEncoding>> { + let (remaining, value) = match self.match_value() { + Ok(value) => value, + Err(e) => return Err(e), + }; + + // TODO: Augment this method to take an `is_complete` flag that indicates whether the absence + // of further values should return an `Incomplete` or a `RawStreamItem::EndOfStream`. + + // TODO: Check to see if `value` is actually an IVM. + // => If it's a symbol, try the IVM parser on it and see if it succeeds. + // For now, we just return the value. + Ok((remaining, RawStreamItem::Value(value))) + } + + /// Matches a single scalar value or the beginning of a container. + pub fn match_value(self) -> IonParseResult<'data, LazyRawTextValue<'data>> { + alt(( + // For `null` and `bool`, we use `read_` instead of `match_` because there's no additional + // parsing to be done. + map(match_and_length(Self::read_null), |(ion_type, length)| { + EncodedTextValue::new(MatchedValue::Null(ion_type), self.offset(), length) + }), + map(match_and_length(Self::read_bool), |(value, length)| { + EncodedTextValue::new(MatchedValue::Bool(value), self.offset(), length) + }), + // For `int` and the other types, we use `match` and store the partially-processed input in the + // `matched_value` field of the `EncodedTextValue` we return. + map( + match_and_length(Self::match_int), + |(matched_int, length)| { + EncodedTextValue::new(MatchedValue::Int(matched_int), self.offset(), length) + }, + ), + // TODO: The other Ion types + )) + .map(|encoded_value| LazyRawTextValue { + encoded_value, + input: self, + }) + .parse(self) + } + + /// Matches a boolean value. + pub fn match_bool(self) -> IonMatchResult<'data> { + recognize(Self::read_bool)(self) + } + + /// Matches and returns a boolean value. + pub fn read_bool(self) -> IonParseResult<'data, bool> { + terminated( + alt((value(true, tag("true")), value(false, tag("false")))), + Self::peek_stop_character, + )(self) + } + + /// Matches any type of null. (`null`, `null.null`, `null.int`, etc) + pub fn match_null(self) -> IonMatchResult<'data> { + recognize(Self::read_null)(self) + } + + /// Matches and returns a null value. + pub fn read_null(self) -> IonParseResult<'data, IonType> { + delimited( + tag("null"), + opt(preceded(char('.'), Self::read_ion_type)), + Self::peek_stop_character, + ) + .map(|explicit_ion_type| explicit_ion_type.unwrap_or(IonType::Null)) + .parse(self) + } + + /// Matches and returns an Ion type. + fn read_ion_type(self) -> IonParseResult<'data, IonType> { + alt(( + value(IonType::Null, tag("null")), + value(IonType::Bool, tag("bool")), + value(IonType::Int, tag("int")), + value(IonType::Float, tag("float")), + value(IonType::Decimal, tag("decimal")), + value(IonType::Timestamp, tag("timestamp")), + value(IonType::Symbol, tag("symbol")), + value(IonType::String, tag("string")), + value(IonType::Clob, tag("clob")), + value(IonType::Blob, tag("blob")), + value(IonType::List, tag("list")), + value(IonType::SExp, tag("sexp")), + value(IonType::Struct, tag("struct")), + ))(self) + } + + /// Matches any one of Ion's stop characters. + fn match_stop_character(self) -> IonMatchResult<'data> { + recognize(one_of("{}[](),\"' \t\n\r\u{0b}\u{0c}")).parse(self) + } + + /// Matches--but does not consume--any one of Ion's stop characters. 
+ fn peek_stop_character(self) -> IonMatchResult<'data> { + peek(Self::match_stop_character).parse(self) + } + + /// Matches the three parts of an int--its base, its sign, and its digits--without actually + /// constructing an Int from them. + fn match_int(self) -> IonParseResult<'data, MatchedInt> { + terminated( + // We test for base 16 and base 2 so the '0x' or '0b' isn't confused for a leading zero + // in a base 10 number, which would be illegal. + alt(( + Self::match_base_2_int, + Self::match_base_16_int, + Self::match_base_10_int, + )), + Self::peek_stop_character, + )(self) + } + + /// Matches a base-2 notation integer (e.g. `0b0`, `0B1010`, or `-0b0111`) and returns the + /// partially parsed value as a [`MatchedInt`]. + fn match_base_2_int(self) -> IonParseResult<'data, MatchedInt> { + separated_pair( + opt(char('-')), + alt((tag("0b"), tag("0B"))), + Self::match_base_2_int_digits, + ) + .map(|(maybe_sign, digits)| { + MatchedInt::new(2, maybe_sign.is_some(), digits.offset() - self.offset()) + }) + .parse(self) + } + + /// Matches the digits of a base-2 integer. + fn match_base_2_int_digits(self) -> IonMatchResult<'data> { + recognize(terminated( + // Zero or more digits-followed-by-underscores + many0_count(pair(is_a("01"), char('_'))), + // One or more digits + is_a("01"), + ))(self) + } + + /// Matches a base-10 notation integer (e.g. `0`, `255`, or `-1_024`) and returns the partially + /// parsed value as a [`MatchedInt`]. + fn match_base_10_int(self) -> IonParseResult<'data, MatchedInt> { + pair(opt(char('-')), Self::match_base_10_int_digits) + .map(|(maybe_sign, digits)| { + MatchedInt::new(10, maybe_sign.is_some(), digits.offset() - self.offset()) + }) + .parse(self) + } + + /// Matches the digits of a base-10 integer. (i.e. An integer without a sign.) + fn match_base_10_int_digits(self) -> IonMatchResult<'data> { + alt(( + // The number is either a zero... + recognize(char('0')), + // Or it's a non-zero followed by some number of '_'-separated digits + Self::match_base_10_digits_before_dot, + ))(self) + } + + /// Matches either: + /// * a zero + /// * a non-zero followed by some number of digits with optional underscores + fn match_base_10_digits_before_dot(self) -> IonMatchResult<'data> { + alt(( + tag("0"), + recognize(pair( + Self::match_base_10_leading_digit, + Self::match_base_10_trailing_digits, + )), + ))(self) + } + + /// Matches the first digit of a multi-digit base-10 integer. (i.e. Any digit but zero.) + fn match_base_10_leading_digit(self) -> IonMatchResult<'data> { + recognize(one_of("123456789"))(self) + } + + /// Matches any number of digits with underscores optionally appearing in the middle. + /// This parser accepts leading zeros, which is why it cannot be used for the beginning + /// of a number. + fn match_base_10_trailing_digits(self) -> IonMatchResult<'data> { + recognize(many0_count(pair(opt(char('_')), digit1)))(self) + } + + /// Matches a base-10 notation integer (e.g. `0x0`, `0X20`, or `-0xCAFE`) and returns the + /// partially parsed value as a [`MatchedInt`]. 
+ fn match_base_16_int(self) -> IonParseResult<'data, MatchedInt> { + separated_pair( + opt(char('-')), + alt((tag("0x"), tag("0X"))), + Self::match_base_16_int_trailing_digits, + ) + .map(|(maybe_sign, digits)| { + MatchedInt::new(16, maybe_sign.is_some(), digits.offset() - self.offset()) + }) + .parse(self) + } + + /// Matches the digits that follow the '0x' or '0X' in a base-16 integer + fn match_base_16_int_trailing_digits(self) -> IonMatchResult<'data> { + recognize(terminated( + // Zero or more digits-followed-by-underscores + many0_count(pair(Self::take_base_16_digits1, char('_'))), + // One or more digits + Self::take_base_16_digits1, + ))(self) + } + + /// Recognizes 1 or more consecutive base-16 digits. + // This function's "1" suffix is a style borrowed from `nom`. + fn take_base_16_digits1(self) -> IonMatchResult<'data> { + take_while1(|b: u8| b.is_ascii_hexdigit())(self) + } +} + +// === nom trait implementations === +// The trait implementations that follow are necessary for `TextBufferView` to be used as an input +// type in `nom` parsers. (`nom` only supports `&str` and `&[u8]` out of the box.) Defining our own +// input type makes it possible for us to carry around additional context during the parsing process, +// which is important for providing helpful error messages. For example: we can include the absolute +// offset of the input slice currently being read in our error messages. +// +// As `TextBufferView` is just a wrapper around a `&[u8]`, these implementations mostly delegate +// to the existing trait impls for `&[u8]`. + +impl<'data> nom::InputTake for TextBufferView<'data> { + fn take(&self, count: usize) -> Self { + self.slice(0, count) + } + + fn take_split(&self, count: usize) -> (Self, Self) { + let (before, after) = self.data.split_at(count); + let buffer_before = TextBufferView::new_with_offset(before, self.offset()); + let buffer_after = TextBufferView::new_with_offset(after, self.offset() + count); + // Nom's convention is to place the remaining portion of the buffer first, which leads to + // a potentially surprising reversed tuple order. + (buffer_after, buffer_before) + } +} + +impl<'data> nom::InputLength for TextBufferView<'data> { + fn input_len(&self) -> usize { + self.len() + } +} + +impl<'data> nom::InputIter for TextBufferView<'data> { + type Item = u8; + type Iter = Enumerate; + type IterElem = Copied>; + + fn iter_indices(&self) -> Self::Iter { + self.iter_elements().enumerate() + } + + fn iter_elements(&self) -> Self::IterElem { + self.data.iter().copied() + } + + fn position
<P>
(&self, predicate: P) -> Option + where + P: Fn(Self::Item) -> bool, + { + self.data.iter().position(|b| predicate(*b)) + } + + fn slice_index(&self, count: usize) -> Result { + self.data.slice_index(count) + } +} + +impl<'a, 'b> nom::Compare<&'a str> for TextBufferView<'b> { + fn compare(&self, t: &'a str) -> CompareResult { + self.data.compare(t.as_bytes()) + } + + fn compare_no_case(&self, t: &'a str) -> CompareResult { + self.data.compare_no_case(t.as_bytes()) + } +} + +impl<'data> nom::Offset for TextBufferView<'data> { + fn offset(&self, second: &Self) -> usize { + self.data.offset(second.data) + } +} + +impl<'data> nom::Slice> for TextBufferView<'data> { + fn slice(&self, range: RangeFrom) -> Self { + self.slice_to_end(range.start) + } +} + +impl<'data> nom::Slice> for TextBufferView<'data> { + fn slice(&self, range: RangeTo) -> Self { + self.slice(0, range.end) + } +} + +impl<'data> nom::InputTakeAtPosition for TextBufferView<'data> { + type Item = u8; + + fn split_at_position>(&self, predicate: P) -> IResult + where + P: Fn(Self::Item) -> bool, + { + match self.data.iter().position(|c| predicate(*c)) { + Some(i) => Ok(self.take_split(i)), + None => Err(nom::Err::Incomplete(Needed::new(1))), + } + } + + fn split_at_position1>( + &self, + predicate: P, + e: ErrorKind, + ) -> IResult + where + P: Fn(Self::Item) -> bool, + { + match self.data.iter().position(|c| predicate(*c)) { + Some(0) => Err(nom::Err::Error(E::from_error_kind(*self, e))), + Some(i) => Ok(self.take_split(i)), + None => Err(nom::Err::Incomplete(Needed::new(1))), + } + } + + fn split_at_position_complete>( + &self, + predicate: P, + ) -> IResult + where + P: Fn(Self::Item) -> bool, + { + match self.data.iter().position(|c| predicate(*c)) { + Some(i) => Ok(self.take_split(i)), + None => Ok(self.take_split(self.input_len())), + } + } + + fn split_at_position1_complete>( + &self, + predicate: P, + e: ErrorKind, + ) -> IResult + where + P: Fn(Self::Item) -> bool, + { + match self.data.iter().position(|c| predicate(*c)) { + Some(0) => Err(nom::Err::Error(E::from_error_kind(*self, e))), + Some(i) => Ok(self.take_split(i)), + None => { + if self.is_empty() { + Err(nom::Err::Error(E::from_error_kind(*self, e))) + } else { + Ok(self.take_split(self.input_len())) + } + } + } + } +} + +// === end of `nom` trait implementations + +/// Augments a given parser such that it returns the matched value and the number of input bytes +/// that it matched. +fn match_and_length<'data, P, O>( + mut parser: P, +) -> impl Parser, (O, usize), IonParseError<'data>> +where + P: Parser, O, IonParseError<'data>>, +{ + move |input: TextBufferView<'data>| { + let offset_before = input.offset(); + let (remaining, matched) = match parser.parse(input) { + Ok((remaining, matched)) => (remaining, matched), + Err(e) => return Err(e), + }; + let offset_after = remaining.offset(); + let match_length = offset_after - offset_before; + Ok((remaining, (matched, match_length))) + } +} + +/// Returns the number of bytes that the provided parser matched. +fn match_length<'data, P, O>( + parser: P, +) -> impl Parser, usize, IonParseError<'data>> +where + P: Parser, O, IonParseError<'data>>, +{ + // Call `match_and_length` and discard the output + match_and_length(parser).map(|(_output, match_length)| match_length) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Stores an input string that can be tested against a given parser. 
+ struct MatchTest { + input: String, + } + + impl MatchTest { + /// Takes an `input` string and appends a trailing space to it, guaranteeing that the + /// contents of the input are considered a complete token. + fn new(input: &str) -> Self { + MatchTest { + input: format!("{input} "), // add trailing space + } + } + + fn try_match<'data, P, O>(&'data self, parser: P) -> IonParseResult<'data, usize> + where + P: Parser, O, IonParseError<'data>>, + { + let buffer = TextBufferView::new(self.input.as_bytes()); + match_length(parser).parse(buffer) + } + + fn expect_match<'data, P, O>(&'data self, parser: P) + where + P: Parser, O, IonParseError<'data>>, + { + let result = self.try_match(parser); + let (_remaining, match_length) = result.unwrap(); + // Inputs have a trailing space that should _not_ be part of the match + assert_eq!( + match_length, + self.input.len() - 1, + "\nInput: '{}'\nMatched: '{}'\n", + self.input, + &self.input[..match_length] + ); + } + + fn expect_mismatch<'data, P, O>(&'data self, parser: P) + where + P: Parser, O, IonParseError<'data>>, + { + let result = self.try_match(parser); + // We expect this to fail for one reason or another + result.unwrap_err(); + } + } + + #[test] + fn test_match_stop_char() { + MatchTest::new(" ").expect_match(match_length(TextBufferView::match_stop_character)); + } + + #[test] + fn test_match_bool() { + fn match_bool(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_bool)); + } + fn mismatch_bool(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_bool)); + } + + match_bool("true"); + match_bool("false"); + + mismatch_bool("True"); + mismatch_bool("TRUE"); + mismatch_bool("False"); + mismatch_bool("FALSE"); + mismatch_bool("potato"); + mismatch_bool("42"); + } + + #[test] + fn test_match_null() { + fn match_null(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_null)); + } + fn mismatch_null(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_null)); + } + let good_inputs = &[ + "null", + "null.null", + "null.bool", + "null.int", + "null.float", + "null.decimal", + "null.timestamp", + "null.symbol", + "null.string", + "null.clob", + "null.blob", + "null.list", + "null.sexp", + "null.struct", + ]; + for input in good_inputs { + match_null(input); + } + + let bad_inputs = &[ + "-1", + "null.hello", + "nullnull", + "nullify", + "null..int", + "string.null", + ]; + for input in bad_inputs { + mismatch_null(input); + } + } + + #[test] + fn test_match_int() { + fn match_int(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_int)); + } + fn mismatch_int(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_int)); + } + let good_inputs = &[ + // Base 2 integers + "0b0", + "0B0", + "0b1", + "0B1", + "0b0001", + "0B1111", + "0b1111_1111", + "0B1010_1010", + // Base 10 integers + "0", + "13", + "942", + "7_216", + "1_000_000", + "9_999_999", + // Base 16 integers + "0x0", + "0x20", + "0x0A", + "0xcafe", + "0xCAFE", + "0XcaFE", + "0xC_A_F_E", + "0Xca_FE", + ]; + for input in good_inputs { + match_int(input); + let negative = format!("-{input}"); + match_int(&negative); + } + + let bad_inputs = &[ + "00", // Zero with leading zero + "0123", // Non-zero with leading zero + "--5", // Double negative + "+5", // Explicit positive + "1__000__000", // More than one underscore at a time + "_123", // Leading underscore + "0x0x5", // 
Multiple 0x prefixes + "0xx5", // Multiple Xs after 0 + "0x", // Base 16 prefix w/no number + "0b", // Base 2 prefix w/no number + ]; + for input in bad_inputs { + mismatch_int(input); + } + } +} diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs new file mode 100644 index 00000000..e1a3bcc1 --- /dev/null +++ b/src/lazy/text/encoded_value.rs @@ -0,0 +1,207 @@ +use crate::lazy::text::matched::MatchedValue; +use crate::IonType; +use std::ops::Range; + +/// Represents the type, offset, and length metadata of the various components of an encoded value +/// in a text input stream. +/// +/// Each [`LazyRawTextValue`](crate::lazy::text::value::LazyRawTextValue) contains an `EncodedValue`, +/// allowing a user to re-read (that is: parse) the body of the value as many times as necessary +/// without re-parsing its header information each time. +#[derive(Clone, Copy, Debug, PartialEq)] +pub(crate) struct EncodedTextValue { + // Each encoded text value has up to three components, appearing in the following order: + // + // [ field_name? | annotations? | data ] + // + // Components shown with a `?` are optional. + + // The following is an example encoding of a struct field with an annotated value-- the only kind + // of Ion value that has both of the optional components--that appears 5 gigabytes into the input + // stream: + // + // ┌─── field_name_offset: 12 + // │ ┌─── annotations_offset: 5 + // │ │ ┌─── data_offset: 5_000_000_012 + // price: USD::55.99, + // └─┬─┘ └─┬─┘└─┬─┘ + // │ │ └─ data_length: 5 + // │ └─ annotations_length: 5 + // └─ field_name_length: 5 + // + // Notice that only `data_offset` is an absolute offset from the beginning of the stream; + // this is because `data` is the only field that is always guaranteed to be present. + // `field_name_offset` and `annotations_offset` are stored as the number of bytes _before_ + // `data_offset`, allowing them to be stored in fewer bytes. + + // The absolute position (in bytes) of this value's `data` component within the overall stream + // being decoded. + data_offset: usize, + // The number of bytes _before_ `data_offset` at which the field name begins. If this value + // does not have a field name, this value will be zero. + field_name_offset: u32, + // The number of bytes _before_ `data_offset` at which the annotations sequence begins. + // If this value does not have a field name, this value will be zero. + annotations_offset: u32, + + // The number of bytes used to encode the data component of this Ion value. + data_length: usize, + // The number of bytes used to encode the field name preceding the data, if any. + // If there is no field name (i.e. the value is not inside a struct), this will be zero. + // If there is whitespace before the field name, this will not include it. + field_name_length: u32, + // The number of bytes used to encode the annotations sequence preceding the data, if any. + // If there is no annotations sequence, this will be zero. // If there is whitespace before the + // annotations sequence, this will not include it. + annotations_length: u32, + + // Information that was recorded about the value as it was being matched. + // For some types (e.g. bool), matching the text is the complete parsing process so the whole + // value is stored. For others (e.g. a timestamp), the various components of the value are + // recognized during matching and partial information like subfield offsets can be stored here. 
+ matched_value: MatchedValue, +} + +impl EncodedTextValue { + pub(crate) fn new( + matched_value: MatchedValue, + offset: usize, + length: usize, + ) -> EncodedTextValue { + EncodedTextValue { + data_offset: offset, + data_length: length, + field_name_length: 0, + field_name_offset: 0, + annotations_offset: 0, + annotations_length: 0, + matched_value, + } + } + + // The field name range should contain the field name literal itself without any trailing + // whitespace or the delimiting ':'. + // Examples: + // foo + // 'foo' + // "foo" + // $10 + pub(crate) fn with_field_name(mut self, offset: usize, length: usize) -> EncodedTextValue { + self.field_name_offset = (self.data_offset - offset) as u32; + self.field_name_length = length as u32; + self + } + + // The annotations should include all of the symbol tokens, their delimiting '::'s, and any + // interstitial whitespace. It should not include any leading/trailing whitespace or the value + // itself. + // Examples: + // foo::bar:: + // 'foo'::'bar':: + // foo :: 'bar' :: + pub(crate) fn with_annotations_sequence( + mut self, + offset: usize, + length: usize, + ) -> EncodedTextValue { + self.annotations_offset = (self.data_offset - offset) as u32; + self.annotations_length = length as u32; + self + } + + pub fn ion_type(&self) -> IonType { + match self.matched_value { + MatchedValue::Null(ion_type) => ion_type, + MatchedValue::Bool(_) => IonType::Bool, + MatchedValue::Int(_) => IonType::Int, + } + } + + pub fn is_null(&self) -> bool { + matches!(self.matched_value, MatchedValue::Null(_)) + } + + pub fn data_length(&self) -> usize { + self.data_length + } + + pub fn data_range(&self) -> Range { + self.data_offset..(self.data_offset + self.data_length) + } + + pub fn field_name_range(&self) -> Option> { + if self.field_name_offset == 0 { + return None; + } + let start = self.data_offset - (self.field_name_offset as usize); + let end = start + (self.field_name_length as usize); + Some(start..end) + } + + pub fn annotations_range(&self) -> Option> { + if self.annotations_offset == 0 { + return None; + } + let start = self.data_offset - (self.annotations_offset as usize); + let end = start + (self.annotations_length as usize); + Some(start..end) + } + + pub fn has_field_name(&self) -> bool { + self.field_name_offset > 0 + } + + pub fn has_annotations(&self) -> bool { + self.annotations_offset > 0 + } + + /// Returns the total number of bytes used to represent the current value, including the + /// field ID (if any), its annotations (if any), its header (type descriptor + length bytes), + /// and its value. 
+ pub fn total_length(&self) -> usize { + self.data_length + u32::max(self.annotations_offset, self.field_name_offset) as usize + } + + pub fn matched(&self) -> MatchedValue { + self.matched_value + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn total_length_data_only() { + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12); + assert_eq!(value.total_length(), 12); + } + + #[test] + fn total_length_data_with_field_name() { + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) + .with_field_name(90, 4); + assert_eq!(value.total_length(), 22); + } + + #[test] + fn total_length_data_with_annotations() { + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) + .with_annotations_sequence(90, 4); + assert_eq!(value.total_length(), 22); + } + + #[test] + fn total_length_data_with_field_name_and_annotations() { + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) + .with_field_name(90, 4) + .with_annotations_sequence(94, 6); + assert_eq!(value.total_length(), 22); + + // Same test but with extra whitespace between the components + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) + .with_field_name(80, 4) + .with_annotations_sequence(91, 6); + assert_eq!(value.total_length(), 32, "{:?}", value); + } +} diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs new file mode 100644 index 00000000..b3f79056 --- /dev/null +++ b/src/lazy/text/matched.rs @@ -0,0 +1,109 @@ +//! Types in this module represent partially parsed values from the text Ion input stream. +//! +//! Ion readers are not necessarily interested in every value in the input. While the binary reader +//! is able to skip over uninteresting values using their length prefix, text readers must parse +//! every value in the stream in order to access the ones that follow. +//! +//! A somewhat naive implementation of a text reader might fully read each value in the input +//! stream eagerly, simply discarding values that the user doesn't request. This approach is +//! technically correct, but incurs the expense of validating and materializing data that will +//! ultimately be ignored. (As an example: consider a timestamp, which can have up to ~9 subfields +//! to check for syntactic and semantic correctness.) +//! +//! In contrast, when the lazy text reader is asked for the `next()` value in the stream, it uses its +//! Ion parser to identify the next slice of input that contains either a complete scalar value or +//! the beginning of a container. It stores an intermediate representation (IR) of that value using +//! one of the types defined in this module. The IR stores the value's Ion type, subfield offsets, +//! and other information that is identified in the process of parsing the next value. Later, if the +//! application asks to `read()` the value, the reader does not have to start from scratch. It can +//! use the previously recorded information to minimize the amount of information that needs to be +//! re-discovered. + +use crate::lazy::text::as_utf8::AsUtf8; +use crate::lazy::text::buffer::TextBufferView; +use crate::result::IonFailure; +use crate::{Int, IonResult, IonType}; +use num_bigint::BigInt; +use num_traits::Num; +use smallvec::SmallVec; +use std::num::IntErrorKind; + +/// A partially parsed Ion value. +#[derive(Copy, Clone, Debug, PartialEq)] +pub(crate) enum MatchedValue { + // `Null` and `Bool` are fully parsed because they only involve matching a keyword. 
+ Null(IonType), + Bool(bool), + Int(MatchedInt), + // TODO: ...the other types +} + +/// A partially parsed Ion int. +#[derive(Copy, Clone, Debug, PartialEq)] +pub(crate) struct MatchedInt { + radix: u32, + digits_offset: usize, + is_negative: bool, +} + +impl MatchedInt { + // Integers that take more than 32 bytes to represent will heap allocate a larger buffer. + const STACK_ALLOC_BUFFER_CAPACITY: usize = 32; + + /// Constructs a new `MatchedInt`. + pub fn new(radix: u32, is_negative: bool, digits_offset: usize) -> Self { + Self { + radix, + digits_offset, + is_negative, + } + } + + /// Whether the partially parsed int began with a `-` + pub fn is_negative(&self) -> bool { + self.is_negative + } + + /// One of: `2`, `10`, or `16`, as determined by whether the partially parsed integer began + /// with a `0b`/`0B`, `0x`/`0X`, or no prefix. + pub fn radix(&self) -> u32 { + self.radix + } + + /// Attempts to finish reading the partially parsed integer. + pub fn read(&self, matched_input: TextBufferView) -> IonResult { + let digits = matched_input.slice_to_end(self.digits_offset); + let mut sanitized: SmallVec<[u8; Self::STACK_ALLOC_BUFFER_CAPACITY]> = + SmallVec::with_capacity(Self::STACK_ALLOC_BUFFER_CAPACITY); + // Copy the input text over to the sanitization buffer, discarding any underscores. These + // are legal input, but Rust's integer `from_str_radix` method does not support them. + sanitized.extend(digits.bytes().iter().copied().filter(|b| *b != b'_')); + // Note: This UTF-8 validation step should be unnecessary as the parser only recognizes + // ASCII integer characters. If this shows up in profiling, we could consider skipping it. + let text = sanitized.as_utf8(matched_input.offset())?; + let int: Int = match i64::from_str_radix(text, self.radix()) { + Ok(i) => i.into(), + Err(parse_int_error) => { + debug_assert!( + // `from_str_radix` can fail for a variety of reasons, but our rules for matching an + // int rule out most of them (empty str, invalid digit, etc). The only ones that should + // happen are overflow and underflow. In those cases, we fall back to using `BigInt`. + parse_int_error.kind() == &IntErrorKind::NegOverflow + || parse_int_error.kind() == &IntErrorKind::PosOverflow + ); + + match BigInt::from_str_radix(text, self.radix()) { + Ok(big_int) => big_int.into(), + Err(_big_parse_int_error) => { + return IonResult::decoding_error(format!( + "unexpected error while parsing int: '{}'", + std::str::from_utf8(matched_input.bytes()).unwrap_or("invalid UTF-8") + )) + } + } + } + }; + + Ok(int) + } +} diff --git a/src/lazy/text/mod.rs b/src/lazy/text/mod.rs new file mode 100644 index 00000000..a9a2cea2 --- /dev/null +++ b/src/lazy/text/mod.rs @@ -0,0 +1,7 @@ +mod as_utf8; +pub mod buffer; +pub mod encoded_value; +pub mod matched; +pub mod parse_result; +pub mod raw; +pub mod value; diff --git a/src/lazy/text/parse_result.rs b/src/lazy/text/parse_result.rs new file mode 100644 index 00000000..7da90511 --- /dev/null +++ b/src/lazy/text/parse_result.rs @@ -0,0 +1,274 @@ +//! The [`nom` parser combinator crate](https://docs.rs/nom/latest/nom/) intentionally provides +//! bare-bones error reporting by default. Each error contains only a `&str` representing the input +//! that could not be matched and an [`ErrorKind`] enum variant indicating which `nom` parser produced +//! the error. This stack-allocated type is very cheap to create, which is important because a +//! typical parse will require creating large numbers of short-lived error values. +//! +//! 
This module defines `IonParseError`, a custom error type that can capture more information than is +//! supported by [`nom::error::Error`]. It also defines `IonParseResult`, a type alias for an +//! [`IResult`] that parses `TextBufferView`s and produces `IonParseError`s if something goes wrong. + +use crate::lazy::text::buffer::TextBufferView; +use crate::position::Position; +use crate::result::{DecodingError, IonFailure}; +use crate::{IonError, IonResult}; +use nom::error::{Error as NomError, ErrorKind, ParseError}; +use nom::{Err, IResult}; +use std::borrow::Cow; +use std::fmt::{Debug, Display}; + +/// A type alias for a [`IResult`] whose input is a `TextBufferView` and whose error type is an +/// [`InvalidInputError`]. All of the Ion parsers in the `text::parsers` module return an +/// [`IonParseResult`]. +/// +/// If the parser is successful, it will return `Ok(output_value)`. If it encounters a problem, +/// it will return a `nom::Err`. [nom::Err] is a generic enum with three possible +/// variants: +/// 1. `Incomplete(_)` indicates that there wasn't enough input data to determine whether the +/// parser should match or not. +/// 2. `Error(ion_parse_error)` indicates that the parser did not match the input text. +/// 3. `Failure(ion_parse_error)` indicates that the parser matched the text but encountered +/// a problem when trying to materialize it into the `output_value`. In such cases, returning a +/// `Failure` signals that this was the correct parser to handle the input but it could not +/// be processed successfully for some reason. For example, a parser trying to match a number of +/// hours and minutes might match the text `11:71`, but fail when it tries to turn `71` into a +/// number of minutes because it's `>=60`. We know this was the right parser, but it wasn't +/// able to process it. (This is slightly contrived; it would be possible to write a parser +/// that rejected `71` as a number of minutes based on syntax alone.) +pub(crate) type IonParseResult<'a, O> = IResult, O, IonParseError<'a>>; +// Functions that return IonParseResult parse TextBufferView-^ ^ ^ +// ...return a value of type `O` -----+ | +// ...or a nom::Err if something goes wrong ----+ + +/// As above, but for parsers that simply identify (i.e. 'match') a slice of the input as a +/// particular item. +pub(crate) type IonMatchResult<'a> = + IResult, TextBufferView<'a>, IonParseError<'a>>; + +#[derive(Debug, PartialEq)] +pub enum IonParseError<'data> { + // When nom reports that the data was incomplete, it doesn't provide additional context. + Incomplete, + // When we encounter illegal text Ion, we'll have more information to share with the user. + Invalid(InvalidInputError<'data>), +} + +/// Describes a problem that occurred while trying to parse a given input `TextBufferView`. +/// +/// When returned as part of an `IonParseResult`, an `IonParseError` is always wrapped in +/// a [nom::Err] (see `IonParseResult`'s documentation for details). If the `nom::Err` is +/// a non-fatal `Error`, the `IonParseError`'s `description` will be `None`. If the `nom::Err` is +/// a fatal `Failure`, the `description` will be `Some(String)`. In this way, using an +/// `IonParseError` only incurs heap allocation costs when parsing is coming to an end. 
+#[derive(Debug, PartialEq)] +pub struct InvalidInputError<'data> { + // The input that being parsed when the error arose + input: TextBufferView<'data>, + // A human-friendly name for what the parser was working on when the error occurred + label: Option>, + // The nature of the error--what went wrong? + description: Option>, + // A backtrace of errors that occurred leading to this one. + // XXX: This is the most expensive part of error handling and is likely not very useful. + // Consider removing it if it doesn't carry its weight. + backtrace: Vec>, + // The nom ErrorKind, which indicates which nom-provided parser encountered the error we're + // bubbling up. + nom_error_kind: Option, +} + +impl<'data> InvalidInputError<'data> { + /// Constructs a new `IonParseError` from the provided `input` text. + pub(crate) fn new(input: TextBufferView<'data>) -> Self { + InvalidInputError { + input, + label: None, + description: None, + nom_error_kind: None, + backtrace: Vec::new(), + } + } + + /// Constructs a new `IonParseError` from the provided `input` text and `description`. + pub(crate) fn with_label>>(mut self, label: D) -> Self { + self.label = Some(label.into()); + self + } + + /// Constructs a new `IonParseError` from the provided `input` text and `description`. + pub(crate) fn with_description>>(mut self, description: D) -> Self { + self.description = Some(description.into()); + self + } + + /// Constructs a new `IonParseError` from the provided `input` text and `description`. + pub(crate) fn with_nom_error_kind(mut self, nom_error_kind: ErrorKind) -> Self { + self.nom_error_kind = Some(nom_error_kind); + self + } + + pub(crate) fn append_error(&mut self, error: InvalidInputError<'data>) { + self.backtrace.push(error) + } + + /// Returns a reference to the `description` text, if any. + pub fn description(&self) -> Option<&str> { + self.description.as_deref() + } + + pub fn label(&self) -> Option<&str> { + self.label.as_deref() + } + + // TODO: Decide how to expose 'input'. +} + +// impl<'data> From> for IonError { +// fn from(value: InvalidInputError) -> Self { +// dbg!(&value.backtrace); +// let mut message = String::from(value.description().unwrap_or("invalid text Ion syntax")); +// if let Some(label) = value.label { +// message.push_str(" while "); +// message.push_str(label.as_ref()); +// } +// let position = Position::with_offset(value.input.offset()).with_length(value.input.len()); +// let decoding_error = DecodingError::new(message).with_position(position); +// IonError::Decoding(decoding_error) +// } +// } + +impl<'data> From> for IonParseError<'data> { + fn from(value: InvalidInputError<'data>) -> Self { + IonParseError::Invalid(value) + } +} + +impl<'data> From>> for IonParseError<'data> { + fn from(value: Err>) -> Self { + match value { + Err::Incomplete(_) => IonParseError::Incomplete, + Err::Error(e) => e, + Err::Failure(e) => e, + } + } +} + +/// Allows an `IonParseError` to be constructed from a `(&str, ErrorKind)` tuple, which is the +/// data provided by core `nom` parsers if they do not match the input. +impl<'data> From<(TextBufferView<'data>, ErrorKind)> for IonParseError<'data> { + fn from((input, error_kind): (TextBufferView<'data>, ErrorKind)) -> Self { + InvalidInputError::new(input) + .with_nom_error_kind(error_kind) + .into() + } +} + +/// Allows a [nom::error::Error] to be converted into an [IonParseError] by calling `.into()`. 
+/// Allows a [nom::error::Error] to be converted into an [IonParseError] by calling `.into()`.
+impl<'data> From<NomError<TextBufferView<'data>>> for IonParseError<'data> {
+    fn from(nom_error: NomError<TextBufferView<'data>>) -> Self {
+        InvalidInputError::new(nom_error.input)
+            .with_nom_error_kind(nom_error.code)
+            .into()
+    }
+}
+
+/// Allows `IonParseError` to be used as the error type in various `nom` functions.
+impl<'data> ParseError<TextBufferView<'data>> for IonParseError<'data> {
+    fn from_error_kind(input: TextBufferView<'data>, error_kind: ErrorKind) -> Self {
+        InvalidInputError::new(input)
+            .with_nom_error_kind(error_kind)
+            .into()
+    }
+
+    fn append(input: TextBufferView<'data>, kind: ErrorKind, mut other: Self) -> Self {
+        // When an error stack is being built, this method is called to give the error
+        // type an opportunity to aggregate the errors into a collection or a more descriptive
+        // message. For now, we simply append the new error to the backtrace.
+        let new_error = InvalidInputError::new(input).with_nom_error_kind(kind);
+        if let IonParseError::Invalid(invalid_input_error) = &mut other {
+            invalid_input_error.backtrace.push(new_error)
+        }
+        other
+    }
+}
+
+pub(crate) trait AddContext<'data, T> {
+    fn with_context(
+        self,
+        label: impl Into<Cow<'static, str>>,
+        input: TextBufferView<'data>,
+    ) -> IonResult<(TextBufferView<'data>, T)>;
+}
+
+impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> {
+    fn with_context(
+        self,
+        label: impl Into<Cow<'static, str>>,
+        input: TextBufferView<'data>,
+    ) -> IonResult<(TextBufferView<'data>, T)> {
+        match self {
+            // No change needed in the Ok case
+            Ok(matched) => Ok(matched),
+            // If the parse failed, convert the nom error into an IonParseError...
+            Err(e) => {
+                // ...and then map it to an IonError that records the label and position.
+                match IonParseError::from(e) {
+                    IonParseError::Incomplete => IonResult::incomplete(label, input.offset()),
+                    IonParseError::Invalid(invalid_input_error) => {
+                        dbg!(&invalid_input_error.backtrace);
+                        let mut message = String::from(
+                            invalid_input_error
+                                .description()
+                                .unwrap_or("invalid text Ion syntax"),
+                        );
+                        if let Some(label) = invalid_input_error.label {
+                            message.push_str(" while ");
+                            message.push_str(label.as_ref());
+                        }
+                        let position = Position::with_offset(invalid_input_error.input.offset())
+                            .with_length(invalid_input_error.input.len());
+                        let decoding_error = DecodingError::new(message).with_position(position);
+                        Err(IonError::Decoding(decoding_error))
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// Constructs a `nom::Err::Failure` that contains an `IonParseError` describing the problem
+/// that was encountered.
+pub(crate) fn fatal_parse_error<D: Into<Cow<'static, str>>, O>(
+    input: TextBufferView,
+    description: D,
+) -> IonParseResult<O> {
+    Err(nom::Err::Failure(
+        InvalidInputError::new(input)
+            .with_description(description)
+            .into(),
+    ))
+}
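As a sketch of the `11:71` example from the module docs (illustration only, not part of the patch), a parser that has already matched a two-digit minutes field might reject an out-of-range value like this:

    // Illustration only: `input` and `minutes` are assumed to come from an earlier,
    // successful syntactic match.
    fn validate_minutes(input: TextBufferView, minutes: u32) -> IonParseResult<u32> {
        if minutes >= 60 {
            // The text matched, so this was the right parser; reporting a Failure
            // (rather than an Error) stops nom from trying other parsers.
            return fatal_parse_error(input, format!("minutes value {minutes} is >= 60"));
        }
        Ok((input, minutes))
    }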
+/// An extension trait that allows a [std::result::Result] of any kind to be mapped to an
+/// `IonParseResult` concisely.
+pub(crate) trait OrFatalParseError<T> {
+    fn or_fatal_parse_error<L: Display>(self, input: TextBufferView, label: L)
+        -> IonParseResult<T>;
+}
+
+/// See the documentation for [OrFatalParseError].
+impl<T, E> OrFatalParseError<T> for Result<T, E>
+where
+    E: Debug,
+{
+    fn or_fatal_parse_error<L: Display>(
+        self,
+        input: TextBufferView,
+        label: L,
+    ) -> IonParseResult<T> {
+        match self {
+            Ok(value) => Ok((input, value)),
+            Err(error) => fatal_parse_error(input, format!("{label}: {error:?}")),
+        }
+    }
+}
diff --git a/src/lazy/text/raw/mod.rs b/src/lazy/text/raw/mod.rs
new file mode 100644
index 00000000..1077754f
--- /dev/null
+++ b/src/lazy/text/raw/mod.rs
@@ -0,0 +1 @@
+pub mod reader;
diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs
new file mode 100644
index 00000000..22eedf61
--- /dev/null
+++ b/src/lazy/text/raw/reader.rs
@@ -0,0 +1,133 @@
+use crate::lazy::decoder::LazyRawReader;
+use crate::lazy::encoding::TextEncoding;
+use crate::lazy::raw_stream_item::RawStreamItem;
+use crate::lazy::text::buffer::TextBufferView;
+use crate::lazy::text::parse_result::AddContext;
+use crate::result::IonFailure;
+use crate::IonResult;
+
+/// A text Ion 1.0 reader that yields [`RawStreamItem`]s representing the top-level values found
+/// in the provided input stream.
+pub struct LazyRawTextReader<'data> {
+    // The current view of the data we're reading from.
+    buffer: TextBufferView<'data>,
+    // Each time something is parsed from the buffer successfully, the number of bytes that can
+    // be skipped the next time the reader advances is recorded here.
+    bytes_to_skip: usize,
+}
+
+impl<'data> LazyRawTextReader<'data> {
+    /// Constructs a `LazyRawTextReader` positioned at the beginning of the provided input stream.
+    pub fn new(data: &'data [u8]) -> LazyRawTextReader<'data> {
+        Self::new_with_offset(data, 0)
+    }
+
+    /// Constructs a `LazyRawTextReader` positioned at the beginning of the provided input stream.
+    /// The provided input stream is itself a slice starting `offset` bytes from the beginning
+    /// of a larger data stream. This offset is used for reporting the absolute (stream-level)
+    /// position of values encountered in `data`.
+    fn new_with_offset(data: &'data [u8], offset: usize) -> LazyRawTextReader<'data> {
+        LazyRawTextReader {
+            buffer: TextBufferView::new_with_offset(data, offset),
+            bytes_to_skip: 0,
+        }
+    }
+
+    pub fn next<'top>(&'top mut self) -> IonResult<RawStreamItem<'data, TextEncoding>>
+    where
+        'data: 'top,
+    {
+        let buffer = self.buffer;
+        if buffer.is_empty() {
+            return IonResult::incomplete("reading a top-level value", buffer.offset());
+        }
+        let (buffer_after_whitespace, _whitespace) = buffer
+            .match_optional_whitespace()
+            .with_context("skipping whitespace between top-level values", buffer)?;
+        let (remaining, matched) = buffer_after_whitespace
+            .match_top_level()
+            .with_context("reading a top-level value", buffer_after_whitespace)?;
+        // If we successfully moved to the next value, store the remaining buffer view
+        self.buffer = remaining;
+        Ok(matched)
+    }
+}
+
+impl<'data> LazyRawReader<'data, TextEncoding> for LazyRawTextReader<'data> {
+    fn new(data: &'data [u8]) -> Self {
+        LazyRawTextReader::new(data)
+    }
+
+    fn next<'a>(&'a mut self) -> IonResult<RawStreamItem<'data, TextEncoding>> {
+        self.next()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::lazy::decoder::LazyRawValue;
+    use crate::IonType;
+
+    #[test]
+    fn test_top_level() -> IonResult<()> {
+        let data = r#"
+            null
+            null.bool
+            null.int
+            false
+            true
+            500
+            0x20
+            0b0101
+        "#;
+        let mut reader = LazyRawTextReader::new(data.as_bytes());
+
+        // null
+        let lazy_untyped_null = reader.next()?.expect_value()?;
+        assert!(lazy_untyped_null.is_null());
+        assert_eq!(lazy_untyped_null.ion_type(), IonType::Null);
+
+        // null.bool
+        let lazy_null_bool = reader.next()?.expect_value()?;
+        assert!(lazy_null_bool.is_null());
+        assert_eq!(lazy_null_bool.ion_type(), IonType::Bool);
+
+        // null.int
+        let lazy_null_int = reader.next()?.expect_value()?;
+        assert!(lazy_null_int.is_null());
+        assert_eq!(lazy_null_int.ion_type(), IonType::Int);
+
+        // false
+        let lazy_bool_false = reader.next()?.expect_value()?;
+        assert!(!lazy_bool_false.is_null());
+        assert_eq!(lazy_bool_false.ion_type(), IonType::Bool);
+        assert!(!lazy_bool_false.read()?.expect_bool()?);
+
+        // true
+        let lazy_bool_true = reader.next()?.expect_value()?;
+        assert!(!lazy_bool_true.is_null());
+        assert_eq!(lazy_bool_true.ion_type(), IonType::Bool);
+        assert!(lazy_bool_true.read()?.expect_bool()?);
+
+        // 500
+        let lazy_int_decimal_500 = reader.next()?.expect_value()?;
+        assert!(!lazy_int_decimal_500.is_null());
+        assert_eq!(lazy_int_decimal_500.ion_type(), IonType::Int);
+        assert_eq!(lazy_int_decimal_500.read()?.expect_i64()?, 500);
+
+        // 0x20
+        let lazy_int_hex_20 = reader.next()?.expect_value()?;
+        assert!(!lazy_int_hex_20.is_null());
+        assert_eq!(lazy_int_hex_20.ion_type(), IonType::Int);
+        assert_eq!(lazy_int_hex_20.read()?.expect_i64()?, 0x20); // decimal 32
+
+        // 0b0101
+        let lazy_int_binary_0101 = reader.next()?.expect_value()?;
+        assert!(!lazy_int_binary_0101.is_null());
+        assert_eq!(lazy_int_binary_0101.ion_type(), IonType::Int);
+        assert_eq!(lazy_int_binary_0101.read()?.expect_i64()?, 0b0101); // decimal 5
+
+        Ok(())
+    }
+}
diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs
new file mode 100644
index 00000000..e586f677
--- /dev/null
+++ b/src/lazy/text/value.rs
@@ -0,0 +1,66 @@
+use crate::lazy::decoder::private::LazyRawValuePrivate;
+use crate::lazy::decoder::{LazyDecoder, LazyRawValue};
+use crate::lazy::encoding::TextEncoding;
+use crate::lazy::raw_value_ref::RawValueRef;
+use crate::lazy::text::buffer::TextBufferView;
+use crate::lazy::text::encoded_value::EncodedTextValue;
+use crate::lazy::text::matched::MatchedValue;
+use crate::{IonResult, IonType, RawSymbolTokenRef};
+use std::fmt;
+use std::fmt::{Debug, Formatter};
+
+/// A value that has been identified in the text input stream but whose data has not yet been read.
+///
+/// If only part of the value is in the input buffer, calls to [`LazyRawTextValue::read`] (which examines
+/// bytes beyond the value's header) may return [`IonError::Incomplete`](crate::result::IonError::Incomplete).
+///
+/// `LazyRawTextValue`s are "unresolved," which is to say that symbol values, annotations, and
+/// struct field names may or may not include a text definition. (This is less common in Ion's text
+/// format than in its binary format, but is still possible.) For a resolved lazy value that
+/// includes a text definition for these items whenever one exists, see
+/// [`crate::lazy::value::LazyValue`].
+#[derive(Clone)]
+pub struct LazyRawTextValue<'data> {
+    pub(crate) encoded_value: EncodedTextValue,
+    pub(crate) input: TextBufferView<'data>,
+}
+
+impl<'data> LazyRawValuePrivate<'data> for LazyRawTextValue<'data> {
+    fn field_name(&self) -> Option<RawSymbolTokenRef<'data>> {
+        todo!()
+    }
+}
+
+impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> {
+    fn ion_type(&self) -> IonType {
+        self.encoded_value.ion_type()
+    }
+
+    fn is_null(&self) -> bool {
+        self.encoded_value.is_null()
+    }
+
+    fn annotations(&self) -> <TextEncoding as LazyDecoder<'data>>::AnnotationsIterator {
+        todo!()
+    }
+
+    fn read(&self) -> IonResult<RawValueRef<'data, TextEncoding>> {
+        let matched_input = self.input.slice(0, self.encoded_value.data_length());
+        let value_ref = match self.encoded_value.matched() {
+            MatchedValue::Null(ion_type) => RawValueRef::Null(ion_type),
+            MatchedValue::Bool(b) => RawValueRef::Bool(b),
+            MatchedValue::Int(i) => RawValueRef::Int(i.read(matched_input)?),
+        };
+        Ok(value_ref)
+    }
+}
+
+impl<'a> Debug for LazyRawTextValue<'a> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "LazyRawTextValue {{\n val={:?},\n buf={:?}\n}}\n",
+            self.encoded_value, self.input
+        )
+    }
+}
diff --git a/src/lazy/value.rs b/src/lazy/value.rs
index d7ae1b64..8f09cdbf 100644
--- a/src/lazy/value.rs
+++ b/src/lazy/value.rs
@@ -1,5 +1,5 @@
-use crate::lazy::binary::encoding::BinaryEncoding;
 use crate::lazy::decoder::{LazyDecoder, LazyRawValue};
+use crate::lazy::encoding::BinaryEncoding;
 use crate::lazy::r#struct::LazyStruct;
 use crate::lazy::sequence::LazySequence;
 use crate::lazy::value_ref::ValueRef;
diff --git a/src/position.rs b/src/position.rs
index bb5d9648..413d82db 100644
--- a/src/position.rs
+++ b/src/position.rs
@@ -7,6 +7,7 @@ use std::fmt::{Display, Error};
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub struct Position {
     pub(crate) byte_offset: usize,
+    pub(crate) byte_length: Option<usize>,
     pub(crate) line_column: Option<(usize, usize)>,
 }
 
@@ -16,16 +17,20 @@ impl Position {
     pub fn with_offset(offset: usize) -> Self {
         Position {
             byte_offset: offset,
+            byte_length: None,
             line_column: None,
         }
     }
 
+    pub fn with_length(mut self, length: usize) -> Self {
+        self.byte_length = Some(length);
+        self
+    }
+
     /// Add line and column information to the current Position.
-    pub fn with_line_and_column(&self, line: usize, column: usize) -> Self {
-        Position {
-            line_column: Some((line, column)),
-            ..*self
-        }
+    pub fn with_line_and_column(mut self, line: usize, column: usize) -> Self {
+        self.line_column = Some((line, column));
+        self
     }
 
     /// Returns the offset from the start of the Ion stream in bytes.
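For illustration (not part of the patch), the `Position` builder now chains by value and can record an optional length; the offsets and line/column numbers below are invented, and `byte_length()` is the accessor added in the next hunk:

    fn position_example() {
        let position = Position::with_offset(42).with_length(7).with_line_and_column(3, 5);
        assert_eq!(position.byte_length(), Some(7));
        assert_eq!(position.line_and_column(), Some((3, 5)));
    }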
@@ -33,17 +38,22 @@ impl Position {
         self.byte_offset
     }
 
-    /// If available returns the text position as line and column offsets.
+    /// If available, returns the length of the input slice in question.
+    pub fn byte_length(&self) -> Option<usize> {
+        self.byte_length
+    }
+
+    /// If available, returns the text position as line and column offsets.
     pub fn line_and_column(&self) -> Option<(usize, usize)> {
         self.line_column
     }
 
-    /// If available returns the line component of the text position.
+    /// If available, returns the line component of the text position.
     pub fn line(&self) -> Option<usize> {
         self.line_column.map(|(line, _column)| line)
     }
 
-    /// If available returns the column component of the text position.
+    /// If available, returns the column component of the text position.
     pub fn column(&self) -> Option<usize> {
         self.line_column.map(|(_line, column)| column)
     }
diff --git a/src/result/decoding_error.rs b/src/result/decoding_error.rs
index e2fb39af..ade5d501 100644
--- a/src/result/decoding_error.rs
+++ b/src/result/decoding_error.rs
@@ -1,3 +1,4 @@
+use crate::position::Position;
 use std::borrow::Cow;
 use thiserror::Error;
 
@@ -6,12 +7,23 @@ use thiserror::Error;
 #[error("{description}")]
 pub struct DecodingError {
     description: Cow<'static, str>,
+    position: Option<Position>,
 }
 
 impl DecodingError {
     pub(crate) fn new(description: impl Into<Cow<'static, str>>) -> Self {
         DecodingError {
             description: description.into(),
+            position: None,
         }
     }
+
+    pub(crate) fn with_position(mut self, position: impl Into<Position>) -> Self {
+        self.position = Some(position.into());
+        self
+    }
+
+    pub fn position(&self) -> Option<&Position> {
+        self.position.as_ref()
+    }
 }
diff --git a/src/result/incomplete.rs b/src/result/incomplete.rs
index c47c721d..896b9a16 100644
--- a/src/result/incomplete.rs
+++ b/src/result/incomplete.rs
@@ -1,4 +1,5 @@
 use crate::position::Position;
+use std::borrow::Cow;
 use thiserror::Error;
 
 /// For non-blocking readers, indicates that there was not enough data available in the input buffer
@@ -6,14 +7,14 @@ use thiserror::Error;
 #[derive(Clone, Debug, Error, PartialEq)]
 #[error("ran out of input while reading {label} at offset {position}")]
 pub struct IncompleteError {
-    label: &'static str,
+    label: Cow<'static, str>,
     position: Position,
 }
 
 impl IncompleteError {
-    pub(crate) fn new(label: &'static str, position: impl Into<Position>) -> Self {
+    pub(crate) fn new(label: impl Into<Cow<'static, str>>, position: impl Into<Position>) -> Self {
         IncompleteError {
-            label,
+            label: label.into(),
             position: position.into(),
         }
     }
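For illustration (not part of the patch), an application could surface the newly attached position like this; `report` is an invented helper, and `Position`'s existing `Display` implementation is assumed for formatting:

    fn report(result: IonResult<()>) {
        if let Err(IonError::Decoding(e)) = result {
            if let Some(position) = e.position() {
                eprintln!("decoding error at {position}");
            }
        }
    }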
diff --git a/src/result/mod.rs b/src/result/mod.rs
index 8b47a476..157184ac 100644
--- a/src/result/mod.rs
+++ b/src/result/mod.rs
@@ -82,14 +82,14 @@ pub(crate) trait IonFailure {
     // an `IonError::Io` is by converting a `std::io::IoError` with the ? operator.
     // Because this trait is only crate-visible, methods can be added/changed as needed in
     // the future.
-    fn incomplete(label: &'static str, position: impl Into<Position>) -> Self;
+    fn incomplete(label: impl Into<Cow<'static, str>>, position: impl Into<Position>) -> Self;
     fn decoding_error<S: Into<Cow<'static, str>>>(description: S) -> Self;
     fn encoding_error<S: Into<Cow<'static, str>>>(description: S) -> Self;
     fn illegal_operation<S: Into<Cow<'static, str>>>(operation: S) -> Self;
 }
 
 impl IonFailure for IonError {
-    fn incomplete(label: &'static str, position: impl Into<Position>) -> Self {
+    fn incomplete(label: impl Into<Cow<'static, str>>, position: impl Into<Position>) -> Self {
         IncompleteError::new(label, position).into()
     }
 
@@ -107,7 +107,7 @@ impl IonFailure for IonError {
 }
 
 impl<T> IonFailure for IonResult<T> {
-    fn incomplete(label: &'static str, position: impl Into<Position>) -> Self {
+    fn incomplete(label: impl Into<Cow<'static, str>>, position: impl Into<Position>) -> Self {
         Err(IonError::incomplete(label, position))
     }
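A sketch (not part of the patch) of what the `Cow<'static, str>` label buys crate-internal callers of the `IonFailure` trait: both borrowed and owned labels are now accepted. The `field_name` parameter here is invented:

    fn incomplete_label_examples(offset: usize, field_name: &str) -> IonResult<()> {
        // A &'static str label still works exactly as before...
        let _borrowed: IonResult<()> = IonResult::incomplete("reading a value", offset);
        // ...and a String built at runtime is now accepted as well.
        IonResult::incomplete(format!("reading field '{field_name}'"), offset)
    }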