From e02d499eab4c36574cc74e5df4e3964613a519e6 Mon Sep 17 00:00:00 2001 From: Richard Giliam Date: Fri, 7 Jun 2024 05:08:32 -0700 Subject: [PATCH] Add support for delimited containers to the 1.1 binary reader --- src/lazy/binary/raw/v1_1/immutable_buffer.rs | 196 +++++++++++++++---- src/lazy/binary/raw/v1_1/reader.rs | 157 ++++++++++++++- src/lazy/binary/raw/v1_1/sequence.rs | 64 ++++-- src/lazy/binary/raw/v1_1/struct.rs | 80 ++++++-- src/lazy/binary/raw/v1_1/type_code.rs | 25 ++- src/lazy/binary/raw/v1_1/type_descriptor.rs | 16 ++ src/lazy/binary/raw/v1_1/value.rs | 5 + 7 files changed, 464 insertions(+), 79 deletions(-) diff --git a/src/lazy/binary/raw/v1_1/immutable_buffer.rs b/src/lazy/binary/raw/v1_1/immutable_buffer.rs index 5cb60868c..d46287008 100644 --- a/src/lazy/binary/raw/v1_1/immutable_buffer.rs +++ b/src/lazy/binary/raw/v1_1/immutable_buffer.rs @@ -14,7 +14,7 @@ use crate::lazy::decoder::{LazyRawValueExpr, RawValueExpr}; use crate::lazy::encoder::binary::v1_1::fixed_int::FixedInt; use crate::lazy::encoder::binary::v1_1::fixed_uint::FixedUInt; use crate::lazy::encoder::binary::v1_1::flex_int::FlexInt; -use crate::lazy::encoder::binary::v1_1::flex_sym::FlexSym; +use crate::lazy::encoder::binary::v1_1::flex_sym::{FlexSym, FlexSymValue}; use crate::lazy::encoder::binary::v1_1::flex_uint::FlexUInt; use crate::lazy::expanded::macro_table::MacroKind; use crate::lazy::expanded::EncodingContextRef; @@ -22,6 +22,7 @@ use crate::lazy::text::raw::v1_1::reader::MacroIdRef; use crate::result::IonFailure; use crate::{v1_1, HasRange, IonError, IonResult}; + /// A buffer of unsigned bytes that can be cheaply copied and which defines methods for parsing /// the various encoding elements of a binary Ion stream. /// @@ -247,7 +248,7 @@ impl<'a> ImmutableBuffer<'a> { /// from the buffer to interpret as the value's length. If it is successful, returns an `Ok(_)` /// containing a [FlexUInt] representation of the value's length. If no additional bytes were /// read, the returned `FlexUInt`'s `size_in_bytes()` method will return `0`. - pub fn read_value_length(self, header: Header) -> ParseResult<'a, FlexUInt> { + pub fn read_value_length(self, header: Header) -> ParseResult<'a, Option> { let length = match header.length_type() { LengthType::InOpcode(n) => { // FlexUInt represents the length, but is not physically present, hence the 0 size. @@ -257,13 +258,14 @@ impl<'a> ImmutableBuffer<'a> { let (flexuint, _) = self.read_flex_uint()?; flexuint } + LengthType::Unknown => return Ok((None, self)), }; let remaining = self; // TODO: Validate length to ensure it is a reasonable value. - Ok((length, remaining)) + Ok((Some(length), remaining)) } /// Reads a value without a field name from the buffer. This is applicable in lists, s-expressions, @@ -275,76 +277,202 @@ impl<'a> ImmutableBuffer<'a> { return Ok(None); } let mut input = self; - let mut type_descriptor = input.peek_opcode()?; + let mut opcode = input.peek_opcode()?; // If we find a NOP... - if type_descriptor.is_nop() { + if opcode.is_nop() { // ...skip through NOPs until we found the next non-NOP byte. - (_, input) = self.consume_nop_padding(type_descriptor)?; + (_, input) = self.consume_nop_padding(opcode)?; // If there is no next byte, we're out of values. if input.is_empty() { return Ok(None); } // Otherwise, there's a value. - type_descriptor = input.peek_opcode()?; + opcode = input.peek_opcode()?; } - if type_descriptor.is_e_expression() { + if opcode.is_e_expression() { return Ok(Some(RawValueExpr::EExp( - self.read_e_expression(type_descriptor)?, + self.read_e_expression(opcode)?, ))); } Ok(Some(RawValueExpr::ValueLiteral( - input.read_value(type_descriptor)?, + input.read_value(opcode)?, ))) } - /// Reads a value from the buffer. The caller must confirm that the buffer is not empty and that - /// the next byte (`type_descriptor`) is not a NOP. - pub fn read_value(self, opcode: Opcode) -> IonResult> { - if opcode.is_annotations_sequence() { - self.read_annotated_value(opcode) + pub(crate) fn peek_delimited_container( + self, + opcode: Opcode, + ) -> IonResult> { + use crate::IonType; + + if let Some(IonType::Struct) = opcode.ion_type { + self.peek_delimited_struct() } else { - self.read_value_without_annotations(opcode) + self.peek_delimited_sequence() } } - /// Reads a value from the buffer. The caller must confirm that the buffer is not empty and that - /// the next byte (`type_descriptor`) is neither a NOP nor an annotations wrapper. - fn read_value_without_annotations( - self, - type_descriptor: Opcode, - ) -> IonResult> { - let input = self; - let header = type_descriptor + pub(crate) fn peek_delimited_sequence(self) -> IonResult> { + let head_opcode = self.peek_opcode()?; + + let mut input = self.consume(1); + let header_offset = input.offset(); + let mut offsets = BumpVec::::new_in(self.context.allocator()); + + loop { + let opcode = input.peek_opcode()?; + if opcode.opcode_type == OpcodeType::DelimitedContainerClose { + offsets.push(input.offset()); + break; + } else if opcode.opcode_type == OpcodeType::Nop { + let res = input.consume_nop_padding(opcode)?; + input = res.1; + } else if let Some(value) = ImmutableBuffer::peek_sequence_value_expr(input)? { + offsets.push(input.offset()); + input = input.consume(value.range().len()); + } + } + + let header = head_opcode .to_header() .ok_or_else(|| IonError::decoding_error("found a non-value in value position"))?; + let value_body_length = *offsets.last().unwrap() - header_offset; + let total_length = 2 + value_body_length; // Opcode + Delimiter + Length + + let encoded_value = EncodedValue { + header, + annotations_header_length: 0, + annotations_sequence_length: 0, + annotations_encoding: AnnotationsEncoding::SymbolAddress, + header_offset, + length_length: 0, + total_length, + value_body_length, + }; + let lazy_value = LazyRawBinaryValue_1_1 { + encoded_value, + input: self, + delimited_offsets: Some(offsets.into_bump_slice()), + }; + Ok(lazy_value) + } + + pub(crate) fn peek_delimited_struct(self) -> IonResult> { + use crate::lazy::binary::raw::v1_1::OpcodeType; + + let head_opcode = self.peek_opcode()?; + let mut input = self.consume(1); + let mut offsets = BumpVec::::new_in(self.context.allocator()); let header_offset = input.offset(); - let (length, _) = input.consume(1).read_value_length(header)?; - let length_length = length.size_in_bytes() as u8; - let value_length = length.value() as usize; // ha - let total_length = 1 // Header byte - + length_length as usize - + value_length; + + loop { + let opcode = input.peek_opcode()?; + if opcode.opcode_type == OpcodeType::DelimitedContainerClose { + offsets.push(input.offset()); + break; + } else { + let (flexsym, after) = input.read_flex_sym()?; + let field_offset = match flexsym.value() { + FlexSymValue::SymbolRef(_sym) => input.offset(), + FlexSymValue::Opcode(_op) => todo!(), + }; + input = after; + + let mut opcode = input.peek_opcode()?; + if opcode.opcode_type == OpcodeType::Nop { + let res = input.consume_nop_padding(opcode)?; + input = res.1; + opcode = input.peek_opcode()?; + } + let value = input.read_value(opcode)?; + input = input.consume(value.encoded_value.total_length()); + offsets.push(field_offset); + } + } + + let header = head_opcode + .to_header() + .ok_or_else(|| IonError::decoding_error("found a non-value in value position"))?; + let value_body_length = *offsets.last().unwrap() - header_offset; + let total_length = 2 + value_body_length; let encoded_value = EncodedValue { header, - // If applicable, these are populated by the caller: `read_annotated_value()` annotations_header_length: 0, annotations_sequence_length: 0, annotations_encoding: AnnotationsEncoding::SymbolAddress, header_offset, - length_length, - value_body_length: value_length, + length_length: 0, total_length, + value_body_length, }; + let lazy_value = LazyRawBinaryValue_1_1 { encoded_value, - // If this value has a field ID or annotations, this will be replaced by the caller. input: self, + delimited_offsets: Some(offsets.into_bump_slice()), }; + Ok(lazy_value) } + /// Reads a value from the buffer. The caller must confirm that the buffer is not empty and that + /// the next byte (`type_descriptor`) is not a NOP. + pub fn read_value(self, opcode: Opcode) -> IonResult> { + if opcode.is_annotations_sequence() { + self.read_annotated_value(opcode) + } else { + self.read_value_without_annotations(opcode) + } + } + + /// Reads a value from the buffer. The caller must confirm that the buffer is not empty and that + /// the next byte (`type_descriptor`) is neither a NOP nor an annotations wrapper. + fn read_value_without_annotations( + self, + opcode: Opcode, + ) -> IonResult> { + let input = self; + let header = opcode + .to_header() + .ok_or_else(|| IonError::decoding_error("found a non-value in value position .."))?; + + if opcode.is_delimited() { + self.peek_delimited_container(opcode) + } else { + let header_offset = input.offset(); + let length = match input.consume(1).read_value_length(header)? { + (None, _) => FlexUInt::new(0, 0), // Delimited value, we do not know the size. + (Some(length), _) => length, + }; + let length_length = length.size_in_bytes() as u8; + let value_length = length.value() as usize; // ha + let total_length = 1 // Header byte + + length_length as usize + + value_length; + + let encoded_value = EncodedValue { + header, + // If applicable, these are populated by the caller: `read_annotated_value()` + annotations_header_length: 0, + annotations_sequence_length: 0, + annotations_encoding: AnnotationsEncoding::SymbolAddress, + header_offset, + length_length, + value_body_length: value_length, + total_length, + }; + let lazy_value = LazyRawBinaryValue_1_1 { + encoded_value, + // If this value has a field ID or annotations, this will be replaced by the caller. + input: self, + delimited_offsets: None, + }; + Ok(lazy_value) + } + } + pub fn read_fixed_int(self, length: usize) -> ParseResult<'a, FixedInt> { let int_bytes = self .peek_n_bytes(length) diff --git a/src/lazy/binary/raw/v1_1/reader.rs b/src/lazy/binary/raw/v1_1/reader.rs index d372158e1..7c94cd4e7 100644 --- a/src/lazy/binary/raw/v1_1/reader.rs +++ b/src/lazy/binary/raw/v1_1/reader.rs @@ -89,14 +89,14 @@ impl<'data> LazyRawBinaryReader_1_1<'data> { return Ok(self.end_of_stream(buffer.offset())); } - let type_descriptor = buffer.peek_opcode()?; - if type_descriptor.is_nop() { - (_, buffer) = buffer.consume_nop_padding(type_descriptor)?; + let opcode = buffer.peek_opcode()?; + if opcode.is_nop() { + (_, buffer) = buffer.consume_nop_padding(opcode)?; if buffer.is_empty() { return Ok(self.end_of_stream(buffer.offset())); } } - if type_descriptor.is_ivm_start() { + if opcode.is_ivm_start() { return self.read_ivm(buffer); } self.read_value_expr(buffer) @@ -143,9 +143,12 @@ mod tests { use crate::lazy::binary::raw::v1_1::reader::LazyRawBinaryReader_1_1; use crate::lazy::expanded::EncodingContext; + use crate::lazy::decoder::LazyRawSequence; use crate::raw_symbol_ref::RawSymbolRef; use crate::{IonResult, IonType}; + use bumpalo::Bump as BumpAllocator; + #[test] fn nop() -> IonResult<()> { let data: Vec = vec![ @@ -712,6 +715,58 @@ mod tests { Ok(()) } + #[test] + fn nested_sequence() -> IonResult<()> { + let ion_data: &[u8] = &[0xF1, 0x61, 0x01, 0xF1, 0x61, 0x02, 0xF0, 0x61, 0x03, 0xF0]; + let allocator = BumpAllocator::new(); + let empty_context = EncodingContext::empty(); + let context = empty_context.get_ref(); + + let mut reader = LazyRawBinaryReader_1_1::new(ion_data); + let container = reader + .next(context)? + .expect_value()? + .read()? + .expect_list()?; + + let mut top_iter = container.iter(); + let actual_value = top_iter + .next() + .unwrap()? + .expect_value()? + .read()? + .expect_int()?; + assert_eq!(actual_value, 1.into()); + + let actual_value = top_iter + .next() + .unwrap()? + .expect_value()? + .read()? + .expect_list()?; + + let mut inner_iter = actual_value.iter(); + let actual_value = inner_iter + .next() + .unwrap()? + .expect_value()? + .read()? + .expect_int()?; + assert_eq!(actual_value, 2.into()); + + let actual_value = top_iter + .next() + .unwrap()? + .expect_value()? + .read()? + .expect_int()?; + assert_eq!(actual_value, 3.into()); + + assert!(top_iter.next().is_none()); + + Ok(()) + } + #[test] fn lists() -> IonResult<()> { use crate::lazy::decoder::LazyRawSequence; @@ -761,8 +816,22 @@ mod tests { // [] (&[0xFB, 0x03, 0xEC], &[]), + + // [] (delimited) + (&[0xF1, 0xF0], &[]), + + // [1] (delimited) + (&[0xF1, 0x61, 0x01, 0xF0], &[IonType::Int]), + + // [ 1 [2] 3 ] (delimited) + (&[0xF1, 0x61, 0x01, 0xF1, 0xEA, 0xF0, 0x61, 0x03, 0xF0], &[IonType::Int, IonType::List, IonType::Int]), + + // [] + (&[0xF1, 0xEC, 0xF0], &[]), ]; + let allocator = BumpAllocator::new(); + for (ion_data, expected_types) in tests { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); @@ -818,8 +887,22 @@ mod tests { // ( $257 ) (&[0xFC, 0x07, 0xE2, 0x01, 0x00], &[IonType::Symbol]), + + // () (delimited) + (&[0xF2, 0xF0], &[]), + + // ( 1 ) (delimited) + (&[0xF2, 0x61, 0x01, 0xF0], &[IonType::Int]), + + // ( 1 ( 2 ) 3 ) (delimited) + (&[0xF2, 0x61, 0x01, 0xF2, 0x61, 0x02, 0xF0, 0x61, 0x03, 0xF0], &[IonType::Int, IonType::SExp, IonType::Int]), + + // () (delimited) + (&[0xF2, 0xEC, 0xF0], &[]), ]; + let allocator = BumpAllocator::new(); + for (ion_data, expected_types) in tests { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); @@ -859,6 +942,7 @@ mod tests { ([0xEB, 0x0B], IonType::Struct), // null.struct ]; + let allocator = BumpAllocator::new(); for (data, expected_type) in data { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); @@ -873,6 +957,55 @@ mod tests { Ok(()) } + #[test] + fn nested_struct() -> IonResult<()> { + use crate::lazy::decoder::LazyRawFieldName; + let ion_data: &[u8] = &[ + 0xF3, // { + 0xFB, 0x66, 0x6F, 0x6F, 0x61, 0x01, // "foo": 1 + 0x17, 0xF3, // 11: { + 0xFB, 0x62, 0x61, 0x72, 0x61, 0x02, // "bar": 2 + 0xF0, // } + 0xFB, 0x62, 0x61, 0x7a, 0x61, 0x03, // "baz": 3 + 0xF0, // } + ]; + + let encoding_context = EncodingContext::empty(); + let context = encoding_context.get_ref(); + + let mut reader = LazyRawBinaryReader_1_1::new(ion_data); + let container = reader + .next(context)? + .expect_value()? + .read()? + .expect_struct()?; + + let mut top_iter = container.iter(); + + let (name, value) = top_iter.next().unwrap()?.expect_name_value()?; + assert_eq!(name.read()?, RawSymbolRef::Text("foo")); + assert_eq!(value.read()?.expect_int()?, 1.into()); + + let (name, value) = top_iter.next().unwrap()?.expect_name_value()?; + assert_eq!(name.read()?, RawSymbolRef::SymbolId(11)); + let mut inner_iter = value.read()?.expect_struct()?.iter(); + + let (name, value) = inner_iter.next().unwrap()?.expect_name_value()?; + assert_eq!(name.read()?, RawSymbolRef::Text("bar")); + assert_eq!(value.read()?.expect_int()?, 2.into()); + + assert!(inner_iter.next().is_none()); + + let (name, value) = top_iter.next().unwrap()?.expect_name_value()?; + assert_eq!(name.read()?, RawSymbolRef::Text("baz")); + assert_eq!(value.read()?.expect_int()?, 3.into()); + + assert!(top_iter.next().is_none()); + + Ok(()) + } + + #[test] fn structs() -> IonResult<()> { use crate::lazy::decoder::{LazyRawFieldExpr, LazyRawFieldName}; @@ -917,7 +1050,7 @@ mod tests { ), ( // { $10: { $11: "foo" }, $11: 2 } - &[ 0xD6, 0x15, 0xC4, 0x93, 0x66, 0x6F, 0x6F, 0x17, 0x61, 0x02 ], + &[ 0xD6, 0x15, 0xD4, 0x93, 0x66, 0x6F, 0x6F, 0x17, 0x61, 0x02 ], &[ (10usize.into(), IonType::Struct), (11usize.into(), IonType::Int), @@ -973,7 +1106,19 @@ mod tests { // {} &[0xFD, 0x01], &[], - ) + ), + ( + // {} - delimited + &[ + 0xF3, 0xF0, + ], + &[], + ), + ( + // { "foo": 1, $11: 2 } - delimited + &[ 0xF3, 0xFB, 0x66, 0x6F, 0x6F, 0x61, 0x01, 0x17, 0xE1, 0x02, 0xF0], + &[ ("foo".into(), IonType::Int), (11usize.into(), IonType::Symbol)], + ), ]; for (ion_data, field_pairs) in tests { diff --git a/src/lazy/binary/raw/v1_1/sequence.rs b/src/lazy/binary/raw/v1_1/sequence.rs index fc58f9b97..0fd25033d 100644 --- a/src/lazy/binary/raw/v1_1/sequence.rs +++ b/src/lazy/binary/raw/v1_1/sequence.rs @@ -4,7 +4,7 @@ use crate::lazy::binary::raw::v1_1::annotations_iterator::RawBinaryAnnotationsIt use crate::lazy::binary::raw::v1_1::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::v1_1::value::LazyRawBinaryValue_1_1; use crate::lazy::decoder::private::LazyContainerPrivate; -use crate::lazy::decoder::{Decoder, LazyRawContainer, LazyRawSequence, LazyRawValueExpr}; +use crate::lazy::decoder::{Decoder, LazyRawContainer, LazyRawSequence, LazyRawValueExpr, RawValueExpr}; use crate::lazy::encoding::BinaryEncoding_1_1; use crate::{HasRange, IonResult, IonType}; use std::fmt::{Debug, Formatter}; @@ -85,6 +85,10 @@ pub struct LazyRawBinarySequence_1_1<'top> { } impl<'top> LazyRawBinarySequence_1_1<'top> { + pub fn new(value: LazyRawBinaryValue_1_1<'top>) -> Self { + Self { value } + } + pub fn ion_type(&self) -> IonType { self.value.ion_type() } @@ -92,8 +96,12 @@ impl<'top> LazyRawBinarySequence_1_1<'top> { pub fn iter(&self) -> RawBinarySequenceIterator_1_1<'top> { // Get as much of the sequence's body as is available in the input buffer. // Reading a child value may fail as `Incomplete` - let buffer_slice = self.value.available_body(); - RawBinarySequenceIterator_1_1::new(buffer_slice) + let buffer_slice = if self.value.is_delimited() { + self.value.input + } else { + self.value.available_body() + }; + RawBinarySequenceIterator_1_1::new(buffer_slice, self.value.delimited_offsets) } } @@ -133,13 +141,18 @@ impl<'a> Debug for LazyRawBinarySequence_1_1<'a> { pub struct RawBinarySequenceIterator_1_1<'top> { source: ImmutableBuffer<'top>, bytes_to_skip: usize, + delimited_offsets: Option<&'top [usize]>, } impl<'top> RawBinarySequenceIterator_1_1<'top> { - pub(crate) fn new(input: ImmutableBuffer<'top>) -> RawBinarySequenceIterator_1_1<'top> { + pub(crate) fn new( + input: ImmutableBuffer<'top>, + delimited_offsets: Option<&'top [usize]>, + ) -> RawBinarySequenceIterator_1_1<'top> { RawBinarySequenceIterator_1_1 { source: input, bytes_to_skip: 0, + delimited_offsets, } } } @@ -148,13 +161,40 @@ impl<'top> Iterator for RawBinarySequenceIterator_1_1<'top> { type Item = IonResult>; fn next(&mut self) -> Option { - self.source = self.source.consume(self.bytes_to_skip); - let item = match self.source.peek_sequence_value_expr() { - Ok(Some(expr)) => expr, - Ok(None) => return None, - Err(e) => return Some(Err(e)), - }; - self.bytes_to_skip = item.range().len(); - Some(Ok(item)) + use crate::lazy::binary::raw::v1_1::type_code::OpcodeType; + use crate::lazy::binary::raw::v1_1::type_descriptor::Opcode; + + if let Some(offsets) = self.delimited_offsets { + if offsets.len() <= 1 { + None + } else { + let offset = offsets.first().unwrap(); // Safety: Already tested that there's > 1 item. + let input = self.source.consume(*offset - self.source.offset()); + match input.peek_opcode() { + Ok(Opcode { + opcode_type: OpcodeType::DelimitedContainerClose, + .. + }) => None, + Ok(_) => match input.peek_sequence_value_expr() { + Ok(Some(output)) => { + self.delimited_offsets.replace(&offsets[1..]); + Some(Ok(output)) + } + Ok(None) => None, + Err(e) => Some(Err(e)), + }, + Err(e) => Some(Err(e)), + } + } + } else { + self.source = self.source.consume(self.bytes_to_skip); + let item = match self.source.peek_sequence_value_expr() { + Ok(Some(expr)) => expr, + Ok(None) => return None, + Err(e) => return Some(Err(e)), + }; + self.bytes_to_skip = item.range().len(); + Some(Ok(item)) + } } } diff --git a/src/lazy/binary/raw/v1_1/struct.rs b/src/lazy/binary/raw/v1_1/struct.rs index 207d3eb75..8241c3832 100644 --- a/src/lazy/binary/raw/v1_1/struct.rs +++ b/src/lazy/binary/raw/v1_1/struct.rs @@ -85,13 +85,22 @@ impl<'top> LazyRawBinaryStruct_1_1<'top> { } pub fn iter(&self) -> RawBinaryStructIterator_1_1<'top> { - // Get as much of the struct's body as is available in the input buffer. - // Reading a child value may fail as `Incomplete` - let buffer_slice = self.value.available_body(); - RawBinaryStructIterator_1_1::new( - self.value.encoded_value.header.ion_type_code, - buffer_slice, - ) + if self.value.is_delimited() { + RawBinaryStructIterator_1_1::new( + self.value.encoded_value.header.ion_type_code, + self.value.input.consume(1), + self.value.delimited_offsets, + ) + } else { + // Get as much of the struct's body as is available in the input buffer. + // Reading a child value may fail as `Incomplete` + let buffer_slice = self.value.available_body(); + RawBinaryStructIterator_1_1::new( + self.value.encoded_value.header.ion_type_code, + buffer_slice, + self.value.delimited_offsets, + ) + } } } @@ -128,21 +137,24 @@ pub struct RawBinaryStructIterator_1_1<'top> { source: ImmutableBuffer<'top>, bytes_to_skip: usize, struct_type: StructType, + delimited_offsets: Option<&'top [usize]>, } impl<'top> RawBinaryStructIterator_1_1<'top> { pub(crate) fn new( opcode_type: OpcodeType, input: ImmutableBuffer<'top>, + delimited_offsets: Option<&'top [usize]>, ) -> RawBinaryStructIterator_1_1<'top> { RawBinaryStructIterator_1_1 { source: input, bytes_to_skip: 0, struct_type: match opcode_type { - // TODO: Delimited struct handling OpcodeType::Struct => StructType::SymbolAddress, + OpcodeType::StructDelimited => StructType::FlexSym, _ => unreachable!("Unexpected opcode for structure"), }, + delimited_offsets, } } @@ -215,8 +227,11 @@ impl<'top> RawBinaryStructIterator_1_1<'top> { /// Helper function called from [`Self::next`] to parse the current field and value from the /// struct. On success, returns both the field pair via [`LazyRawFieldExpr`] as well as the /// total bytes needed to skip the field. - fn peek_field(&self) -> IonResult, usize)>> { - let mut buffer = self.source; + fn peek_field( + &self, + input: ImmutableBuffer<'top>, + ) -> IonResult, usize)>> { + let mut buffer = input; loop { // Peek at our field name. let peek_result = match self.struct_type { @@ -246,7 +261,7 @@ impl<'top> RawBinaryStructIterator_1_1<'top> { (Some(value), after) => (value, after), }; - let bytes_to_skip = after_value.offset() - self.source.offset(); + let bytes_to_skip = after_value.offset() - input.offset(); return Ok(Some(( LazyRawFieldExpr::NameValue(field_name, value), bytes_to_skip, @@ -259,13 +274,40 @@ impl<'top> Iterator for RawBinaryStructIterator_1_1<'top> { type Item = IonResult>; fn next(&mut self) -> Option { - self.source = self.source.consume(self.bytes_to_skip); - let (field_expr, bytes_to_skip) = match self.peek_field() { - Ok(Some((value, bytes_to_skip))) => (Some(Ok(value)), bytes_to_skip), - Ok(None) => (None, 0), - Err(e) => (Some(Err(e)), 0), - }; - self.bytes_to_skip = bytes_to_skip; - field_expr + use crate::lazy::binary::raw::v1_1::type_descriptor::Opcode; + + if let Some(offsets) = self.delimited_offsets { + if offsets.len() <= 1 { + None + } else { + let offset = offsets.first().unwrap(); + let input = self.source.consume(*offset - self.source.offset()); + let field_expr = match input.peek_opcode() { + Ok(Opcode { + opcode_type: OpcodeType::DelimitedContainerClose, + .. + }) => None, + Ok(_) => match self.peek_field(input) { + Ok(Some((value, _))) => { + self.delimited_offsets.replace(&offsets[1..]); + Some(Ok(value)) + } + Ok(None) => None, + Err(e) => Some(Err(e)), + }, + Err(e) => Some(Err(e)), + }; + field_expr + } + } else { + self.source = self.source.consume(self.bytes_to_skip); + let (field_expr, bytes_to_skip) = match self.peek_field(self.source) { + Ok(Some((value, bytes_to_skip))) => (Some(Ok(value)), bytes_to_skip), + Ok(None) => (None, 0), + Err(e) => (Some(Err(e)), 0), + }; + self.bytes_to_skip = bytes_to_skip; + field_expr + } } } diff --git a/src/lazy/binary/raw/v1_1/type_code.rs b/src/lazy/binary/raw/v1_1/type_code.rs index ff91f24f4..3ecaa5717 100644 --- a/src/lazy/binary/raw/v1_1/type_code.rs +++ b/src/lazy/binary/raw/v1_1/type_code.rs @@ -36,14 +36,14 @@ pub enum OpcodeType { TypedNull, // 0xEB - Nop, // 0xEC-0xED - // Reserved - SystemMacroInvoke, // 0xEF - - // 0xF0 delimited container end - // 0xF1 delimited list start - // 0xF2 delimited s-expression start - // 0xF3 delimited struct start - LargeInteger, // 0xF6 - Integer preceded by FlexUInt length - Blob, // 0xFE - - Clob, // 0xFF - + SystemMacroInvoke, // 0xEF - + DelimitedContainerClose, // 0xF0 + ListDelimited, // 0xF1 + SExpressionDelimited, // 0xF2 + StructDelimited, // 0xF3 + LargeInteger, // 0xF6 - Integer preceded by FlexUInt length + Blob, // 0xFE - + Clob, // 0xFF - // 0xF8 Long decimal TimestampLong, // 0xF8 - Long-form Timestamp // 0xF9 - Long string @@ -54,6 +54,15 @@ pub enum OpcodeType { Invalid, // Represents an encoded value that does not match a defined opcode. } +impl OpcodeType { + pub fn is_delimited(self) -> bool { + matches!( + self, + Self::ListDelimited | Self::SExpressionDelimited | Self::StructDelimited + ) + } +} + impl TryFrom for IonType { type Error = IonError; diff --git a/src/lazy/binary/raw/v1_1/type_descriptor.rs b/src/lazy/binary/raw/v1_1/type_descriptor.rs index f68c129e5..865b3298a 100644 --- a/src/lazy/binary/raw/v1_1/type_descriptor.rs +++ b/src/lazy/binary/raw/v1_1/type_descriptor.rs @@ -76,6 +76,10 @@ impl Opcode { (0xE, 0xA) => (NullNull, low_nibble, Some(IonType::Null)), (0xE, 0xB) => (TypedNull, low_nibble, Some(IonType::Null)), (0xE, 0xC..=0xD) => (Nop, low_nibble, None), + (0xF, 0x0) => (DelimitedContainerClose, low_nibble, None), + (0xF, 0x1) => (ListDelimited, low_nibble, Some(IonType::List)), + (0xF, 0x2) => (SExpressionDelimited, low_nibble, Some(IonType::SExp)), + (0xF, 0x3) => (StructDelimited, low_nibble, Some(IonType::Struct)), (0xF, 0x6) => (LargeInteger, low_nibble, Some(IonType::Int)), (0xF, 0x7) => (Decimal, 0xFF, Some(IonType::Decimal)), (0xF, 0x8) => (TimestampLong, low_nibble, Some(IonType::Timestamp)), @@ -125,6 +129,10 @@ impl Opcode { self.low_nibble } + pub fn is_delimited(&self) -> bool { + self.opcode_type.is_delimited() + } + #[inline] pub fn to_header(self) -> Option
{ let ion_type = self.ion_type?; @@ -140,6 +148,7 @@ impl Opcode { pub enum LengthType { InOpcode(u8), FlexUIntFollows, + Unknown, } /// Represents a `TypeDescriptor` that appears before an Ion value (and not a NOP, IVM, @@ -177,6 +186,13 @@ impl Header { } (OpcodeType::TypedNull, _) => InOpcode(1), (OpcodeType::Struct, n) if n < 16 => InOpcode(n), + (OpcodeType::DelimitedContainerClose, 0) => InOpcode(0), + ( + OpcodeType::ListDelimited + | OpcodeType::SExpressionDelimited + | OpcodeType::StructDelimited, + _, + ) => Unknown, _ => FlexUIntFollows, } } diff --git a/src/lazy/binary/raw/v1_1/value.rs b/src/lazy/binary/raw/v1_1/value.rs index 66418e6c0..4a5d8c950 100644 --- a/src/lazy/binary/raw/v1_1/value.rs +++ b/src/lazy/binary/raw/v1_1/value.rs @@ -84,6 +84,7 @@ impl<'top> RawVersionMarker<'top> for LazyRawBinaryVersionMarker_1_1<'top> { pub struct LazyRawBinaryValue_1_1<'top> { pub(crate) encoded_value: EncodedValue
, pub(crate) input: ImmutableBuffer<'top>, + pub(crate) delimited_offsets: Option<&'top [usize]>, } impl<'top> HasSpan<'top> for LazyRawBinaryValue_1_1<'top> { @@ -206,6 +207,10 @@ impl<'top> LazyRawBinaryValue_1_1<'top> { } } + pub fn is_delimited(&self) -> bool { + self.encoded_value.header.ion_type_code.is_delimited() + } + /// Returns the encoded byte slice representing this value's data. pub(crate) fn value_body(&self) -> IonResult<&'top [u8]> { let value_total_length = self.encoded_value.total_length();