From 09e620ba66e9d1e3d668506bca72e26a2a219e2d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 1 Feb 2023 17:53:39 +0000 Subject: [PATCH] Lazy array display (#3638) --- arrow-array/src/array/primitive_array.rs | 2 +- arrow-cast/src/display.rs | 639 ++++++++++++----------- 2 files changed, 324 insertions(+), 317 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index dfe076306178..1998cefc9d05 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1067,7 +1067,7 @@ impl From for PrimitiveArray { } } -impl PrimitiveArray { +impl PrimitiveArray { /// Returns a Decimal array with the same data as self, with the /// specified precision and scale. /// diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 16fbfb0bbce5..c6c48bcb8196 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -19,50 +19,301 @@ //! purposes. See the `pretty` crate for additional functions for //! record batch pretty printing. -use std::fmt::Write; +use std::fmt::{Debug, Display, Formatter}; use std::sync::Arc; +use arrow_array::temporal_conversions::*; use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_schema::*; -use chrono::prelude::SecondsFormat; +use chrono::{DateTime, NaiveDate, TimeZone, Utc}; -macro_rules! make_string { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); +/// Options for formatting arrays +#[derive(Debug, Clone, Default)] +pub struct FormatOptions { + safe: bool, +} - Ok(array.value($row).to_string()) - }}; +impl FormatOptions { + /// If set to `true` any formatting errors will be written to the output + /// instead of being converted into a [`std::fmt::Error`] + pub fn with_display_error(mut self, safe: bool) -> Self { + self.safe = safe; + self + } } -macro_rules! make_string_interval_year_month { - ($column: ident, $row: ident) => {{ - let array = $column - .as_any() - .downcast_ref::() - .unwrap(); +/// Implements [`Display`] for a specific array value +pub struct ValueFormatter<'a> { + idx: usize, + formatter: &'a ArrayFormatter<'a>, +} + +impl<'a> Display for ValueFormatter<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.formatter.format.fmt(self.idx, f) { + Ok(()) => Ok(()), + Err(FormatError::Arrow(e)) if self.formatter.safe => { + write!(f, "ERROR: {}", e) + } + Err(_) => Err(std::fmt::Error), + } + } +} + +/// A string formatter for an [`Array`] +pub struct ArrayFormatter<'a> { + format: Box, + safe: bool, +} + +impl<'a> ArrayFormatter<'a> { + /// Returns an [`ArrayFormatter`] that can be used to format `array` + /// + /// This returns an error if an array of the given data type cannot be formatted + pub fn try_new( + array: &'a dyn Array, + options: &FormatOptions, + ) -> Result { + let format = downcast_primitive_array! { + array => Box::new(ArrayFormat::try_new(array)?) as _, + _ => todo!() + }; + + Ok(Self { + format, + safe: options.safe, + }) + } + + /// Returns a [`ValueFormatter`] that implements [`Display`] for + /// the value of the array at `idx` + pub fn value(&self, idx: usize) -> ValueFormatter<'_> { + ValueFormatter { + formatter: self, + idx, + } + } +} + +/// Either an [`ArrowError`] or [`std::fmt::Error`] +enum FormatError { + Format(std::fmt::Error), + Arrow(ArrowError), +} + +type FormatResult = Result<(), FormatError>; + +impl From for FormatError { + fn from(value: std::fmt::Error) -> Self { + Self::Format(value) + } +} + +impl From for FormatError { + fn from(value: ArrowError) -> Self { + Self::Arrow(value) + } +} + +/// [`Display`] but accepting an index +trait DisplayIndex { + fn fmt(&self, idx: usize, f: &mut Formatter<'_>) -> FormatResult; +} + +/// [`DisplayIndex`] with additional state +trait DisplayIndexState { + type State; + + fn prepare(&self) -> Result; + + fn fmt(&self, state: &Self::State, idx: usize, f: &mut Formatter<'_>) + -> FormatResult; +} + +impl DisplayIndexState for T { + type State = (); + + fn prepare(&self) -> Result { + Ok(()) + } + + fn fmt(&self, _: &Self::State, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + DisplayIndex::fmt(self, idx, f) + } +} + +struct ArrayFormat { + state: F::State, + array: F, +} + +impl ArrayFormat { + fn try_new(array: F) -> Result { + let state = array.prepare()?; + Ok(Self { state, array }) + } +} + +impl DisplayIndex for ArrayFormat { + fn fmt(&self, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + if self.array.is_null(idx) { + return Ok(()); + } + DisplayIndexState::fmt(&self.array, &self.state, idx, f) + } +} + +macro_rules! primitive_display { + ($($t:ty),+) => { + $(impl<'a> DisplayIndex for &'a PrimitiveArray<$t> + { + fn fmt(&self, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + write!(f, "{}", self.value(idx))?; + Ok(()) + } + })+ + }; +} + +primitive_display!(Int8Type, Int16Type, Int32Type, Int64Type); +primitive_display!(UInt8Type, UInt16Type, UInt32Type, UInt64Type); +primitive_display!(Float16Type, Float32Type, Float64Type); + +macro_rules! decimal_display { + ($($t:ty),+) => { + $(impl<'a> DisplayIndexState for &'a PrimitiveArray<$t> { + type State = (u8, i8); + + fn prepare(&self) -> Result { + Ok((self.precision(), self.scale())) + } + + fn fmt(&self, s: &Self::State, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + write!(f, "{}", <$t>::format_decimal(self.values()[idx], s.0, s.1))?; + Ok(()) + } + })+ + }; +} + +decimal_display!(Decimal128Type, Decimal256Type); + +macro_rules! timestamp_display { + ($($t:ty),+) => { + $(impl<'a> DisplayIndexState for &'a PrimitiveArray<$t> { + type State = Option; + + fn prepare(&self) -> Result { + match self.data_type() { + DataType::Timestamp(_, tz) => tz.as_ref().map(|x| x.parse()).transpose(), + _ => unreachable!(), + } + } + + fn fmt(&self, tz: &Self::State, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + let value = self.value(idx); + let naive = as_datetime::<$t>(value).ok_or_else(|| { + ArrowError::CastError(format!( + "Failed to convert {} to datetime for {}", + value, + self.data_type() + )) + })?; + + match tz { + Some(tz) => { + let date = Utc.from_utc_datetime(&naive).with_timezone(tz); + write!(f, "{}", format_rfc3339(date))?; + } + None => write!(f, "{:?}", naive)?, + } + Ok(()) + } + })+ + }; +} + +timestamp_display!( + TimestampSecondType, + TimestampMillisecondType, + TimestampMicrosecondType, + TimestampNanosecondType +); + +macro_rules! temporal_display { + ($convert:ident, $t:ty) => { + impl<'a> DisplayIndex for &'a PrimitiveArray<$t> { + fn fmt(&self, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + let value = self.value(idx); + let naive = $convert(value as _).ok_or_else(|| { + ArrowError::CastError(format!( + "Failed to convert {} to temporal for {}", + value, + self.data_type() + )) + })?; + + write!(f, "{}", naive)?; + Ok(()) + } + } + }; +} + +#[inline] +fn date32_to_date(value: i32) -> Option { + Some(date32_to_datetime(value)?.date()) +} + +#[inline] +fn date64_to_date(value: i64) -> Option { + Some(date64_to_datetime(value)?.date()) +} - let interval = array.value($row) as f64; +temporal_display!(date32_to_date, Date32Type); +temporal_display!(date64_to_date, Date64Type); +temporal_display!(time32s_to_time, Time32SecondType); +temporal_display!(time32ms_to_time, Time32MillisecondType); +temporal_display!(time64us_to_time, Time64MicrosecondType); +temporal_display!(time64ns_to_time, Time64NanosecondType); + +macro_rules! duration_display { + ($convert:ident, $t:ty) => { + impl<'a> DisplayIndex for &'a PrimitiveArray<$t> { + fn fmt(&self, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + write!(f, "{}", $convert(self.value(idx)))?; + Ok(()) + } + } + }; +} + +duration_display!(duration_s_to_duration, DurationSecondType); +duration_display!(duration_ms_to_duration, DurationMillisecondType); +duration_display!(duration_us_to_duration, DurationMicrosecondType); +duration_display!(duration_ns_to_duration, DurationNanosecondType); + +impl<'a> DisplayIndex for &'a PrimitiveArray { + fn fmt(&self, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + let interval = self.value(idx) as f64; let years = (interval / 12_f64).floor(); let month = interval - (years * 12_f64); - Ok(format!( + write!( + f, "{} years {} mons 0 days 0 hours 0 mins 0.00 secs", years, month, - )) - }}; + )?; + Ok(()) + } } -macro_rules! make_string_interval_day_time { - ($column: ident, $row: ident) => {{ - let array = $column - .as_any() - .downcast_ref::() - .unwrap(); - - let value: u64 = array.value($row) as u64; +impl<'a> DisplayIndex for &'a PrimitiveArray { + fn fmt(&self, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + let value: u64 = self.value(idx) as u64; let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; @@ -82,7 +333,8 @@ macro_rules! make_string_interval_day_time { "" }; - Ok(format!( + write!( + f, "0 years 0 mons {} days {} hours {} mins {}{}.{:03} secs", days_parts, hours, @@ -90,98 +342,66 @@ macro_rules! make_string_interval_day_time { secs_sign, secs.abs(), milliseconds.abs(), - )) - }}; + )?; + Ok(()) + } } -macro_rules! make_string_interval_month_day_nano { - ($column: ident, $row: ident) => {{ - let array = $column - .as_any() - .downcast_ref::() - .unwrap(); - - let value: u128 = array.value($row) as u128; +impl<'a> DisplayIndex for &'a PrimitiveArray { + fn fmt(&self, idx: usize, f: &mut Formatter<'_>) -> FormatResult { + let value: u64 = self.value(idx) as u64; - let months_part: i32 = - ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; - let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; - let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; + let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; + let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; - let secs = nanoseconds_part / 1_000_000_000; + let secs = milliseconds_part / 1_000; let mins = secs / 60; let hours = mins / 60; let secs = secs - (mins * 60); let mins = mins - (hours * 60); - let nanoseconds = nanoseconds_part % 1_000_000_000; + let milliseconds = milliseconds_part % 1_000; - let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" }; + let secs_sign = if secs < 0 || milliseconds < 0 { + "-" + } else { + "" + }; - Ok(format!( - "0 years {} mons {} days {} hours {} mins {}{}.{:09} secs", - months_part, - days_part, + write!( + f, + "0 years 0 mons {} days {} hours {} mins {}{}.{:03} secs", + days_parts, hours, mins, secs_sign, secs.abs(), - nanoseconds.abs(), - )) - }}; -} - -macro_rules! make_string_date { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_date($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) - }}; -} - -macro_rules! make_string_time { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_time($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) - }}; -} - -macro_rules! make_string_datetime { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_datetime($row) - .map(|d| format!("{:?}", d)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) - }}; + milliseconds.abs(), + )?; + Ok(()) + } } -macro_rules! make_string_datetime_with_tz { - ($array_type:ty, $tz_string: ident, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - let s = match $tz_string.parse::() { - Ok(tz) => array - .value_as_datetime_with_tz($row, tz) - .map(|d| format!("{}", d.to_rfc3339_opts(SecondsFormat::AutoSi, true))) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), - Err(_) => array - .value_as_datetime($row) - .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, $tz_string)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), - }; - - Ok(s) - }}; +fn format_rfc3339(date: DateTime) -> impl Display { + use chrono::format::*; + + const ITEMS: &[Item<'static>] = &[ + Item::Numeric(Numeric::Year, Pad::Zero), + Item::Literal("-"), + Item::Numeric(Numeric::Month, Pad::Zero), + Item::Literal("-"), + Item::Numeric(Numeric::Day, Pad::Zero), + Item::Literal("T"), + Item::Numeric(Numeric::Hour, Pad::Zero), + Item::Literal(":"), + Item::Numeric(Numeric::Minute, Pad::Zero), + Item::Literal(":"), + Item::Numeric(Numeric::Second, Pad::Zero), + Item::Fixed(Fixed::Nanosecond), + Item::Fixed(Fixed::TimezoneOffsetColonZ), + ]; + date.format_with_items(ITEMS.iter()) } // It's not possible to do array.value($row).to_string() for &[u8], let's format it as hex @@ -247,27 +467,6 @@ macro_rules! make_string_from_fixed_size_list { }}; } -macro_rules! make_string_from_duration { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_duration($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) - }}; -} - -#[inline(always)] -pub fn make_string_from_decimal( - column: &Arc, - row: usize, -) -> Result { - let array = column.as_any().downcast_ref::().unwrap(); - - Ok(array.value_as_string(row)) -} - fn append_struct_field_string( target: &mut String, name: &str, @@ -327,201 +526,9 @@ pub fn array_value_to_string( column: &ArrayRef, row: usize, ) -> Result { - if column.is_null(row) { - return Ok("".to_string()); - } - match column.data_type() { - DataType::Utf8 => make_string!(array::StringArray, column, row), - DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row), - DataType::Binary => make_string_hex!(array::BinaryArray, column, row), - DataType::LargeBinary => make_string_hex!(array::LargeBinaryArray, column, row), - DataType::FixedSizeBinary(_) => { - make_string_hex!(array::FixedSizeBinaryArray, column, row) - } - DataType::Boolean => make_string!(array::BooleanArray, column, row), - DataType::Int8 => make_string!(array::Int8Array, column, row), - DataType::Int16 => make_string!(array::Int16Array, column, row), - DataType::Int32 => make_string!(array::Int32Array, column, row), - DataType::Int64 => make_string!(array::Int64Array, column, row), - DataType::UInt8 => make_string!(array::UInt8Array, column, row), - DataType::UInt16 => make_string!(array::UInt16Array, column, row), - DataType::UInt32 => make_string!(array::UInt32Array, column, row), - DataType::UInt64 => make_string!(array::UInt64Array, column, row), - DataType::Float16 => make_string!(array::Float16Array, column, row), - DataType::Float32 => make_string!(array::Float32Array, column, row), - DataType::Float64 => make_string!(array::Float64Array, column, row), - DataType::Decimal128(..) => make_string_from_decimal(column, row), - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampSecondArray, - tz_string, - column, - row - ), - None => make_string_datetime!(array::TimestampSecondArray, column, row), - } - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampMillisecondArray, - tz_string, - column, - row - ), - None => { - make_string_datetime!(array::TimestampMillisecondArray, column, row) - } - } - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampMicrosecondArray, - tz_string, - column, - row - ), - None => { - make_string_datetime!(array::TimestampMicrosecondArray, column, row) - } - } - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampNanosecondArray, - tz_string, - column, - row - ), - None => { - make_string_datetime!(array::TimestampNanosecondArray, column, row) - } - } - } - DataType::Date32 => make_string_date!(array::Date32Array, column, row), - DataType::Date64 => make_string_date!(array::Date64Array, column, row), - DataType::Time32(unit) if *unit == TimeUnit::Second => { - make_string_time!(array::Time32SecondArray, column, row) - } - DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { - make_string_time!(array::Time32MillisecondArray, column, row) - } - DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { - make_string_time!(array::Time64MicrosecondArray, column, row) - } - DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { - make_string_time!(array::Time64NanosecondArray, column, row) - } - DataType::Interval(unit) => match unit { - IntervalUnit::DayTime => { - make_string_interval_day_time!(column, row) - } - IntervalUnit::YearMonth => { - make_string_interval_year_month!(column, row) - } - IntervalUnit::MonthDayNano => { - make_string_interval_month_day_nano!(column, row) - } - }, - DataType::List(_) => make_string_from_list!(column, row), - DataType::LargeList(_) => make_string_from_large_list!(column, row), - DataType::Dictionary(index_type, _value_type) => match **index_type { - DataType::Int8 => dict_array_value_to_string::(column, row), - DataType::Int16 => dict_array_value_to_string::(column, row), - DataType::Int32 => dict_array_value_to_string::(column, row), - DataType::Int64 => dict_array_value_to_string::(column, row), - DataType::UInt8 => dict_array_value_to_string::(column, row), - DataType::UInt16 => dict_array_value_to_string::(column, row), - DataType::UInt32 => dict_array_value_to_string::(column, row), - DataType::UInt64 => dict_array_value_to_string::(column, row), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Pretty printing not supported for {:?} due to index type", - column.data_type() - ))), - }, - DataType::FixedSizeList(_, _) => make_string_from_fixed_size_list!(column, row), - DataType::Struct(_) => { - let st = column - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert struct column to struct array." - .to_string(), - ) - })?; - - let mut s = String::new(); - s.push('{'); - let mut kv_iter = st.columns().iter().zip(st.column_names()); - if let Some((col, name)) = kv_iter.next() { - append_struct_field_string(&mut s, name, col, row)?; - } - for (col, name) in kv_iter { - s.push_str(", "); - append_struct_field_string(&mut s, name, col, row)?; - } - s.push('}'); - - Ok(s) - } - DataType::Map(_, _) => { - let map_array = - column.as_any().downcast_ref::().ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert column to map array.".to_string(), - ) - })?; - let map_entry = map_array.value(row); - let st = map_entry - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert map entry to struct array." - .to_string(), - ) - })?; - let mut s = String::new(); - s.push('{'); - let entries_count = st.column(0).len(); - for i in 0..entries_count { - if i > 0 { - s.push_str(", "); - } - append_map_field_string(&mut s, st.column(0), i)?; - s.push_str(": "); - append_map_field_string(&mut s, st.column(1), i)?; - } - s.push('}'); - - Ok(s) - } - DataType::Union(field_vec, type_ids, mode) => { - union_to_string(column, row, field_vec, type_ids, mode) - } - DataType::Duration(unit) => match *unit { - TimeUnit::Second => { - make_string_from_duration!(array::DurationSecondArray, column, row) - } - TimeUnit::Millisecond => { - make_string_from_duration!(array::DurationMillisecondArray, column, row) - } - TimeUnit::Microsecond => { - make_string_from_duration!(array::DurationMicrosecondArray, column, row) - } - TimeUnit::Nanosecond => { - make_string_from_duration!(array::DurationNanosecondArray, column, row) - } - }, - _ => Err(ArrowError::InvalidArgumentError(format!( - "Pretty printing not implemented for {:?} type", - column.data_type() - ))), - } + let options = FormatOptions::default().with_display_error(true); + let formatter = ArrayFormatter::try_new(column.as_ref(), &options)?; + Ok(formatter.value(row).to_string()) } /// Converts the value of the union array at `row` to a String