diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 711543a18677..a90f086bd732 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -34,13 +34,19 @@ path = "src/lib.rs" bench = false [dependencies] -serde = { version = "1.0", default-features = false, features = ["derive", "std", "rc"], optional = true } +serde = { version = "1.0", default-features = false, features = [ + "derive", + "std", + "rc", +], optional = true } bitflags = { version = "2.0.0", default-features = false, optional = true } -serde_json = "1.0" +serde_json = { version = "1.0", optional = true } [features] +canonical-extension-types = ["dep:serde", "dep:serde_json"] # Enable ffi support ffi = ["bitflags"] +serde = ["dep:serde"] [package.metadata.docs.rs] features = ["ffi"] diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index b7a326f605f3..ae0fd5f33ea6 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -764,298 +764,828 @@ impl DataType { } } -/// The metadata key for the string name identifying the custom data type. -pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name"; - -/// The metadata key for a serialized representation of the ExtensionType -/// necessary to reconstruct the custom type. -pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata"; - -/// Extension types. -/// -/// -pub trait ExtensionType: Sized { - /// The name of this extension type. - const NAME: &'static str; - - /// The supported storage types of this extension type. - fn storage_types(&self) -> &[DataType]; - - /// The metadata type of this extension type. - type Metadata; - - /// Returns a reference to the metadata of this extension type, or `None` - /// if this extension type has no metadata. - fn metadata(&self) -> Option<&Self::Metadata>; - - /// Returns the serialized representation of the metadata of this extension - /// type, or `None` if this extension type has no metadata. 
- fn serialized_metadata(&self) -> Option; - - /// Deserialize this extension type from the serialized representation of the - /// metadata of this extension. An extension type that has no metadata should - /// expect `None` for for the serialized metadata. - fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option; -} - -pub(crate) trait ExtensionTypeExt: ExtensionType { - /// Returns `true` if the given data type is supported by this extension - /// type. - fn supports(&self, data_type: &DataType) -> bool { - self.storage_types().contains(data_type) - } - - /// Try to extract this extension type from the given [`Field`]. - /// - /// This function returns `None` if extension type - /// - information is missing - /// - name does not match - /// - metadata deserialization failed - /// - does not support the data type of this field - fn try_from_field(field: &Field) -> Option { - field - .metadata() - .get(EXTENSION_TYPE_NAME_KEY) - .and_then(|name| { - (name == ::NAME) - .then(|| { - Self::from_serialized_metadata( - field - .metadata() - .get(EXTENSION_TYPE_METADATA_KEY) - .map(String::as_str), - ) - }) - .flatten() - }) - .filter(|extension_type| extension_type.supports(field.data_type())) - } -} - -impl ExtensionTypeExt for T where T: ExtensionType {} - -/// Canonical extension types. -/// -/// The Arrow columnar format allows defining extension types so as to extend -/// standard Arrow data types with custom semantics. Often these semantics will -/// be specific to a system or application. However, it is beneficial to share -/// the definitions of well-known extension types so as to improve -/// interoperability between different systems integrating Arrow columnar data. -pub mod canonical_extension_types { - use serde_json::Value; - - use super::{DataType, ExtensionType}; - - /// Canonical extension types. - #[non_exhaustive] - #[derive(Debug, Clone, PartialEq)] - pub enum CanonicalExtensionTypes { - /// The extension type for 'JSON'. 
- Json(Json), - /// The extension type for `UUID`. - Uuid(Uuid), - } - - impl From for CanonicalExtensionTypes { - fn from(value: Json) -> Self { - CanonicalExtensionTypes::Json(value) - } - } - - impl From for CanonicalExtensionTypes { - fn from(value: Uuid) -> Self { - CanonicalExtensionTypes::Uuid(value) - } - } - - /// The extension type for `JSON`. - /// - /// Extension name: `arrow.json`. - /// - /// The storage type of this extension is `String` or `LargeString` or - /// `StringView`. Only UTF-8 encoded JSON as specified in [rfc8259](https://datatracker.ietf.org/doc/html/rfc8259) - /// is supported. - /// - /// This type does not have any parameters. - /// - /// Metadata is either an empty string or a JSON string with an empty - /// object. In the future, additional fields may be added, but they are not - /// required to interpret the array. - /// - /// - #[derive(Debug, Clone, PartialEq)] - pub struct Json(Value); - - impl Default for Json { - fn default() -> Self { - Self(Value::String("".to_owned())) - } - } - - impl ExtensionType for Json { - const NAME: &'static str = "arrow.json"; - - type Metadata = Value; - - fn storage_types(&self) -> &[DataType] { - &[DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View] - } - - fn metadata(&self) -> Option<&Self::Metadata> { - Some(&self.0) - } - - fn serialized_metadata(&self) -> Option { - Some(self.0.to_string()) - } - - fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option { - serialized_metadata.and_then(|metadata| match metadata { - // Empty string - r#""""# => Some(Default::default()), - // Empty object - value => value - .parse::() - .ok() - .filter(|value| matches!(value.as_object(), Some(map) if map.is_empty())) - .map(Self), - }) - } - } - - /// The extension type for `UUID`. - /// - /// Extension name: `arrow.uuid`. - /// - /// The storage type of the extension is `FixedSizeBinary` with a length of - /// 16 bytes. 
- /// - /// Note: - /// A specific UUID version is not required or guaranteed. This extension - /// represents UUIDs as `FixedSizeBinary(16)` with big-endian notation and - /// does not interpret the bytes in any way. - /// - /// - #[derive(Debug, Default, Clone, Copy, PartialEq)] - pub struct Uuid; - - impl ExtensionType for Uuid { - const NAME: &'static str = "arrow.uuid"; - - type Metadata = (); - - fn storage_types(&self) -> &[DataType] { - &[DataType::FixedSizeBinary(16)] - } - - fn metadata(&self) -> Option<&Self::Metadata> { - None - } - - fn serialized_metadata(&self) -> Option { - None - } - - fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option { - serialized_metadata.is_none().then_some(Self) - } - } - - #[cfg(test)] - mod tests { - use std::collections::HashMap; - - use serde_json::Map; - - use crate::{ArrowError, Field, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY}; - - use super::*; - - #[test] - fn json() -> Result<(), ArrowError> { - let mut field = Field::new("", DataType::Utf8, false); - field.try_with_extension_type(Json::default())?; - assert_eq!( - field.metadata().get(EXTENSION_TYPE_METADATA_KEY), - Some(&r#""""#.to_owned()) - ); - assert!(field.extension_type::().is_some()); - - let mut field = Field::new("", DataType::LargeUtf8, false); - field.try_with_extension_type(Json(serde_json::Value::Object(Map::default())))?; - assert_eq!( - field.metadata().get(EXTENSION_TYPE_METADATA_KEY), - Some(&"{}".to_owned()) - ); - assert!(field.extension_type::().is_some()); - - let mut field = Field::new("", DataType::Utf8View, false); - field.try_with_extension_type(Json::default())?; - assert!(field.extension_type::().is_some()); - assert_eq!( - field.canonical_extension_type(), - Some(CanonicalExtensionTypes::Json(Json::default())) - ); - Ok(()) - } - - #[test] - #[should_panic(expected = "expected Utf8 or LargeUtf8 or Utf8View, found Boolean")] - fn json_bad_type() { - Field::new("", DataType::Boolean, 
false).with_extension_type(Json::default()); - } - - #[test] - fn json_bad_metadata() { - let field = Field::new("", DataType::Utf8, false).with_metadata(HashMap::from_iter([ - (EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned()), - (EXTENSION_TYPE_METADATA_KEY.to_owned(), "1234".to_owned()), - ])); - // This returns `None` now because this metadata is invalid. - assert!(field.extension_type::().is_none()); - } - - #[test] - fn json_missing_metadata() { - let field = Field::new("", DataType::LargeUtf8, false).with_metadata( - HashMap::from_iter([(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned())]), - ); - // This returns `None` now because the metadata is missing. - assert!(field.extension_type::().is_none()); - } - - #[test] - fn uuid() -> Result<(), ArrowError> { - let mut field = Field::new("", DataType::FixedSizeBinary(16), false); - field.try_with_extension_type(Uuid)?; - assert!(field.extension_type::().is_some()); - assert_eq!( - field.canonical_extension_type(), - Some(CanonicalExtensionTypes::Uuid(Uuid)) - ); - Ok(()) - } - - #[test] - #[should_panic(expected = "expected FixedSizeBinary(16), found FixedSizeBinary(8)")] - fn uuid_bad_type() { - Field::new("", DataType::FixedSizeBinary(8), false).with_extension_type(Uuid); - } - - #[test] - fn uuid_with_metadata() { - // Add metadata that's not expected for uuid. - let field = Field::new("", DataType::FixedSizeBinary(16), false) - .with_metadata(HashMap::from_iter([( - EXTENSION_TYPE_METADATA_KEY.to_owned(), - "".to_owned(), - )])) - .with_extension_type(Uuid); - // This returns `None` now because `Uuid` expects no metadata. - assert!(field.extension_type::().is_none()); - } - } -} +// /// The metadata key for the string name identifying an [`ExtensionType`]. +// pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name"; + +// /// The metadata key for a serialized representation of the [`ExtensionType`] +// /// necessary to reconstruct the custom type. 
+// pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata"; + +// /// Extension types. +// /// +// /// User-defined “extension” types can be defined setting certain key value +// /// pairs in the [`Field`] metadata structure. These extension keys are: +// /// - [`EXTENSION_TYPE_NAME_KEY`] +// /// - [`EXTENSION_TYPE_METADATA_KEY`] +// /// +// /// Canonical extension types support in this crate requires the +// /// `canonical-extension-types` feature. +// /// +// /// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`] +// /// field. +// /// +// /// # Example +// /// +// /// The example below demonstrates how to implement this trait for a `Uuid` +// /// type. Note this is not the canonical extension type for `Uuid`, which does +// /// not include information about the `Uuid` version. +// /// +// /// ``` +// /// # fn main() -> Result<(), ArrowError> { +// /// use crate::ExtensionType; +// /// use std::fmt; +// /// +// /// /// The different Uuid versions. +// /// #[derive(Clone, Copy, Debug, PartialEq)] +// /// enum UuidVersion { +// /// V1, +// /// V2, +// /// V3, +// /// V4, +// /// V5, +// /// V6, +// /// V7, +// /// V8, +// /// } +// /// +// /// // We'll use `Display` to serialize. +// /// impl fmt::Display for UuidVersion { +// /// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +// /// write!( +// /// f, +// /// "{}", +// /// match self { +// /// Self::V1 => "V1", +// /// Self::V2 => "V2", +// /// Self::V3 => "V3", +// /// Self::V4 => "V4", +// /// Self::V5 => "V5", +// /// Self::V6 => "V6", +// /// Self::V7 => "V7", +// /// Self::V8 => "V8", +// /// } +// /// ) +// /// } +// /// } +// /// +// /// // And `FromStr` to deserialize. 
+// /// impl FromStr for UuidVersion { +// /// type Err = ArrowError; +// /// +// /// fn from_str(s: &str) -> Result { +// /// match s { +// /// "V1" => Ok(Self::V1), +// /// "V2" => Ok(Self::V2), +// /// "V3" => Ok(Self::V3), +// /// "V4" => Ok(Self::V4), +// /// "V5" => Ok(Self::V5), +// /// "V6" => Ok(Self::V6), +// /// "V7" => Ok(Self::V7), +// /// "V8" => Ok(Self::V8), +// /// _ => Err(ArrowError::ParseError("Invalid UuidVersion".to_owned())), +// /// } +// /// } +// /// } +// /// +// /// /// This is the extension type, not the container for Uuid values. It +// /// /// stores the Uuid version (this is the metadata of this extension type). +// /// #[derive(Clone, Copy, Debug, PartialEq)] +// /// struct Uuid(UuidVersion); +// /// +// /// impl ExtensionType for Uuid { +// /// // We use a namespace as suggested by the specification. +// /// const NAME: &str = "myorg.example.uuid"; +// /// +// /// // The metadata type is the Uuid version. +// /// type Metadata = UuidVersion; +// /// +// /// // We just return a reference to the Uuid version. +// /// fn metadata(&self) -> &Self::Metadata { +// /// &self.0 +// /// } +// /// +// /// // We use the `Display` implementation to serialize the Uuid +// /// // version. +// /// fn serialize_metadata(&self) -> Option { +// /// Some(self.0.to_string()) +// /// } +// /// +// /// // We use the `FromStr` implementation to deserialize the Uuid +// /// // version. +// /// fn deserialize_metadata(metadata: Option<&str>) -> Result { +// /// metadata.map_or_else( +// /// || { +// /// Err(ArrowError::InvalidArgumentError( +// /// "Uuid extension type metadata missing".to_owned(), +// /// )) +// /// }, +// /// str::parse, +// /// ) +// /// } +// /// +// /// // The only supported data type is `FixedSizeBinary(16)`. 
+// /// fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { +// /// match data_type { +// /// DataType::FixedSizeBinary(16) => Ok(()), +// /// data_type => Err(ArrowError::InvalidArgumentError(format!( +// /// "Uuid data type mismatch, expected FixedSizeBinary(16), found {data_type}" +// /// ))), +// /// } +// /// } +// /// +// /// // We should always check if the data type is supported before +// /// // constructing the extension type. +// /// fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result { +// /// let uuid = Self(metadata); +// /// uuid.supports_data_type(data_type)?; +// /// Ok(uuid) +// /// } +// /// } +// /// +// /// // We can now construct the extension type. +// /// let uuid_v1 = Uuid(UuidVersion::V1); +// /// +// /// // And add it to a field. +// /// let mut field = +// /// Field::new("", DataType::FixedSizeBinary(16), false).with_extension_type(uuid_v1); +// /// +// /// // And extract it from this field. +// /// assert_eq!(field.extension_type::()?, uuid_v1); +// /// +// /// // When we try to add this to a field with an unsupported data type we +// /// // get an error. +// /// let result = Field::new("", DataType::Boolean, false).try_with_extension_type(uuid_v1); +// /// assert!(result.is_err()); +// /// # Ok(()) } +// /// ``` +// /// +// /// +// pub trait ExtensionType: Sized { +// /// The name identifying this extension type. +// /// +// /// This is the string value that is used for the +// /// [`EXTENSION_TYPE_NAME_KEY`] in the [`Field::metadata`] of a [`Field`] +// /// to identify this extension type. +// /// +// /// We recommend that you use a “namespace”-style prefix for extension +// /// type names to minimize the possibility of conflicts with multiple Arrow +// /// readers and writers in the same application. For example, use +// /// `myorg.name_of_type` instead of simply `name_of_type`. 
+// /// +// /// Extension names beginning with `arrow.` are reserved for canonical +// /// extension types, they should not be used for third-party extension +// /// types. +// const NAME: &str; + +// /// The metadata type of this extension type. +// /// +// /// If an extension type defines no metadata it should use `()` to indicate +// /// this. +// type Metadata; + +// /// Returns a reference to the metadata of this extension type, or `&()` if +// /// this extension type defines no metadata (`Self::Metadata=()`). +// fn metadata(&self) -> &Self::Metadata; + +// /// Returns the serialized representation of the metadata of this extension +// /// type, or `None` if this extension type defines no metadata +// /// (`Self::Metadata=()`). +// /// +// /// This is the string value that is used for the +// /// [`EXTENSION_TYPE_METADATA_KEY`] in the [`Field::metadata`] of a +// /// [`Field`]. +// fn serialize_metadata(&self) -> Option; + +// /// Deserialize the metadata of this extension type from the serialized +// /// representation of the metadata. An extension type that defines no +// /// metadata should expect `None` for the serialized metadata and return +// /// `Ok(())`. +// /// +// /// This function should return an error when +// /// - expected metadata is missing (for extension types with non-optional +// /// metadata) +// /// - unexpected metadata is set (for extension types without metadata) +// /// - deserialization of metadata fails +// fn deserialize_metadata(metadata: Option<&str>) -> Result; + +// /// Returns `Ok(())` iff the given data type is supported by this extension +// /// type. +// fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError>; + +// /// Construct this extension type for a field with the given data type and +// /// metadata. +// /// +// /// This should return an error if the given data type is not supported by +// /// this extension type. 
+// fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result; +// } + +// /// Canonical extension types. +// /// +// /// The Arrow columnar format allows defining extension types so as to extend +// /// standard Arrow data types with custom semantics. Often these semantics will +// /// be specific to a system or application. However, it is beneficial to share +// /// the definitions of well-known extension types so as to improve +// /// interoperability between different systems integrating Arrow columnar data. +// /// +// /// +// #[cfg(feature = "canonical-extension-types")] +// pub mod canonical_extension_types { +// use std::sync::Arc; + +// use serde::{Deserialize, Serialize}; +// use serde_json::Value; + +// use crate::{ArrowError, Field, Fields}; + +// use super::{DataType, ExtensionType}; + +// /// Canonical extension types. +// /// +// /// +// #[non_exhaustive] +// #[derive(Debug, Clone, PartialEq)] +// pub enum CanonicalExtensionType { +// /// The extension type for `FixedShapeTensor`. +// /// +// /// +// FixedShapeTensor(FixedShapeTensor), + +// /// The extension type for `VariableShapeTensor`. +// /// +// /// +// VariableShapeTensor(VariableShapeTensor), + +// /// The extension type for 'JSON'. +// /// +// /// +// Json(Json), + +// /// The extension type for `UUID`. +// /// +// /// +// Uuid(Uuid), +// } + +// impl From for CanonicalExtensionType { +// fn from(value: FixedShapeTensor) -> Self { +// CanonicalExtensionType::FixedShapeTensor(value) +// } +// } + +// impl From for CanonicalExtensionType { +// fn from(value: VariableShapeTensor) -> Self { +// CanonicalExtensionType::VariableShapeTensor(value) +// } +// } + +// impl From for CanonicalExtensionType { +// fn from(value: Json) -> Self { +// CanonicalExtensionType::Json(value) +// } +// } + +// impl From for CanonicalExtensionType { +// fn from(value: Uuid) -> Self { +// CanonicalExtensionType::Uuid(value) +// } +// } + +// /// The extension type for fixed shape tensor. 
+// /// +// /// Extension name: `arrow.fixed_shape_tensor`. +// /// +// /// The storage type of the extension: `FixedSizeList` where: +// /// - `value_type` is the data type of individual tensor elements. +// /// - `list_size` is the product of all the elements in tensor shape. +// /// +// /// Extension type parameters: +// /// - `value_type`: the Arrow data type of individual tensor elements. +// /// - `shape`: the physical shape of the contained tensors as an array. +// /// +// /// Optional parameters describing the logical layout: +// /// - `dim_names`: explicit names to tensor dimensions as an array. The +// /// length of it should be equal to the shape length and equal to the +// /// number of dimensions. +// /// `dim_names` can be used if the dimensions have +// /// well-known names and they map to the physical layout (row-major). +// /// - `permutation`: indices of the desired ordering of the original +// /// dimensions, defined as an array. +// /// The indices contain a permutation of the values `[0, 1, .., N-1]` +// /// where `N` is the number of dimensions. The permutation indicates +// /// which dimension of the logical layout corresponds to which dimension +// /// of the physical tensor (the i-th dimension of the logical view +// /// corresponds to the dimension with number `permutations[i]` of the +// /// physical tensor). +// /// Permutation can be useful in case the logical order of the tensor is +// /// a permutation of the physical order (row-major). +// /// When logical and physical layout are equal, the permutation will +// /// always be `([0, 1, .., N-1])` and can therefore be left out. +// /// +// /// Description of the serialization: +// /// The metadata must be a valid JSON object including shape of the +// /// contained tensors as an array with key `shape` plus optional +// /// dimension names with keys `dim_names` and ordering of the +// /// dimensions with key `permutation`. 
+// /// Example: `{ "shape": [2, 5]}` +// /// Example with `dim_names` metadata for NCHW ordered data: +// /// `{ "shape": [100, 200, 500], "dim_names": ["C", "H", "W"]}` +// /// Example of permuted 3-dimensional tensor: +// /// `{ "shape": [100, 200, 500], "permutation": [2, 0, 1]}` +// /// +// /// This is the physical layout shape and the shape of the logical layout +// /// would in this case be `[500, 100, 200]`. +// /// +// /// +// #[derive(Debug, Clone, PartialEq)] +// pub struct FixedShapeTensor { +// /// The data type of individual tensor elements. +// value_type: DataType, + +// /// The metadata of this extension type. +// metadata: FixedShapeTensorMetadata, +// } + +// impl FixedShapeTensor { +// /// Returns the value type of the individual tensor elements. +// pub fn value_type(&self) -> &DataType { +// &self.value_type +// } + +// /// Returns the product of all the elements in tensor shape. +// pub fn list_size(&self) -> usize { +// self.metadata.list_size() +// } + +// /// Returns the number of dimensions in this fixed shape tensor. +// pub fn dimensions(&self) -> usize { +// self.metadata.dimensions() +// } + +// /// Returns the names of the dimensions in this fixed shape tensor, if +// /// set. +// pub fn dimension_names(&self) -> Option<&[String]> { +// self.metadata.dimension_names() +// } + +// /// Returns the indices of the desired ordering of the original +// /// dimensions, if set. +// pub fn permutations(&self) -> Option<&[usize]> { +// self.metadata.permutations() +// } +// } + +// /// Extension type metadata for [`FixedShapeTensor`]. +// #[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +// pub struct FixedShapeTensorMetadata { +// /// The physical shape of the contained tensors. +// shape: Vec, + +// /// Explicit names to tensor dimensions. +// dim_names: Option>, + +// /// Indices of the desired ordering of the original dimensions. 
+// permutations: Option>, +// } + +// impl FixedShapeTensorMetadata { +// /// Returns the product of all the elements in tensor shape. +// pub fn list_size(&self) -> usize { +// self.shape.iter().product() +// } + +// /// Returns the number of dimensions in this fixed shape tensor. +// pub fn dimensions(&self) -> usize { +// self.shape.len() +// } + +// /// Returns the names of the dimensions in this fixed shape tensor, if +// /// set. +// pub fn dimension_names(&self) -> Option<&[String]> { +// self.dim_names.as_ref().map(AsRef::as_ref) +// } + +// /// Returns the indices of the desired ordering of the original +// /// dimensions, if set. +// pub fn permutations(&self) -> Option<&[usize]> { +// self.permutations.as_ref().map(AsRef::as_ref) +// } +// } + +// impl ExtensionType for FixedShapeTensor { +// const NAME: &str = "arrow.fixed_shape_tensor"; + +// type Metadata = FixedShapeTensorMetadata; + +// fn metadata(&self) -> &Self::Metadata { +// &self.metadata +// } + +// fn serialize_metadata(&self) -> Option { +// Some(serde_json::to_string(&self.metadata).expect("metadata serialization")) +// } + +// fn deserialize_metadata(metadata: Option<&str>) -> Result { +// metadata.map_or_else( +// || { +// Err(ArrowError::InvalidArgumentError( +// "FixedShapeTensor extension types requires metadata".to_owned(), +// )) +// }, +// |value| { +// serde_json::from_str(value).map_err(|e| { +// ArrowError::InvalidArgumentError(format!( +// "FixedShapeTensor deserialization failed: {e}" +// )) +// }) +// }, +// ) +// } + +// fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { +// let expected = DataType::FixedSizeList( +// Arc::new(Field::new("", self.value_type.clone(), false)), +// i32::try_from(self.list_size()).expect("overflow"), +// ); + +// data_type +// .equals_datatype(&expected) +// .then_some(()) +// .ok_or_else(|| { +// ArrowError::InvalidArgumentError(format!( +// "FixedShapeTensor data type mismatch, expected {expected}, found 
{data_type}" +// )) +// }) +// } + +// fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result { +// match data_type { +// DataType::FixedSizeList(field, list_size) => { +// if field.is_nullable() { +// return Err(ArrowError::InvalidArgumentError( +// "FixedShapeTensor data type mismatch, expected non-nullable field, found nullable field".to_owned(), +// )); +// } +// // Make sure the shape matches +// let expected_size = i32::try_from(metadata.list_size()).expect("overflow"); +// if *list_size != expected_size { +// return Err(ArrowError::InvalidArgumentError(format!( +// "FixedShapeTensor list size mismatch, expected {expected_size} (metadata), found {list_size} (data type)" +// ))); +// } +// // Make sure the dim names size is correct, if set. +// if let Some(dim_names_size) = metadata.dimension_names().map(<[_]>::len) { +// let expected_size = metadata.dimensions(); +// if dim_names_size != expected_size { +// return Err(ArrowError::InvalidArgumentError(format!( +// "FixedShapeTensor dimension names size mismatch, expected {expected_size}, found {dim_names_size}" +// ))); +// } +// } +// // Make sure the permutations are correct, if set. +// if let Some(permutations) = metadata.permutations() { +// let expected_size = metadata.dimensions(); +// if permutations.len() != expected_size { +// return Err(ArrowError::InvalidArgumentError(format!( +// "FixedShapeTensor permutations size mismatch, expected {expected_size}, found {}", +// permutations.len() +// ))); +// } +// // Check if the permutations are valid. 
+// let mut permutations = permutations.to_vec(); +// permutations.sort_unstable(); +// let dimensions = metadata.dimensions(); +// if (0..dimensions).zip(permutations).any(|(a, b)| a != b) { +// return Err(ArrowError::InvalidArgumentError(format!( +// "FixedShapeTensor permutations invalid, expected a permutation of [0, 1, .., N-1], where N is the number of dimensions: {dimensions}" +// ))); +// } +// } + +// Ok(Self { +// value_type: field.data_type().clone(), +// metadata, +// }) +// } +// data_type => Err(ArrowError::InvalidArgumentError(format!( +// "FixedShapeTensor data type mismatch, expected FixedSizeList, found {data_type}" +// ))), +// } +// } +// } + +// /// The extension type for `VariableShapeTensor`. +// /// +// /// +// /// +// /// +// #[derive(Debug, Clone, PartialEq)] +// pub struct VariableShapeTensor { +// /// The data type of individual tensor elements. +// value_type: DataType, + +// /// The metadata of this extension type. +// metadata: VariableShapeTensorMetadata, +// } + +// impl VariableShapeTensor {} + +// /// Extension type metadata for [`VariableShapeTensor`]. +// #[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +// pub struct VariableShapeTensorMetadata { +// /// Explicit names to tensor dimensions. +// dim_names: Option>, + +// /// Indices of the desired ordering of the original dimensions. 
+// permutations: Option>, +// } + +// impl ExtensionType for VariableShapeTensor { +// const NAME: &str = "arrow.variable_shape_tensor"; + +// type Metadata = VariableShapeTensorMetadata; + +// fn metadata(&self) -> &Self::Metadata { +// &self.metadata +// } + +// fn serialize_metadata(&self) -> Option { +// Some(serde_json::to_string(&self.metadata).expect("metadata serialization")) +// } + +// fn deserialize_metadata(metadata: Option<&str>) -> Result { +// metadata.map_or_else( +// || { +// Err(ArrowError::InvalidArgumentError( +// "VariableShapeTensor extension types requires metadata".to_owned(), +// )) +// }, +// |value| { +// serde_json::from_str(value).map_err(|e| { +// ArrowError::InvalidArgumentError(format!( +// "VariableShapeTensor deserialization failed: {e}" +// )) +// }) +// }, +// ) +// } + +// fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { +// let expected = DataType::Struct( +// [ +// Field::new( +// "data", +// DataType::List(Arc::new(Field::new_list_field( +// self.value_type.clone(), +// false, +// ))), +// false, +// ), +// Field::new( +// "shape", +// DataType::new_fixed_size_list(DataType::Int32, self.dimensions(), false), +// false, +// ), +// ] +// .into_iter() +// .map(Arc::new) +// .collect(), +// ); + +// data_type +// .equals_datatype(&expected) +// .then_some(()) +// .ok_or_else(|| { +// ArrowError::InvalidArgumentError(format!( +// "VariableShapeTensor data type mismatch, expected {expected}, found {data_type}" +// )) +// }) +// } + +// fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result { +// todo!() +// } +// } + +// /// The extension type for `JSON`. +// /// +// /// Extension name: `arrow.json`. +// /// +// /// The storage type of this extension is `String` or `LargeString` or +// /// `StringView`. Only UTF-8 encoded JSON as specified in [rfc8259](https://datatracker.ietf.org/doc/html/rfc8259) +// /// is supported. +// /// +// /// This type does not have any parameters. 
+// /// +// /// Metadata is either an empty string or a JSON string with an empty +// /// object. In the future, additional fields may be added, but they are not +// /// required to interpret the array. +// /// +// /// +// #[derive(Debug, Clone, PartialEq)] +// pub struct Json(Value); + +// impl Default for Json { +// fn default() -> Self { +// Self(Value::String(Default::default())) +// } +// } + +// impl ExtensionType for Json { +// const NAME: &str = "arrow.json"; + +// type Metadata = Value; + +// fn metadata(&self) -> &Self::Metadata { +// &self.0 +// } + +// fn serialize_metadata(&self) -> Option { +// Some(self.0.to_string()) +// } + +// fn deserialize_metadata(metadata: Option<&str>) -> Result { +// const ERR: &str = "Json extension type metadata is either an empty string or a JSON string with an empty object"; +// metadata.map_or_else( +// || Err(ArrowError::InvalidArgumentError(ERR.to_owned())), +// |metadata| match metadata { +// r#""""# => Ok(Value::String(Default::default())), +// value => value +// .parse::() +// .ok() +// .filter(|value| matches!(value.as_object(), Some(map) if map.is_empty())) +// .ok_or_else(|| ArrowError::InvalidArgumentError(ERR.to_owned())), +// }, +// ) +// } + +// fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { +// match data_type { +// DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => Ok(()), +// data_type => Err(ArrowError::InvalidArgumentError(format!( +// "Json data type mismatch, expected one of Utf8, LargeUtf8, Utf8View, found {data_type}" +// ))), +// } +// } + +// fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result { +// let json = Self(metadata); +// json.supports_data_type(data_type)?; +// Ok(json) +// } +// } + +// /// The extension type for `UUID`. +// /// +// /// Extension name: `arrow.uuid`. +// /// +// /// The storage type of the extension is `FixedSizeBinary` with a length of +// /// 16 bytes. 
+// /// +// /// Note: +// /// A specific UUID version is not required or guaranteed. This extension +// /// represents UUIDs as `FixedSizeBinary(16)` with big-endian notation and +// /// does not interpret the bytes in any way. +// /// +// /// +// #[derive(Debug, Default, Clone, Copy, PartialEq)] +// pub struct Uuid; + +// impl ExtensionType for Uuid { +// const NAME: &str = "arrow.uuid"; + +// type Metadata = (); + +// fn metadata(&self) -> &Self::Metadata { +// &() +// } + +// fn serialize_metadata(&self) -> Option { +// None +// } + +// fn deserialize_metadata(metadata: Option<&str>) -> Result { +// metadata.map_or_else( +// || Ok(()), +// |_| { +// Err(ArrowError::InvalidArgumentError( +// "Uuid extension type expects no metadata".to_owned(), +// )) +// }, +// ) +// } + +// fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { +// match data_type { +// DataType::FixedSizeBinary(16) => Ok(()), +// data_type => Err(ArrowError::InvalidArgumentError(format!( +// "Uuid data type mismatch,expected FixedSizeBinary(16), found {data_type}" +// ))), +// } +// } + +// fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result { +// let uuid = Self; +// Self.supports_data_type(data_type)?; +// Ok(uuid) +// } +// } + +// #[cfg(test)] +// mod tests { +// use std::collections::HashMap; + +// use serde_json::Map; + +// use crate::{ArrowError, Field, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY}; + +// use super::*; + +// #[test] +// fn json() -> Result<(), ArrowError> { +// let mut field = Field::new("", DataType::Utf8, false); +// field.try_with_extension_type(Json::default())?; +// assert_eq!( +// field.metadata().get(EXTENSION_TYPE_METADATA_KEY), +// Some(&r#""""#.to_owned()) +// ); +// assert!(field.try_extension_type::().is_ok()); + +// let mut field = Field::new("", DataType::LargeUtf8, false); +// field.try_with_extension_type(Json(serde_json::Value::Object(Map::default())))?; +// assert_eq!( +// 
field.metadata().get(EXTENSION_TYPE_METADATA_KEY), +// Some(&"{}".to_owned()) +// ); +// assert!(field.try_extension_type::().is_ok()); + +// let mut field = Field::new("", DataType::Utf8View, false); +// field.try_with_extension_type(Json::default())?; +// assert!(field.try_extension_type::().is_ok()); +// assert_eq!( +// field.try_canonical_extension_type().unwrap(), +// CanonicalExtensionType::Json(Json::default()) +// ); +// Ok(()) +// } + +// #[test] +// #[should_panic(expected = "expected one of Utf8, LargeUtf8, Utf8View, found Boolean")] +// fn json_bad_type() { +// Field::new("", DataType::Boolean, false).with_extension_type(Json::default()); +// } + +// #[test] +// fn json_bad_metadata() { +// let field = Field::new("", DataType::Utf8, false).with_metadata(HashMap::from_iter([ +// (EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned()), +// (EXTENSION_TYPE_METADATA_KEY.to_owned(), "1234".to_owned()), +// ])); +// // This returns `None` now because this metadata is invalid. +// assert!(field.try_extension_type::().is_err()); +// } + +// #[test] +// fn json_missing_metadata() { +// let field = Field::new("", DataType::LargeUtf8, false).with_metadata( +// HashMap::from_iter([(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned())]), +// ); +// // This returns `None` now because the metadata is missing. 
+// assert!(field.try_extension_type::().is_err()); +// } + +// #[test] +// fn uuid() -> Result<(), ArrowError> { +// let mut field = Field::new("", DataType::FixedSizeBinary(16), false); +// field.try_with_extension_type(Uuid)?; +// assert!(field.try_extension_type::().is_ok()); +// assert_eq!( +// field.try_canonical_extension_type().unwrap(), +// CanonicalExtensionType::Uuid(Uuid) +// ); +// Ok(()) +// } + +// #[test] +// #[should_panic(expected = "expected FixedSizeBinary(16), found FixedSizeBinary(8)")] +// fn uuid_bad_type() { +// Field::new("", DataType::FixedSizeBinary(8), false).with_extension_type(Uuid); +// } + +// #[test] +// fn uuid_with_metadata() { +// // Add metadata that's not expected for uuid. +// let field = Field::new("", DataType::FixedSizeBinary(16), false) +// .with_metadata(HashMap::from_iter([( +// EXTENSION_TYPE_METADATA_KEY.to_owned(), +// "".to_owned(), +// )])) +// .with_extension_type(Uuid); +// // This returns an error now because `Uuid` expects no metadata. +// assert!(field.try_extension_type::().is_err()); +// } +// } +// } /// The maximum precision for [DataType::Decimal128] values pub const DECIMAL128_MAX_PRECISION: u8 = 38; diff --git a/arrow-schema/src/extension/canonical/bool8.rs b/arrow-schema/src/extension/canonical/bool8.rs new file mode 100644 index 000000000000..0272752f35a8 --- /dev/null +++ b/arrow-schema/src/extension/canonical/bool8.rs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 8-bit Boolean +//! +//! + +use crate::{extension::ExtensionType, ArrowError, DataType}; + +/// The extension type for `8-bit Boolean`. +/// +/// Extension name: `arrow.bool8`. +/// +/// The storage type of the extension is `Int8` where: +/// - false is denoted by the value 0. +/// - true can be specified using any non-zero value. Preferably 1. +/// +/// +#[derive(Debug, Default, Clone, Copy, PartialEq)] +pub struct Bool8; + +impl ExtensionType for Bool8 { + const NAME: &str = "arrow.bool8"; + + type Metadata = &'static str; + + fn metadata(&self) -> &Self::Metadata { + &"" + } + + fn serialize_metadata(&self) -> Option { + Some(String::default()) + } + + fn deserialize_metadata(metadata: Option<&str>) -> Result { + const ERR: &str = "Bool8 extension type expects an empty string as metadata"; + metadata.map_or_else( + || Err(ArrowError::InvalidArgumentError(ERR.to_owned())), + |value| match value { + "" => Ok(""), + _ => Err(ArrowError::InvalidArgumentError(ERR.to_owned())), + }, + ) + } + + fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { + match data_type { + DataType::Int8 => Ok(()), + data_type => Err(ArrowError::InvalidArgumentError(format!( + "Bool8 data type mismatch, expected Int8, found {data_type}" + ))), + } + } + + fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result { + Self.supports_data_type(data_type).map(|_| Self) + } +} diff --git a/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs b/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs new file mode 100644 index 
000000000000..abbfe3f6978f --- /dev/null +++ b/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs @@ -0,0 +1,257 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! FixedShapeTensor +//! +//! + +use serde::{Deserialize, Serialize}; + +use crate::{extension::ExtensionType, ArrowError, DataType}; + +/// The extension type for fixed shape tensor. +/// +/// Extension name: `arrow.fixed_shape_tensor`. +/// +/// The storage type of the extension: `FixedSizeList` where: +/// - `value_type` is the data type of individual tensor elements. +/// - `list_size` is the product of all the elements in tensor shape. +/// +/// Extension type parameters: +/// - `value_type`: the Arrow data type of individual tensor elements. +/// - `shape`: the physical shape of the contained tensors as an array. +/// +/// Optional parameters describing the logical layout: +/// - `dim_names`: explicit names to tensor dimensions as an array. The +/// length of it should be equal to the shape length and equal to the +/// number of dimensions. +/// `dim_names` can be used if the dimensions have +/// well-known names and they map to the physical layout (row-major). 
+/// - `permutation`: indices of the desired ordering of the original +/// dimensions, defined as an array. +/// The indices contain a permutation of the values `[0, 1, .., N-1]` +/// where `N` is the number of dimensions. The permutation indicates +/// which dimension of the logical layout corresponds to which dimension +/// of the physical tensor (the i-th dimension of the logical view +/// corresponds to the dimension with number `permutations[i]` of the +/// physical tensor). +/// Permutation can be useful in case the logical order of the tensor is +/// a permutation of the physical order (row-major). +/// When logical and physical layout are equal, the permutation will +/// always be `([0, 1, .., N-1])` and can therefore be left out. +/// +/// Description of the serialization: +/// The metadata must be a valid JSON object including shape of the +/// contained tensors as an array with key `shape` plus optional +/// dimension names with keys `dim_names` and ordering of the +/// dimensions with key `permutation`. +/// Example: `{ "shape": [2, 5]}` +/// Example with `dim_names` metadata for NCHW ordered data: +/// `{ "shape": [100, 200, 500], "dim_names": ["C", "H", "W"]}` +/// Example of permuted 3-dimensional tensor: +/// `{ "shape": [100, 200, 500], "permutation": [2, 0, 1]}` +/// +/// This is the physical layout shape and the shape of the logical layout +/// would in this case be `[500, 100, 200]`. +/// +/// +#[derive(Debug, Clone, PartialEq)] +pub struct FixedShapeTensor { + /// The data type of individual tensor elements. + value_type: DataType, + + /// The metadata of this extension type. + metadata: FixedShapeTensorMetadata, +} + +impl FixedShapeTensor { + /// Returns a new fixed shape tensor extension type. + /// + /// # Error + /// + /// Return an error if the provided dimension names or permutations are + /// invalid. 
+ pub fn try_new( + _value_type: DataType, + _shape: impl IntoIterator, + _dimension_names: Option>>, + _permutations: Option>>, + ) -> Result { + todo!() + } + + /// Returns the value type of the individual tensor elements. + pub fn value_type(&self) -> &DataType { + &self.value_type + } + + /// Returns the product of all the elements in tensor shape. + pub fn list_size(&self) -> usize { + self.metadata.list_size() + } + + /// Returns the number of dimensions in this fixed shape tensor. + pub fn dimensions(&self) -> usize { + self.metadata.dimensions() + } + + /// Returns the names of the dimensions in this fixed shape tensor, if + /// set. + pub fn dimension_names(&self) -> Option<&[String]> { + self.metadata.dimension_names() + } + + /// Returns the indices of the desired ordering of the original + /// dimensions, if set. + pub fn permutations(&self) -> Option<&[usize]> { + self.metadata.permutations() + } +} + +/// Extension type metadata for [`FixedShapeTensor`]. +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +pub struct FixedShapeTensorMetadata { + /// The physical shape of the contained tensors. + shape: Vec, + + /// Explicit names to tensor dimensions. + dim_names: Option>, + + /// Indices of the desired ordering of the original dimensions. + permutations: Option>, +} + +impl FixedShapeTensorMetadata { + /// Returns the product of all the elements in tensor shape. + pub fn list_size(&self) -> usize { + self.shape.iter().product() + } + + /// Returns the number of dimensions in this fixed shape tensor. + pub fn dimensions(&self) -> usize { + self.shape.len() + } + + /// Returns the names of the dimensions in this fixed shape tensor, if + /// set. + pub fn dimension_names(&self) -> Option<&[String]> { + self.dim_names.as_ref().map(AsRef::as_ref) + } + + /// Returns the indices of the desired ordering of the original + /// dimensions, if set. 
+ pub fn permutations(&self) -> Option<&[usize]> { + self.permutations.as_ref().map(AsRef::as_ref) + } +} + +impl ExtensionType for FixedShapeTensor { + const NAME: &str = "arrow.fixed_shape_tensor"; + + type Metadata = FixedShapeTensorMetadata; + + fn metadata(&self) -> &Self::Metadata { + &self.metadata + } + + fn serialize_metadata(&self) -> Option { + Some(serde_json::to_string(&self.metadata).expect("metadata serialization")) + } + + fn deserialize_metadata(metadata: Option<&str>) -> Result { + metadata.map_or_else( + || { + Err(ArrowError::InvalidArgumentError( + "FixedShapeTensor extension types requires metadata".to_owned(), + )) + }, + |value| { + serde_json::from_str(value).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "FixedShapeTensor metadata deserialization failed: {e}" + )) + }) + }, + ) + } + + fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { + let expected = DataType::new_fixed_size_list( + self.value_type.clone(), + i32::try_from(self.list_size()).expect("overflow"), + false, + ); + data_type + .equals_datatype(&expected) + .then_some(()) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "FixedShapeTensor data type mismatch, expected {expected}, found {data_type}" + )) + }) + } + + fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result { + match data_type { + DataType::FixedSizeList(field, list_size) if !field.is_nullable() => { + // Make sure the shape matches + let expected_size = i32::try_from(metadata.list_size()).expect("overflow"); + if *list_size != expected_size { + return Err(ArrowError::InvalidArgumentError(format!( + "FixedShapeTensor list size mismatch, expected {expected_size} (metadata), found {list_size} (data type)" + ))); + } + // Make sure the dim names size is correct, if set. 
+ if let Some(dim_names_size) = metadata.dimension_names().map(<[_]>::len) { + let expected_size = metadata.dimensions(); + if dim_names_size != expected_size { + return Err(ArrowError::InvalidArgumentError(format!( + "FixedShapeTensor dimension names size mismatch, expected {expected_size}, found {dim_names_size}" + ))); + } + } + // Make sure the permutations are correct, if set. + if let Some(permutations) = metadata.permutations() { + let expected_size = metadata.dimensions(); + if permutations.len() != expected_size { + return Err(ArrowError::InvalidArgumentError(format!( + "FixedShapeTensor permutations size mismatch, expected {expected_size}, found {}", + permutations.len() + ))); + } + // Check if the permutations are valid. + let mut permutations = permutations.to_vec(); + permutations.sort_unstable(); + let dimensions = metadata.dimensions(); + if (0..dimensions).zip(permutations).any(|(a, b)| a != b) { + return Err(ArrowError::InvalidArgumentError(format!( + "FixedShapeTensor permutations invalid, expected a permutation of [0, 1, .., N-1], where N is the number of dimensions: {dimensions}" + ))); + } + } + + Ok(Self { + value_type: field.data_type().clone(), + metadata, + }) + } + data_type => Err(ArrowError::InvalidArgumentError(format!( + "FixedShapeTensor data type mismatch, expected FixedSizeList with non-nullable field, found {data_type}" + ))), + } + } +} diff --git a/arrow-schema/src/extension/canonical/json.rs b/arrow-schema/src/extension/canonical/json.rs new file mode 100644 index 000000000000..8e89d611c5ff --- /dev/null +++ b/arrow-schema/src/extension/canonical/json.rs @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! JSON +//! +//! + +use serde_json::Value; + +use crate::{extension::ExtensionType, ArrowError, DataType}; + +/// The extension type for `JSON`. +/// +/// Extension name: `arrow.json`. +/// +/// The storage type of this extension is `String` or `LargeString` or +/// `StringView`. Only UTF-8 encoded JSON as specified in [rfc8259](https://datatracker.ietf.org/doc/html/rfc8259) +/// is supported. +/// +/// This type does not have any parameters. +/// +/// Metadata is either an empty string or a JSON string with an empty +/// object. In the future, additional fields may be added, but they are not +/// required to interpret the array. +/// +/// +#[derive(Debug, Clone, Default, PartialEq)] +pub struct Json(JsonMetadata); + +/// Extension type metadata for [`Json`]. 
+#[derive(Debug, Clone, PartialEq)] +pub struct JsonMetadata(Value); + +impl Default for JsonMetadata { + fn default() -> Self { + Self(Value::String(Default::default())) + } +} + +impl ExtensionType for Json { + const NAME: &str = "arrow.json"; + + type Metadata = JsonMetadata; + + fn metadata(&self) -> &Self::Metadata { + &self.0 + } + + fn serialize_metadata(&self) -> Option { + Some(self.metadata().0.to_string()) + } + + fn deserialize_metadata(metadata: Option<&str>) -> Result { + const ERR: &str = "Json extension type metadata is either an empty string or a JSON string with an empty object"; + metadata + .map_or_else( + || Err(ArrowError::InvalidArgumentError(ERR.to_owned())), + |metadata| match metadata { + r#""""# => Ok(Value::String(Default::default())), + value => value + .parse::() + .ok() + .filter(|value| matches!(value.as_object(), Some(map) if map.is_empty())) + .ok_or_else(|| ArrowError::InvalidArgumentError(ERR.to_owned())), + }, + ) + .map(JsonMetadata) + } + + fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { + match data_type { + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => Ok(()), + data_type => Err(ArrowError::InvalidArgumentError(format!( + "Json data type mismatch, expected one of Utf8, LargeUtf8, Utf8View, found {data_type}" + ))), + } + } + + fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result { + let json = Self(metadata); + json.supports_data_type(data_type)?; + Ok(json) + } +} + +#[cfg(test)] +mod tests { + use serde_json::Map; + + use crate::{ + extension::{CanonicalExtensionType, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY}, + Field, + }; + + use super::*; + + #[test] + fn json() -> Result<(), ArrowError> { + let mut field = Field::new("", DataType::Utf8, false); + field.try_with_extension_type(Json::default())?; + assert_eq!( + field.metadata().get(EXTENSION_TYPE_METADATA_KEY), + Some(&r#""""#.to_owned()) + ); + assert!(field.try_extension_type::().is_ok()); + + 
let mut field = Field::new("", DataType::LargeUtf8, false); + field.try_with_extension_type(Json(JsonMetadata(serde_json::Value::Object( + Map::default(), + ))))?; + assert_eq!( + field.metadata().get(EXTENSION_TYPE_METADATA_KEY), + Some(&"{}".to_owned()) + ); + assert!(field.try_extension_type::().is_ok()); + + let mut field = Field::new("", DataType::Utf8View, false); + field.try_with_extension_type(Json::default())?; + assert!(field.try_extension_type::().is_ok()); + assert_eq!( + field.try_canonical_extension_type().unwrap(), + CanonicalExtensionType::Json(Json::default()) + ); + Ok(()) + } + + #[test] + #[should_panic(expected = "expected one of Utf8, LargeUtf8, Utf8View, found Null")] + fn json_bad_type() { + Field::new("", DataType::Null, false).with_extension_type(Json::default()); + } + + #[test] + fn json_bad_metadata() { + let field = Field::new("", DataType::Utf8, false).with_metadata( + [ + (EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned()), + (EXTENSION_TYPE_METADATA_KEY.to_owned(), "1234".to_owned()), + ] + .into_iter() + .collect(), + ); + // This returns `None` now because this metadata is invalid. + assert!(field.try_extension_type::().is_err()); + } + + #[test] + fn json_missing_metadata() { + let field = Field::new("", DataType::LargeUtf8, false).with_metadata( + [(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned())] + .into_iter() + .collect(), + ); + // This returns `None` now because the metadata is missing. + assert!(field.try_extension_type::().is_err()); + } +} diff --git a/arrow-schema/src/extension/canonical/mod.rs b/arrow-schema/src/extension/canonical/mod.rs new file mode 100644 index 000000000000..778da9ef2c46 --- /dev/null +++ b/arrow-schema/src/extension/canonical/mod.rs @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Canonical extension types. +//! +//! The Arrow columnar format allows defining extension types so as to extend +//! standard Arrow data types with custom semantics. Often these semantics will +//! be specific to a system or application. However, it is beneficial to share +//! the definitions of well-known extension types so as to improve +//! interoperability between different systems integrating Arrow columnar data. +//! +//! + +mod bool8; +pub use bool8::Bool8; +mod fixed_shape_tensor; +pub use fixed_shape_tensor::{FixedShapeTensor, FixedShapeTensorMetadata}; +mod json; +pub use json::{Json, JsonMetadata}; +mod opaque; +pub use opaque::{Opaque, OpaqueMetadata}; +mod uuid; +pub use uuid::Uuid; +mod variable_shape_tensor; +pub use variable_shape_tensor::{VariableShapeTensor, VariableShapeTensorMetadata}; + +/// Canonical extension types. +/// +/// +#[non_exhaustive] +#[derive(Debug, Clone, PartialEq)] +pub enum CanonicalExtensionType { + /// The extension type for `FixedShapeTensor`. + /// + /// + FixedShapeTensor(FixedShapeTensor), + + /// The extension type for `VariableShapeTensor`. + /// + /// + VariableShapeTensor(VariableShapeTensor), + + /// The extension type for 'JSON'. + /// + /// + Json(Json), + + /// The extension type for `UUID`. + /// + /// + Uuid(Uuid), + + /// The extension type for `Opaque`. 
+ /// + /// + Opaque(Opaque), +} + +impl From for CanonicalExtensionType { + fn from(value: FixedShapeTensor) -> Self { + CanonicalExtensionType::FixedShapeTensor(value) + } +} + +impl From for CanonicalExtensionType { + fn from(value: VariableShapeTensor) -> Self { + CanonicalExtensionType::VariableShapeTensor(value) + } +} + +impl From for CanonicalExtensionType { + fn from(value: Json) -> Self { + CanonicalExtensionType::Json(value) + } +} + +impl From for CanonicalExtensionType { + fn from(value: Uuid) -> Self { + CanonicalExtensionType::Uuid(value) + } +} + +impl From for CanonicalExtensionType { + fn from(value: Opaque) -> Self { + CanonicalExtensionType::Opaque(value) + } +} diff --git a/arrow-schema/src/extension/canonical/opaque.rs b/arrow-schema/src/extension/canonical/opaque.rs new file mode 100644 index 000000000000..283ac1e486d6 --- /dev/null +++ b/arrow-schema/src/extension/canonical/opaque.rs @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Opaque +//! +//! + +use serde::{Deserialize, Serialize}; + +use crate::{extension::ExtensionType, ArrowError, DataType}; + +/// The extension type for `Opaque`. +/// +/// Extension name: `arrow.opaque`. 
+/// +/// Opaque represents a type that an Arrow-based system received from an +/// external (often non-Arrow) system, but that it cannot interpret. In this +/// case, it can pass on Opaque to its clients to at least show that a field +/// exists and preserve metadata about the type from the other system. +/// +/// The storage type of this extension is any type. If there is no underlying +/// data, the storage type should be Null. +#[derive(Debug, Clone, PartialEq)] +pub struct Opaque(OpaqueMetadata); + +impl Opaque { + /// Returns a new `Opaque` extension type. + pub fn new(type_name: impl Into, vendor_name: impl Into) -> Self { + Self(OpaqueMetadata::new(type_name, vendor_name)) + } + + /// Returns the name of the unknown type in the external system. + pub fn type_name(&self) -> &str { + &self.0.type_name() + } + + /// Returns the name of the external system. + pub fn vendor_name(&self) -> &str { + &self.0.vendor_name() + } +} + +impl From for Opaque { + fn from(value: OpaqueMetadata) -> Self { + Self(value) + } +} + +/// Extension type metadata for [`Opaque`]. +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +pub struct OpaqueMetadata { + /// Name of the unknown type in the external system. + type_name: String, + + /// Name of the external system. + vendor_name: String, +} + +impl OpaqueMetadata { + /// Returns a new `OpaqueMetadata`. + pub fn new(type_name: impl Into, vendor_name: impl Into) -> Self { + OpaqueMetadata { + type_name: type_name.into(), + vendor_name: vendor_name.into(), + } + } + + /// Returns the name of the unknown type in the external system. + pub fn type_name(&self) -> &str { + &self.type_name + } + + /// Returns the name of the external system. 
+ pub fn vendor_name(&self) -> &str { + &self.vendor_name + } +} + +impl ExtensionType for Opaque { + const NAME: &str = "arrow.opaque"; + + type Metadata = OpaqueMetadata; + + fn metadata(&self) -> &Self::Metadata { + &self.0 + } + + fn serialize_metadata(&self) -> Option { + Some(serde_json::to_string(self.metadata()).expect("metadata serialization")) + } + + fn deserialize_metadata(metadata: Option<&str>) -> Result { + metadata.map_or_else( + || { + Err(ArrowError::InvalidArgumentError( + "Opaque extension types requires metadata".to_owned(), + )) + }, + |value| { + serde_json::from_str(value).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Opaque metadata deserialization failed: {e}" + )) + }) + }, + ) + } + + fn supports_data_type(&self, _data_type: &DataType) -> Result<(), ArrowError> { + // Any type + Ok(()) + } + + fn try_new(_data_type: &DataType, metadata: Self::Metadata) -> Result { + Ok(Self::from(metadata)) + } +} diff --git a/arrow-schema/src/extension/canonical/uuid.rs b/arrow-schema/src/extension/canonical/uuid.rs new file mode 100644 index 000000000000..206856265ae5 --- /dev/null +++ b/arrow-schema/src/extension/canonical/uuid.rs @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! UUID +//! +//! + +use crate::{extension::ExtensionType, ArrowError, DataType}; + +/// The extension type for `UUID`. +/// +/// Extension name: `arrow.uuid`. +/// +/// The storage type of the extension is `FixedSizeBinary` with a length of +/// 16 bytes. +/// +/// Note: +/// A specific UUID version is not required or guaranteed. This extension +/// represents UUIDs as `FixedSizeBinary(16)` with big-endian notation and +/// does not interpret the bytes in any way. +/// +/// +#[derive(Debug, Default, Clone, Copy, PartialEq)] +pub struct Uuid; + +impl ExtensionType for Uuid { + const NAME: &str = "arrow.uuid"; + + type Metadata = (); + + fn metadata(&self) -> &Self::Metadata { + &() + } + + fn serialize_metadata(&self) -> Option { + None + } + + fn deserialize_metadata(metadata: Option<&str>) -> Result { + metadata.map_or_else( + || Ok(()), + |_| { + Err(ArrowError::InvalidArgumentError( + "Uuid extension type expects no metadata".to_owned(), + )) + }, + ) + } + + fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { + match data_type { + DataType::FixedSizeBinary(16) => Ok(()), + data_type => Err(ArrowError::InvalidArgumentError(format!( + "Uuid data type mismatch, expected FixedSizeBinary(16), found {data_type}" + ))), + } + } + + fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result { + Self.supports_data_type(data_type).map(|_| Self) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + extension::{CanonicalExtensionType, EXTENSION_TYPE_METADATA_KEY}, + Field, + }; + + use super::*; + + #[test] + fn uuid() -> Result<(), ArrowError> { + let mut field = Field::new("", DataType::FixedSizeBinary(16), false); + field.try_with_extension_type(Uuid)?; + assert!(field.try_extension_type::().is_ok()); + assert_eq!( + field.try_canonical_extension_type().unwrap(), + CanonicalExtensionType::Uuid(Uuid) + ); + Ok(()) + } + + 
#[test] + #[should_panic(expected = "expected FixedSizeBinary(16), found FixedSizeBinary(8)")] + fn uuid_bad_type() { + Field::new("", DataType::FixedSizeBinary(8), false).with_extension_type(Uuid); + } + + #[test] + fn uuid_with_metadata() { + // Add metadata that's not expected for uuid. + let field = Field::new("", DataType::FixedSizeBinary(16), false) + .with_extension_type(Uuid) + .with_metadata( + [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned())] + .into_iter() + .collect(), + ); + // This returns an error now because `Uuid` expects no metadata. + assert!(field.try_extension_type::().is_err()); + } +} diff --git a/arrow-schema/src/extension/canonical/variable_shape_tensor.rs b/arrow-schema/src/extension/canonical/variable_shape_tensor.rs new file mode 100644 index 000000000000..8730d6765715 --- /dev/null +++ b/arrow-schema/src/extension/canonical/variable_shape_tensor.rs @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! VariableShapeTensor +//! +//! + +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +use crate::{extension::ExtensionType, ArrowError, DataType, Field}; + +/// The extension type for `VariableShapeTensor`. 
+/// +/// +/// +/// +#[derive(Debug, Clone, PartialEq)] +pub struct VariableShapeTensor { + /// The data type of individual tensor elements. + value_type: DataType, + + /// The number of dimensions of the tensor. + dimensions: usize, + + /// The metadata of this extension type. + metadata: VariableShapeTensorMetadata, +} + +impl VariableShapeTensor { + /// Returns a new variable shape tensor extension type. + /// + /// # Error + /// + /// Return an error if the provided dimension names or permutations are + /// invalid. + pub fn try_new( + _value_type: DataType, + _dimensions: usize, + _dimension_names: Option>>, + _permutations: Option>>, + ) -> Result { + todo!() + } + + /// Returns the value type of the individual tensor elements. + pub fn value_type(&self) -> &DataType { + &self.value_type + } + + /// Returns the number of dimensions in this variable shape tensor. + pub fn dimensions(&self) -> usize { + self.dimensions + } + + /// Returns the names of the dimensions in this variable shape tensor, if + /// set. + pub fn dimension_names(&self) -> Option<&[String]> { + self.metadata.dimension_names() + } + + /// Returns the indices of the desired ordering of the original + /// dimensions, if set. + pub fn permutations(&self) -> Option<&[usize]> { + self.metadata.permutations() + } +} + +/// Extension type metadata for [`VariableShapeTensor`]. +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] +pub struct VariableShapeTensorMetadata { + /// Explicit names to tensor dimensions. + dim_names: Option>, + + /// Indices of the desired ordering of the original dimensions. + permutations: Option>, +} + +impl VariableShapeTensorMetadata { + /// Returns the names of the dimensions in this variable shape tensor, if + /// set. + pub fn dimension_names(&self) -> Option<&[String]> { + self.dim_names.as_ref().map(AsRef::as_ref) + } + + /// Returns the indices of the desired ordering of the original + /// dimensions, if set. 
+ pub fn permutations(&self) -> Option<&[usize]> { + self.permutations.as_ref().map(AsRef::as_ref) + } +} + +impl ExtensionType for VariableShapeTensor { + const NAME: &str = "arrow.variable_shape_tensor"; + + type Metadata = VariableShapeTensorMetadata; + + fn metadata(&self) -> &Self::Metadata { + &self.metadata + } + + fn serialize_metadata(&self) -> Option { + Some(serde_json::to_string(self.metadata()).expect("metadata serialization")) + } + + fn deserialize_metadata(metadata: Option<&str>) -> Result { + metadata.map_or_else( + || { + Err(ArrowError::InvalidArgumentError( + "VariableShapeTensor extension types requires metadata".to_owned(), + )) + }, + |value| { + serde_json::from_str(value).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "VariableShapeTensor metadata deserialization failed: {e}" + )) + }) + }, + ) + } + + fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { + let expected = DataType::Struct( + [ + Field::new_list( + "data", + Field::new_list_field(self.value_type.clone(), false), + false, + ), + Field::new( + "shape", + DataType::new_fixed_size_list( + DataType::Int32, + i32::try_from(self.dimensions()).expect("overflow"), + false, + ), + false, + ), + ] + .into_iter() + .map(Arc::new) + .collect(), + ); + data_type + .equals_datatype(&expected) + .then_some(()) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "VariableShapeTensor data type mismatch, expected {expected}, found {data_type}" + )) + }) + } + + fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result { + match data_type { + DataType::Struct(fields) + if fields.len() == 2 + && matches!(fields.find("data"), Some((0, _))) + && matches!(fields.find("shape"), Some((1, _))) => + { + let _data_field = &fields[0]; + let _shape_field = &fields[1]; + todo!() + } + data_type => Err(ArrowError::InvalidArgumentError(format!( + "VariableShapeTensor data type mismatch, expected Struct with 2 fields (data and shape), found 
{data_type}" + ))), + } + } +} diff --git a/arrow-schema/src/extension/mod.rs b/arrow-schema/src/extension/mod.rs new file mode 100644 index 000000000000..583334579229 --- /dev/null +++ b/arrow-schema/src/extension/mod.rs @@ -0,0 +1,250 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Extension types. + +#[cfg(feature = "canonical-extension-types")] +mod canonical; +#[cfg(feature = "canonical-extension-types")] +pub use canonical::*; + +use crate::{ArrowError, DataType}; + +/// The metadata key for the string name identifying an [`ExtensionType`]. +pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name"; + +/// The metadata key for a serialized representation of the [`ExtensionType`] +/// necessary to reconstruct the custom type. +pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata"; + +/// Extension types. +/// +/// User-defined “extension” types can be defined setting certain key value +/// pairs in the [`Field`] metadata structure. These extension keys are: +/// - [`EXTENSION_TYPE_NAME_KEY`] +/// - [`EXTENSION_TYPE_METADATA_KEY`] +/// +/// Canonical extension types support in this crate requires the +/// `canonical-extension-types` feature. 
+/// +/// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`] +/// field. +/// +/// # Example +/// +/// The example below demonstrates how to implement this trait for a `Uuid` +/// type. Note this is not the canonical extension type for `Uuid`, which does +/// not include information about the `Uuid` version. +/// +/// ``` +/// # use arrow_schema::ArrowError; +/// # fn main() -> Result<(), ArrowError> { +/// use arrow_schema::{DataType, extension::ExtensionType, Field}; +/// use std::{fmt, str::FromStr}; +/// +/// /// The different Uuid versions. +/// #[derive(Clone, Copy, Debug, PartialEq)] +/// enum UuidVersion { +/// V1, +/// V2, +/// V3, +/// V4, +/// V5, +/// V6, +/// V7, +/// V8, +/// } +/// +/// // We'll use `Display` to serialize. +/// impl fmt::Display for UuidVersion { +/// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +/// write!( +/// f, +/// "{}", +/// match self { +/// Self::V1 => "V1", +/// Self::V2 => "V2", +/// Self::V3 => "V3", +/// Self::V4 => "V4", +/// Self::V5 => "V5", +/// Self::V6 => "V6", +/// Self::V7 => "V7", +/// Self::V8 => "V8", +/// } +/// ) +/// } +/// } +/// +/// // And `FromStr` to deserialize. +/// impl FromStr for UuidVersion { +/// type Err = ArrowError; +/// +/// fn from_str(s: &str) -> Result { +/// match s { +/// "V1" => Ok(Self::V1), +/// "V2" => Ok(Self::V2), +/// "V3" => Ok(Self::V3), +/// "V4" => Ok(Self::V4), +/// "V5" => Ok(Self::V5), +/// "V6" => Ok(Self::V6), +/// "V7" => Ok(Self::V7), +/// "V8" => Ok(Self::V8), +/// _ => Err(ArrowError::ParseError("Invalid UuidVersion".to_owned())), +/// } +/// } +/// } +/// +/// /// This is the extension type, not the container for Uuid values. It +/// /// stores the Uuid version (this is the metadata of this extension type). +/// #[derive(Clone, Copy, Debug, PartialEq)] +/// struct Uuid(UuidVersion); +/// +/// impl ExtensionType for Uuid { +/// // We use a namespace as suggested by the specification. 
+/// const NAME: &str = "myorg.example.uuid"; +/// +/// // The metadata type is the Uuid version. +/// type Metadata = UuidVersion; +/// +/// // We just return a reference to the Uuid version. +/// fn metadata(&self) -> &Self::Metadata { +/// &self.0 +/// } +/// +/// // We use the `Display` implementation to serialize the Uuid +/// // version. +/// fn serialize_metadata(&self) -> Option { +/// Some(self.0.to_string()) +/// } +/// +/// // We use the `FromStr` implementation to deserialize the Uuid +/// // version. +/// fn deserialize_metadata(metadata: Option<&str>) -> Result { +/// metadata.map_or_else( +/// || { +/// Err(ArrowError::InvalidArgumentError( +/// "Uuid extension type metadata missing".to_owned(), +/// )) +/// }, +/// str::parse, +/// ) +/// } +/// +/// // The only supported data type is `FixedSizeBinary(16)`. +/// fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { +/// match data_type { +/// DataType::FixedSizeBinary(16) => Ok(()), +/// data_type => Err(ArrowError::InvalidArgumentError(format!( +/// "Uuid data type mismatch, expected FixedSizeBinary(16), found {data_type}" +/// ))), +/// } +/// } +/// +/// // We should always check if the data type is supported before +/// // constructing the extension type. +/// fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result { +/// let uuid = Self(metadata); +/// uuid.supports_data_type(data_type)?; +/// Ok(uuid) +/// } +/// } +/// +/// // We can now construct the extension type. +/// let uuid_v1 = Uuid(UuidVersion::V1); +/// +/// // And add it to a field. +/// let mut field = +/// Field::new("", DataType::FixedSizeBinary(16), false).with_extension_type(uuid_v1); +/// +/// // And extract it from this field. +/// assert_eq!(field.try_extension_type::()?, uuid_v1); +/// +/// // When we try to add this to a field with an unsupported data type we +/// // get an error. 
+/// let result = Field::new("", DataType::Null, false).try_with_extension_type(uuid_v1); +/// assert!(result.is_err()); +/// # Ok(()) } +/// ``` +/// +/// +/// +/// [`Field`]: crate::Field +pub trait ExtensionType: Sized { + /// The name identifying this extension type. + /// + /// This is the string value that is used for the + /// [`EXTENSION_TYPE_NAME_KEY`] in the [`Field::metadata`] of a [`Field`] + /// to identify this extension type. + /// + /// We recommend that you use a “namespace”-style prefix for extension + /// type names to minimize the possibility of conflicts with multiple Arrow + /// readers and writers in the same application. For example, use + /// `myorg.name_of_type` instead of simply `name_of_type`. + /// + /// Extension names beginning with `arrow.` are reserved for canonical + /// extension types, they should not be used for third-party extension + /// types. + /// + /// [`Field`]: crate::Field + /// [`Field::metadata`]: crate::Field::metadata + const NAME: &str; + + /// The metadata type of this extension type. + /// + /// If an extension type defines no metadata it should use `()` to indicate + /// this. + type Metadata; + + /// Returns a reference to the metadata of this extension type, or `&()` if + /// this extension type defines no metadata (`Self::Metadata=()`). + fn metadata(&self) -> &Self::Metadata; + + /// Returns the serialized representation of the metadata of this extension + /// type, or `None` if this extension type defines no metadata + /// (`Self::Metadata=()`). + /// + /// This is the string value that is used for the + /// [`EXTENSION_TYPE_METADATA_KEY`] in the [`Field::metadata`] of a + /// [`Field`]. + /// + /// [`Field`]: crate::Field + /// [`Field::metadata`]: crate::Field::metadata + fn serialize_metadata(&self) -> Option; + + /// Deserialize the metadata of this extension type from the serialized + /// representation of the metadata.
An extension type that defines no + /// metadata should expect `None` for the serialized metadata and return + /// `Ok(())`. + /// + /// This function should return an error when + /// - expected metadata is missing (for extension types with non-optional + /// metadata) + /// - unexpected metadata is set (for extension types without metadata) + /// - deserialization of metadata fails + fn deserialize_metadata(metadata: Option<&str>) -> Result; + + /// Returns `Ok(())` iff the given data type is supported by this extension + /// type. + fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError>; + + /// Construct this extension type for a field with the given data type and + /// metadata. + /// + /// This should return an error if the given data type is not supported by + /// this extension type. + fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result; +} diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index f16e2f9bbc05..5d373496fae7 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -use crate::canonical_extension_types::{CanonicalExtensionTypes, Json, Uuid}; use crate::error::ArrowError; use std::cmp::Ordering; use std::collections::HashMap; @@ -23,10 +22,12 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use crate::datatype::DataType; +#[cfg(feature = "canonical-extension-types")] +use crate::extension::CanonicalExtensionType; use crate::schema::SchemaBuilder; use crate::{ - ExtensionType, ExtensionTypeExt, Fields, UnionFields, UnionMode, EXTENSION_TYPE_METADATA_KEY, - EXTENSION_TYPE_NAME_KEY, + extension::{ExtensionType, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY}, + Fields, UnionFields, UnionMode, }; /// A reference counted [`Field`] @@ -341,48 +342,139 @@ impl Field { self } - /// Returns the given [`ExtensionType`] of this [`Field`], if set.
- /// Returns `None` if this field does not have this extension type. - pub fn extension_type(&self) -> Option { - E::try_from_field(self) + /// Returns the extension type name of this [`Field`], if set. + /// + /// This returns the value of [`EXTENSION_TYPE_NAME_KEY`], if set in + /// [`Field::metadata`]. If the key is missing, there is no extension type + /// name and this returns `None`. + /// + /// # Example + /// + /// ``` + /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_NAME_KEY, Field}; + /// + /// let field = Field::new("", DataType::Null, false); + /// assert_eq!(field.extension_type_name(), None); + /// + /// let field = Field::new("", DataType::Null, false).with_metadata( + /// [(EXTENSION_TYPE_NAME_KEY.to_owned(), "example".to_owned())] + /// .into_iter() + /// .collect(), + /// ); + /// assert_eq!(field.extension_type_name(), Some("example")); + /// ``` + pub fn extension_type_name(&self) -> Option<&str> { + self.metadata() + .get(EXTENSION_TYPE_NAME_KEY) + .map(String::as_ref) } - /// Returns the [`CanonicalExtensionTypes`] of this [`Field`], if set. - pub fn canonical_extension_type(&self) -> Option { - Json::try_from_field(self) - .map(Into::into) - .or(Uuid::try_from_field(self).map(Into::into)) + /// Returns the extension type metadata of this [`Field`], if set. + /// + /// This returns the value of [`EXTENSION_TYPE_METADATA_KEY`], if set in + /// [`Field::metadata`]. If the key is missing, there is no extension type + /// metadata and this returns `None`. 
+ /// + /// # Example + /// + /// ``` + /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_METADATA_KEY, Field}; + /// + /// let field = Field::new("", DataType::Null, false); + /// assert_eq!(field.extension_type_metadata(), None); + /// + /// let field = Field::new("", DataType::Null, false).with_metadata( + /// [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "example".to_owned())] + /// .into_iter() + /// .collect(), + /// ); + /// assert_eq!(field.extension_type_metadata(), Some("example")); + /// ``` + pub fn extension_type_metadata(&self) -> Option<&str> { + self.metadata() + .get(EXTENSION_TYPE_METADATA_KEY) + .map(String::as_ref) + } + + /// Returns an instance of the given [`ExtensionType`] of this [`Field`], + /// if set in the [`Field::metadata`]. + /// + /// # Error + /// + /// Returns an error if + /// - this field does not have the name of this extension type + /// ([`ExtensionType::NAME`]) in the [`Field::metadata`] (mismatch or + /// missing) + /// - the deserialization of the metadata + /// ([`ExtensionType::deserialize_metadata`]) fails + /// - the construction of the extension type ([`ExtensionType::try_new`]) + /// fail (for example when the [`Field::data_type`] is not supported by + /// the extension type ([`ExtensionType::supports_data_type`])) + pub fn try_extension_type(&self) -> Result { + // Check the extension name in the metadata + match self.extension_type_name() { + // It should match the name of the given extension type + Some(name) if name == E::NAME => { + // Deserialize the metadata and try to construct the extension + // type + E::deserialize_metadata(self.extension_type_metadata()) + .and_then(|metadata| E::try_new(self.data_type(), metadata)) + } + // Name mismatch + Some(name) => Err(ArrowError::InvalidArgumentError(format!( + "Field extension type name mismatch, expected {}, found {name}", + E::NAME + ))), + // Name missing + None => Err(ArrowError::InvalidArgumentError( + "Field extension type name missing".to_owned(), + 
)), + } + } + + /// Returns an instance of the given [`ExtensionType`] of this [`Field`], + /// panics if this [`Field`] does not have this extension type. + /// + /// # Panics + /// + /// This calls [`Field::try_extension_type`] and panics when it returns an + /// error. + pub fn extension_type(&self) -> E { + self.try_extension_type::() + .unwrap_or_else(|e| panic!("{e}")) } /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`] - /// and [`ExtensionType::metadata`] of the given [`ExtensionType`]. + /// and [`ExtensionType::metadata`] of the given [`ExtensionType`], if the + /// given extension type supports the [`Field::data_type`] of this field + /// ([`ExtensionType::supports_data_type`]). + /// + /// If the given extension type defines no metadata, a previously set + /// value of [`EXTENSION_TYPE_METADATA_KEY`] is cleared. /// /// # Error /// - /// This functions returns an error if the datatype of this field does not - /// match the storage type of the given extension type. + /// This function returns an error if the data type of this field does not + /// match any of the supported storage types of the given extension type.
pub fn try_with_extension_type( &mut self, extension_type: E, ) -> Result<(), ArrowError> { - if extension_type.supports(&self.data_type) { - // Insert the name - self.metadata - .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned()); - // Insert the metadata, if any - if let Some(metadata) = extension_type.serialized_metadata() { - self.metadata - .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata); - } - Ok(()) - } else { - Err(ArrowError::InvalidArgumentError(format!( - "storage type of extension type {} does not match field data type, expected {}, found {}", - ::NAME, - extension_type.storage_types().iter().map(ToString::to_string).collect::>().join(" or "), - self.data_type - ))) - } + // Make sure the data type of this field is supported + extension_type.supports_data_type(&self.data_type)?; + + self.metadata + .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned()); + match extension_type.serialize_metadata() { + Some(metadata) => self + .metadata + .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata), + // If this extension type has no metadata, we make sure to + // clear previously set metadata. + None => self.metadata.remove(EXTENSION_TYPE_METADATA_KEY), + }; + + Ok(()) } /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`] @@ -390,14 +482,44 @@ impl Field { /// /// # Panics /// - /// This functions panics if the datatype of this field does match the - /// storage type of the given extension type. + /// This calls [`Field::try_with_extension_type`] and panics when it + /// returns an error. pub fn with_extension_type(mut self, extension_type: E) -> Self { self.try_with_extension_type(extension_type) .unwrap_or_else(|e| panic!("{e}")); self } + /// Returns the [`CanonicalExtensionType`] of this [`Field`], if set. 
+ /// + /// # Error + /// + /// Returns an error if + /// - this field does not have a canonical extension type (mismatch or missing) + /// - the canonical extension is not supported + /// - the construction of the extension type fails + #[cfg(feature = "canonical-extension-types")] + pub fn try_canonical_extension_type(&self) -> Result { + use crate::extension::{FixedShapeTensor, Json, Uuid}; + + // Canonical extension type names start with `arrow.` + match self.extension_type_name() { + // An extension type name with an `arrow.` prefix + Some(name) if name.starts_with("arrow.") => match name { + FixedShapeTensor::NAME => self.try_extension_type::().map(Into::into), + Json::NAME => self.try_extension_type::().map(Into::into), + Uuid::NAME => self.try_extension_type::().map(Into::into), + _ => Err(ArrowError::InvalidArgumentError(format!("Unsupported canonical extension type: {name}"))), + }, + // Name missing the expected prefix + Some(name) => Err(ArrowError::InvalidArgumentError(format!( + "Field extension type name mismatch, expected a name with an `arrow.` prefix, found {name}" + ))), + // Name missing + None => Err(ArrowError::InvalidArgumentError("Field extension type name missing".to_owned())), + } + } + /// Indicates whether this [`Field`] supports null values.
#[inline] pub const fn is_nullable(&self) -> bool { diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index d06382fbcdf7..a83e23e27592 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -25,6 +25,7 @@ use std::fmt::Display; mod datatype_parse; mod error; pub use error::*; +pub mod extension; mod field; pub use field::*; mod fields; diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 1d38e67a0f02..00f42d598be5 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -102,6 +102,8 @@ default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] lz4 = ["lz4_flex"] # Enable arrow reader/writer APIs arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"] +# Enable support for arrow canonical extension types +arrow-canonical-extension-types = ["arrow-schema?/canonical-extension-types"] # Enable CLI tools cli = ["json", "base64", "clap", "arrow-csv", "serde"] # Enable JSON APIs diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 8a15037825d0..82bcc8db6a8e 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -23,13 +23,14 @@ //! //! The interfaces for converting arrow schema to parquet schema is coming. -use arrow_schema::canonical_extension_types::Uuid; use base64::prelude::BASE64_STANDARD; use base64::Engine; use std::collections::HashMap; use std::sync::Arc; use arrow_ipc::writer; +#[cfg(feature = "arrow-canonical-extension-types")] +use arrow_schema::extension::Uuid; use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit}; use crate::basic::{ @@ -472,8 +473,16 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .with_id(id) .with_length(*length) - // If set, map arrow uuid extension type to parquet uuid logical type. 
- .with_logical_type(field.extension_type::().map(|_| LogicalType::Uuid)) + .with_logical_type( + #[cfg(feature = "arrow-canonical-extension-types")] + // If set, map arrow uuid extension type to parquet uuid logical type. + field + .try_extension_type::() + .ok() + .map(|_| LogicalType::Uuid), + #[cfg(not(feature = "arrow-canonical-extension-types"))] + None, + ) .build() } DataType::BinaryView => Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) @@ -1937,6 +1946,7 @@ mod tests { } #[test] + #[cfg(feature = "arrow-canonical-extension-types")] fn arrow_uuid_to_parquet_uuid() -> Result<()> { let arrow_schema = Schema::new(vec![Field::new( "uuid",