mbrobbel commented on code in PR #5822: URL: https://github.com/apache/arrow-rs/pull/5822#discussion_r1925528709
########## arrow-schema/src/extension/mod.rs: ########## @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Extension types. + +#[cfg(feature = "canonical-extension-types")] +mod canonical; +#[cfg(feature = "canonical-extension-types")] +pub use canonical::*; + +use crate::{ArrowError, DataType}; + +/// The metadata key for the string name identifying an [`ExtensionType`]. +pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name"; + +/// The metadata key for a serialized representation of the [`ExtensionType`] +/// necessary to reconstruct the custom type. +pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata"; + +/// Extension types. +/// +/// User-defined “extension” types can be defined setting certain key value +/// pairs in the [`Field`] metadata structure. These extension keys are: +/// - [`EXTENSION_TYPE_NAME_KEY`] +/// - [`EXTENSION_TYPE_METADATA_KEY`] +/// +/// Canonical extension types support in this crate requires the +/// `canonical-extension-types` feature. +/// +/// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`] +/// field. +/// +/// # Example +/// +/// The example below demonstrates how to implement this trait for a `Uuid` +/// type. Note this is not the canonical extension type for `Uuid`, which does +/// not include information about the `Uuid` version. +/// +/// ``` +/// # use arrow_schema::ArrowError; +/// # fn main() -> Result<(), ArrowError> { +/// use arrow_schema::{DataType, extension::ExtensionType, Field}; +/// use std::{fmt, str::FromStr}; +/// +/// /// The different Uuid versions. +/// #[derive(Clone, Copy, Debug, PartialEq)] +/// enum UuidVersion { +/// V1, +/// V2, +/// V3, +/// V4, +/// V5, +/// V6, +/// V7, +/// V8, +/// } +/// +/// // We'll use `Display` to serialize. +/// impl fmt::Display for UuidVersion { +/// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +/// write!( +/// f, +/// "{}", +/// match self { +/// Self::V1 => "V1", +/// Self::V2 => "V2", +/// Self::V3 => "V3", +/// Self::V4 => "V4", +/// Self::V5 => "V5", +/// Self::V6 => "V6", +/// Self::V7 => "V7", +/// Self::V8 => "V8", +/// } +/// ) +/// } +/// } +/// +/// // And `FromStr` to deserialize. +/// impl FromStr for UuidVersion { +/// type Err = ArrowError; +/// +/// fn from_str(s: &str) -> Result<Self, Self::Err> { +/// match s { +/// "V1" => Ok(Self::V1), +/// "V2" => Ok(Self::V2), +/// "V3" => Ok(Self::V3), +/// "V4" => Ok(Self::V4), +/// "V5" => Ok(Self::V5), +/// "V6" => Ok(Self::V6), +/// "V7" => Ok(Self::V7), +/// "V8" => Ok(Self::V8), +/// _ => Err(ArrowError::ParseError("Invalid UuidVersion".to_owned())), +/// } +/// } +/// } +/// +/// /// This is the extension type, not the container for Uuid values. It +/// /// stores the Uuid version (this is the metadata of this extension type). +/// #[derive(Clone, Copy, Debug, PartialEq)] +/// struct Uuid(UuidVersion); +/// +/// impl ExtensionType for Uuid { +/// // We use a namespace as suggested by the specification. +/// const NAME: &'static str = "myorg.example.uuid"; +/// +/// // The metadata type is the Uuid version. +/// type Metadata = UuidVersion; +/// +/// // We just return a reference to the Uuid version. +/// fn metadata(&self) -> &Self::Metadata { +/// &self.0 +/// } +/// +/// // We use the `Display` implementation to serialize the Uuid +/// // version. +/// fn serialize_metadata(&self) -> Option<String> { +/// Some(self.0.to_string()) +/// } +/// +/// // We use the `FromStr` implementation to deserialize the Uuid +/// // version. +/// fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> { +/// metadata.map_or_else( +/// || { +/// Err(ArrowError::InvalidArgumentError( +/// "Uuid extension type metadata missing".to_owned(), +/// )) +/// }, +/// str::parse, +/// ) +/// } +/// +/// // The only supported data type is `FixedSizeBinary(16)`. +/// fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { +/// match data_type { +/// DataType::FixedSizeBinary(16) => Ok(()), +/// data_type => Err(ArrowError::InvalidArgumentError(format!( +/// "Uuid data type mismatch, expected FixedSizeBinary(16), found {data_type}" +/// ))), +/// } +/// } +/// +/// // We should always check if the data type is supported before +/// // constructing the extension type. +/// fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> { +/// let uuid = Self(metadata); +/// uuid.supports_data_type(data_type)?; +/// Ok(uuid) +/// } +/// } +/// +/// // We can now construct the extension type. +/// let uuid_v1 = Uuid(UuidVersion::V1); +/// +/// // And add it to a field. +/// let mut field = +/// Field::new("", DataType::FixedSizeBinary(16), false).with_extension_type(uuid_v1); +/// +/// // And extract it from this field. +/// assert_eq!(field.try_extension_type::<Uuid>()?, uuid_v1); +/// +/// // When we try to add this to a field with an unsupported data type we +/// // get an error. +/// let result = Field::new("", DataType::Null, false).try_with_extension_type(uuid_v1); +/// assert!(result.is_err()); +/// # Ok(()) } +/// ``` +/// +/// <https://arrow.apache.org/docs/format/Columnar.html#extension-types> +/// +/// [`Field`]: crate::Field +pub trait ExtensionType: Sized { + /// The name identifying this extension type. + /// + /// This is the string value that is used for the + /// [`EXTENSION_TYPE_NAME_KEY`] in the [`Field::metadata`] of a [`Field`] + /// to identify this extension type. + /// + /// We recommend that you use a “namespace”-style prefix for extension + /// type names to minimize the possibility of conflicts with multiple Arrow + /// readers and writers in the same application. For example, use + /// `myorg.name_of_type` instead of simply `name_of_type`. + /// + /// Extension names beginning with `arrow.` are reserved for canonical + /// extension types, they should not be used for third-party extension + /// types. + /// + /// Extension names are case-sensitive. + /// + /// [`Field`]: crate::Field + /// [`Field::metadata`]: crate::Field::metadata + const NAME: &'static str; + + /// The metadata type of this extension type. + /// + /// If an extension type defines no metadata it should use `()` to indicate + /// this. + type Metadata; + + /// Returns a reference to the metadata of this extension type, or `&()` if + /// if this extension type defines no metadata (`Self::Metadata=()`). + fn metadata(&self) -> &Self::Metadata; + + /// Returns the serialized representation of the metadata of this extension + /// type, or `None` if this extension type defines no metadata + /// (`Self::Metadata=()`). + /// + /// This is string value that is used for the + /// [`EXTENSION_TYPE_METADATA_KEY`] in the [`Field::metadata`] of a + /// [`Field`]. + /// + /// [`Field`]: crate::Field + /// [`Field::metadata`]: crate::Field::metadata + fn serialize_metadata(&self) -> Option<String>; Review Comment: Per https://arrow.apache.org/docs/format/Columnar.html#extension-types: > Extension types may or may not use the 'ARROW:extension:metadata' field. Which I interpret as; it should also work when this key is missing in the field metadata for extension types that don't require metadata. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
