kylebarron commented on code in PR #5822:
URL: https://github.com/apache/arrow-rs/pull/5822#discussion_r1925627198


##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -2163,4 +2211,52 @@ mod tests {
     fn test_get_arrow_schema_from_metadata() {
         assert!(get_arrow_schema_from_metadata("").is_err());
     }
+
+    #[test]
+    #[cfg(feature = "arrow_canonical_extension_types")]
+    fn arrow_uuid_to_parquet_uuid() -> Result<()> {
+        let arrow_schema = Schema::new(vec![Field::new(
+            "uuid",
+            DataType::FixedSizeBinary(16),
+            false,
+        )
+        .with_extension_type(Uuid)]);
+
+        let parquet_schema = 
ArrowSchemaConverter::new().convert(&arrow_schema)?;
+
+        assert_eq!(
+            parquet_schema.column(0).logical_type(),
+            Some(LogicalType::Uuid)
+        );
+
+        // TODO: roundtrip
+        // let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?;
+        // assert_eq!(arrow_schema.field(0).try_extension_type::<Uuid>()?, 
Uuid);
+
+        Ok(())
+    }
+
+    #[test]
+    #[cfg(feature = "arrow_canonical_extension_types")]
+    fn arrow_json_to_parquet_json() -> Result<()> {
+        let arrow_schema = Schema::new(vec![
+            Field::new("json", DataType::Utf8, 
false).with_extension_type(Json::default())
+        ]);
+
+        let parquet_schema = 
ArrowSchemaConverter::new().convert(&arrow_schema)?;
+
+        assert_eq!(
+            parquet_schema.column(0).logical_type(),
+            Some(LogicalType::Json)
+        );
+
+        // TODO: roundtrip
+        // let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?;
+        // assert_eq!(
+        //     arrow_schema.field(0).try_extension_type::<Json>()?,
+        //     Json::default()
+        // );

Review Comment:
   Is this left for a future PR?



##########
arrow-schema/src/extension/mod.rs:
##########
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Extension types.
+
+#[cfg(feature = "canonical_extension_types")]
+mod canonical;
+#[cfg(feature = "canonical_extension_types")]
+pub use canonical::*;
+
+use crate::{ArrowError, DataType};
+
+/// The metadata key for the string name identifying an [`ExtensionType`].
+pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";
+
+/// The metadata key for a serialized representation of the [`ExtensionType`]
+/// necessary to reconstruct the custom type.
+pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";
+
+/// Extension types.
+///
+/// User-defined “extension” types can be defined setting certain key value
+/// pairs in the [`Field`] metadata structure. These extension keys are:
+/// - [`EXTENSION_TYPE_NAME_KEY`]
+/// - [`EXTENSION_TYPE_METADATA_KEY`]
+///
+/// Canonical extension types support in this crate requires the
+/// `canonical_extension_types` feature.
+///
+/// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`]
+/// field.
+///
+/// # Example
+///
+/// The example below demonstrates how to implement this trait for a `Uuid`
+/// type. Note this is not the canonical extension type for `Uuid`, which does
+/// not include information about the `Uuid` version.
+///
+/// ```
+/// # use arrow_schema::ArrowError;
+/// # fn main() -> Result<(), ArrowError> {
+/// use arrow_schema::{DataType, extension::ExtensionType, Field};
+/// use std::{fmt, str::FromStr};
+///
+/// /// The different Uuid versions.
+/// #[derive(Clone, Copy, Debug, PartialEq)]
+/// enum UuidVersion {
+///     V1,
+///     V2,
+///     V3,
+///     V4,
+///     V5,
+///     V6,
+///     V7,
+///     V8,
+/// }
+///
+/// // We'll use `Display` to serialize.
+/// impl fmt::Display for UuidVersion {
+///     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+///         write!(
+///             f,
+///             "{}",
+///             match self {
+///                 Self::V1 => "V1",
+///                 Self::V2 => "V2",
+///                 Self::V3 => "V3",
+///                 Self::V4 => "V4",
+///                 Self::V5 => "V5",
+///                 Self::V6 => "V6",
+///                 Self::V7 => "V7",
+///                 Self::V8 => "V8",
+///             }
+///         )
+///     }
+/// }
+///
+/// // And `FromStr` to deserialize.
+/// impl FromStr for UuidVersion {
+///     type Err = ArrowError;
+///
+///     fn from_str(s: &str) -> Result<Self, Self::Err> {
+///         match s {
+///             "V1" => Ok(Self::V1),
+///             "V2" => Ok(Self::V2),
+///             "V3" => Ok(Self::V3),
+///             "V4" => Ok(Self::V4),
+///             "V5" => Ok(Self::V5),
+///             "V6" => Ok(Self::V6),
+///             "V7" => Ok(Self::V7),
+///             "V8" => Ok(Self::V8),
+///             _ => Err(ArrowError::ParseError("Invalid 
UuidVersion".to_owned())),
+///         }
+///     }
+/// }
+///
+/// /// This is the extension type, not the container for Uuid values. It
+/// /// stores the Uuid version (this is the metadata of this extension type).
+/// #[derive(Clone, Copy, Debug, PartialEq)]
+/// struct Uuid(UuidVersion);
+///
+/// impl ExtensionType for Uuid {
+///     // We use a namespace as suggested by the specification.
+///     const NAME: &'static str = "myorg.example.uuid";
+///
+///     // The metadata type is the Uuid version.
+///     type Metadata = UuidVersion;
+///
+///     // We just return a reference to the Uuid version.
+///     fn metadata(&self) -> &Self::Metadata {
+///         &self.0
+///     }
+///
+///     // We use the `Display` implementation to serialize the Uuid
+///     // version.
+///     fn serialize_metadata(&self) -> Option<String> {
+///         Some(self.0.to_string())
+///     }
+///
+///     // We use the `FromStr` implementation to deserialize the Uuid
+///     // version.
+///     fn deserialize_metadata(metadata: Option<&str>) -> 
Result<Self::Metadata, ArrowError> {
+///         metadata.map_or_else(
+///             || {
+///                 Err(ArrowError::InvalidArgumentError(
+///                     "Uuid extension type metadata missing".to_owned(),
+///                 ))
+///             },
+///             str::parse,
+///         )
+///     }
+///
+///     // The only supported data type is `FixedSizeBinary(16)`.
+///     fn supports_data_type(&self, data_type: &DataType) -> Result<(), 
ArrowError> {
+///         match data_type {
+///             DataType::FixedSizeBinary(16) => Ok(()),
+///             data_type => Err(ArrowError::InvalidArgumentError(format!(
+///                 "Uuid data type mismatch, expected FixedSizeBinary(16), 
found {data_type}"
+///             ))),
+///         }
+///     }
+///
+///     // We should always check if the data type is supported before
+///     // constructing the extension type.
+///     fn try_new(data_type: &DataType, metadata: Self::Metadata) -> 
Result<Self, ArrowError> {
+///         let uuid = Self(metadata);
+///         uuid.supports_data_type(data_type)?;
+///         Ok(uuid)
+///     }
+/// }
+///
+/// // We can now construct the extension type.
+/// let uuid_v1 = Uuid(UuidVersion::V1);
+///
+/// // And add it to a field.
+/// let mut field =
+///     Field::new("", DataType::FixedSizeBinary(16), 
false).with_extension_type(uuid_v1);
+///
+/// // And extract it from this field.
+/// assert_eq!(field.try_extension_type::<Uuid>()?, uuid_v1);
+///
+/// // When we try to add this to a field with an unsupported data type we
+/// // get an error.
+/// let result = Field::new("", DataType::Null, 
false).try_with_extension_type(uuid_v1);
+/// assert!(result.is_err());
+/// # Ok(()) }
+/// ```
+///
+/// <https://arrow.apache.org/docs/format/Columnar.html#extension-types>
+///
+/// [`Field`]: crate::Field
+pub trait ExtensionType: Sized {
+    /// The name identifying this extension type.
+    ///
+    /// This is the string value that is used for the
+    /// [`EXTENSION_TYPE_NAME_KEY`] in the [`Field::metadata`] of a [`Field`]
+    /// to identify this extension type.
+    ///
+    /// We recommend that you use a “namespace”-style prefix for extension
+    /// type names to minimize the possibility of conflicts with multiple Arrow
+    /// readers and writers in the same application. For example, use
+    /// `myorg.name_of_type` instead of simply `name_of_type`.
+    ///
+    /// Extension names beginning with `arrow.` are reserved for canonical
+    /// extension types, they should not be used for third-party extension
+    /// types.
+    ///
+    /// Extension names are case-sensitive.
+    ///
+    /// [`Field`]: crate::Field
+    /// [`Field::metadata`]: crate::Field::metadata
+    const NAME: &'static str;
+
+    /// The metadata type of this extension type.
+    ///
+    /// If an extension type defines no metadata it should use `()` to indicate
+    /// this.
+    type Metadata;
+
+    /// Returns a reference to the metadata of this extension type, or `&()` if
+    /// if this extension type defines no metadata (`Self::Metadata=()`).
+    fn metadata(&self) -> &Self::Metadata;
+
+    /// Returns the serialized representation of the metadata of this extension
+    /// type, or `None` if this extension type defines no metadata
+    /// (`Self::Metadata=()`).
+    ///
+    /// This is string value that is used for the
+    /// [`EXTENSION_TYPE_METADATA_KEY`] in the [`Field::metadata`] of a
+    /// [`Field`].
+    ///
+    /// [`Field`]: crate::Field
+    /// [`Field::metadata`]: crate::Field::metadata
+    fn serialize_metadata(&self) -> Option<String>;

Review Comment:
   This is interesting because it requires the type to contain all information 
to construct its Arrow extension metadata. This is not how I've implemented it 
so far in geoarrow-rs. For example, I define a `Point` type as parameterized 
only by its "coordinate type" (i.e. `FixedSizeList` coordinates or `Struct[x: 
Float, y: Float]` coordinates) and its dimension (`XY`, `XYZ`, and so on). 
Separately, I store [more 
metadata](https://github.com/geoarrow/geoarrow-rs/blob/a2cbc7d501ea4fcd4d5e8927d09bde08ccc34124/rust/geoarrow/src/array/metadata.rs#L57-L81),
 and each array type [stores both pieces of 
data](https://github.com/geoarrow/geoarrow-rs/blob/a2cbc7d501ea4fcd4d5e8927d09bde08ccc34124/rust/geoarrow/src/array/point/array.rs#L27-L28).
   
   So the data type alone is not enough information to serialize the [full 
GeoArrow 
metadata](https://github.com/geoarrow/geoarrow/blob/dd06406d11f2a498b70b6c74f0deda62ec79067b/extension-types.md#extension-metadata),
 which also needs the [Coordinate Reference System 
(CRS)](https://en.wikipedia.org/wiki/Spatial_reference_system). It's a good 
question whether the CRS should be a part of the type here. Perhaps it should, 
but then we need to support CRS equality, which is quite tricky to do, so a 
single logical CRS can be represented in different ways.
   
   cc @paleolimbot , this might be a question for you.



##########
arrow-schema/src/extension/mod.rs:
##########
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Extension types.
+
+#[cfg(feature = "canonical_extension_types")]
+mod canonical;
+#[cfg(feature = "canonical_extension_types")]
+pub use canonical::*;
+
+use crate::{ArrowError, DataType};
+
+/// The metadata key for the string name identifying an [`ExtensionType`].
+pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";
+
+/// The metadata key for a serialized representation of the [`ExtensionType`]
+/// necessary to reconstruct the custom type.
+pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";
+
+/// Extension types.
+///
+/// User-defined “extension” types can be defined setting certain key value
+/// pairs in the [`Field`] metadata structure. These extension keys are:
+/// - [`EXTENSION_TYPE_NAME_KEY`]
+/// - [`EXTENSION_TYPE_METADATA_KEY`]
+///
+/// Canonical extension types support in this crate requires the
+/// `canonical_extension_types` feature.
+///
+/// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`]
+/// field.
+///
+/// # Example
+///
+/// The example below demonstrates how to implement this trait for a `Uuid`
+/// type. Note this is not the canonical extension type for `Uuid`, which does
+/// not include information about the `Uuid` version.
+///
+/// ```
+/// # use arrow_schema::ArrowError;
+/// # fn main() -> Result<(), ArrowError> {
+/// use arrow_schema::{DataType, extension::ExtensionType, Field};
+/// use std::{fmt, str::FromStr};
+///
+/// /// The different Uuid versions.
+/// #[derive(Clone, Copy, Debug, PartialEq)]
+/// enum UuidVersion {
+///     V1,
+///     V2,
+///     V3,
+///     V4,
+///     V5,
+///     V6,
+///     V7,
+///     V8,
+/// }
+///
+/// // We'll use `Display` to serialize.
+/// impl fmt::Display for UuidVersion {
+///     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+///         write!(
+///             f,
+///             "{}",
+///             match self {
+///                 Self::V1 => "V1",
+///                 Self::V2 => "V2",
+///                 Self::V3 => "V3",
+///                 Self::V4 => "V4",
+///                 Self::V5 => "V5",
+///                 Self::V6 => "V6",
+///                 Self::V7 => "V7",
+///                 Self::V8 => "V8",
+///             }
+///         )
+///     }
+/// }
+///
+/// // And `FromStr` to deserialize.
+/// impl FromStr for UuidVersion {
+///     type Err = ArrowError;
+///
+///     fn from_str(s: &str) -> Result<Self, Self::Err> {
+///         match s {
+///             "V1" => Ok(Self::V1),
+///             "V2" => Ok(Self::V2),
+///             "V3" => Ok(Self::V3),
+///             "V4" => Ok(Self::V4),
+///             "V5" => Ok(Self::V5),
+///             "V6" => Ok(Self::V6),
+///             "V7" => Ok(Self::V7),
+///             "V8" => Ok(Self::V8),
+///             _ => Err(ArrowError::ParseError("Invalid 
UuidVersion".to_owned())),
+///         }
+///     }
+/// }
+///
+/// /// This is the extension type, not the container for Uuid values. It
+/// /// stores the Uuid version (this is the metadata of this extension type).
+/// #[derive(Clone, Copy, Debug, PartialEq)]
+/// struct Uuid(UuidVersion);
+///
+/// impl ExtensionType for Uuid {
+///     // We use a namespace as suggested by the specification.
+///     const NAME: &'static str = "myorg.example.uuid";
+///
+///     // The metadata type is the Uuid version.
+///     type Metadata = UuidVersion;
+///
+///     // We just return a reference to the Uuid version.
+///     fn metadata(&self) -> &Self::Metadata {
+///         &self.0

Review Comment:
   My first go at implementing this trait I used:
   ```rs
   pub struct PointType(CoordType, Dimension);
   
   impl ExtensionType for PointType {
       const NAME: &'static str = "geoarrow.point";
   
       type Metadata = (CoordType, Dimension);
   
       fn metadata(&self) -> &Self::Metadata {
           &(self.0, self.1)
       }
   }
   ```
   But that won't work for this because `cannot return reference to temporary 
value`.
   
   So essentially `type Metadata` needs to be the same as whatever is defined 
inside the extension type struct, and that needs to be a single item, so that 
it can return `Self::Metadata` without creating any new items?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to