This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 8baaa8b08f Add `ExtensionType` trait and `CanonicalExtensionType` enum
(#5822)
8baaa8b08f is described below
commit 8baaa8b08f917f147d51e73f2e9325a449f43c79
Author: Matthijs Brobbel <[email protected]>
AuthorDate: Sun Feb 2 13:06:14 2025 +0100
Add `ExtensionType` trait and `CanonicalExtensionType` enum (#5822)
* Add `ExtensionType` for `uuid` and map to parquet logical type
* Fix docs
* Use an `ExtensionType` trait instead
* Fix clippy warnings
* Add type annotation to fix build
* Update `ExtensionType` trait to support more canonical extension types
* Add `Json` support to parquet, schema roundtrip not working yet
* Fix some clippy warnings
* Add explicit lifetime, resolving elided lifetime to static in assoc const
was added in 1.81
* Replace use of deprecated method, mark roundtrip as todo
* Add more tests and missing impls
* Add missing type annotations
* Fix doc warning
* Add the feature to the `arrow` crate and use underscores
* Update feature name in `parquet` crate
* Add experimental warning to `extensions` module docs
* Add a note about the associated metadata type
* Fix `Json` canonical extension type empty string metadata
* Simplify `Bool8::deserialize_metadata`
* Use `Empty` instead of `serde_json::Map` in `JsonMetadata`
* Use `map_or` instead of `is_some_and` (msrv)
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-array/src/array/list_view_array.rs | 8 +-
arrow-schema/Cargo.toml | 12 +-
arrow-schema/src/extension/canonical/bool8.rs | 142 ++++++
.../src/extension/canonical/fixed_shape_tensor.rs | 443 +++++++++++++++++
arrow-schema/src/extension/canonical/json.rs | 198 ++++++++
arrow-schema/src/extension/canonical/mod.rs | 142 ++++++
arrow-schema/src/extension/canonical/opaque.rs | 201 ++++++++
arrow-schema/src/extension/canonical/uuid.rs | 128 +++++
.../extension/canonical/variable_shape_tensor.rs | 551 +++++++++++++++++++++
arrow-schema/src/extension/mod.rs | 260 ++++++++++
arrow-schema/src/field.rs | 168 ++++++-
arrow-schema/src/lib.rs | 1 +
arrow-select/src/dictionary.rs | 2 +-
arrow/Cargo.toml | 1 +
arrow/README.md | 1 +
parquet/Cargo.toml | 2 +
parquet/src/arrow/schema/mod.rs | 102 +++-
17 files changed, 2350 insertions(+), 12 deletions(-)
diff --git a/arrow-array/src/array/list_view_array.rs
b/arrow-array/src/array/list_view_array.rs
index 195ac7e116..6118607bcb 100644
--- a/arrow-array/src/array/list_view_array.rs
+++ b/arrow-array/src/array/list_view_array.rs
@@ -895,8 +895,8 @@ mod tests {
.build()
.unwrap(),
);
- assert_eq!(string.value_offsets(), &[]);
- assert_eq!(string.value_sizes(), &[]);
+ assert_eq!(string.value_offsets(), &[] as &[i32; 0]);
+ assert_eq!(string.value_sizes(), &[] as &[i32; 0]);
let string = LargeListViewArray::from(
ArrayData::builder(DataType::LargeListView(f))
@@ -906,8 +906,8 @@ mod tests {
.unwrap(),
);
assert_eq!(string.len(), 0);
- assert_eq!(string.value_offsets(), &[]);
- assert_eq!(string.value_sizes(), &[]);
+ assert_eq!(string.value_offsets(), &[] as &[i64; 0]);
+ assert_eq!(string.value_sizes(), &[] as &[i64; 0]);
}
#[test]
diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml
index d1bcf046b7..ffea42db66 100644
--- a/arrow-schema/Cargo.toml
+++ b/arrow-schema/Cargo.toml
@@ -34,21 +34,27 @@ path = "src/lib.rs"
bench = false
[dependencies]
-serde = { version = "1.0", default-features = false, features = ["derive",
"std", "rc"], optional = true }
+serde = { version = "1.0", default-features = false, features = [
+ "derive",
+ "std",
+ "rc",
+], optional = true }
bitflags = { version = "2.0.0", default-features = false, optional = true }
+serde_json = { version = "1.0", optional = true }
[features]
+canonical_extension_types = ["dep:serde", "dep:serde_json"]
# Enable ffi support
ffi = ["bitflags"]
+serde = ["dep:serde"]
[package.metadata.docs.rs]
features = ["ffi"]
[dev-dependencies]
-serde_json = "1.0"
bincode = { version = "1.3.3", default-features = false }
criterion = { version = "0.5", default-features = false }
[[bench]]
name = "ffi"
-harness = false
\ No newline at end of file
+harness = false
diff --git a/arrow-schema/src/extension/canonical/bool8.rs
b/arrow-schema/src/extension/canonical/bool8.rs
new file mode 100644
index 0000000000..3f6c50cb3e
--- /dev/null
+++ b/arrow-schema/src/extension/canonical/bool8.rs
@@ -0,0 +1,142 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! 8-bit Boolean
+//!
+//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
+
+use crate::{extension::ExtensionType, ArrowError, DataType};
+
+/// The extension type for `8-bit Boolean`.
+///
+/// Extension name: `arrow.bool8`.
+///
+/// The storage type of the extension is `Int8` where:
+/// - false is denoted by the value 0.
+/// - true can be specified using any non-zero value. Preferably 1.
+///
+/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
+#[derive(Debug, Default, Clone, Copy, PartialEq)]
+pub struct Bool8;
+
+impl ExtensionType for Bool8 {
+ const NAME: &'static str = "arrow.bool8";
+
+ type Metadata = &'static str;
+
+ fn metadata(&self) -> &Self::Metadata {
+ &""
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ Some(String::default())
+ }
+
+ fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ if metadata.map_or(false, str::is_empty) {
+ Ok("")
+ } else {
+ Err(ArrowError::InvalidArgumentError(
+ "Bool8 extension type expects an empty string as
metadata".to_owned(),
+ ))
+ }
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ match data_type {
+ DataType::Int8 => Ok(()),
+ data_type => Err(ArrowError::InvalidArgumentError(format!(
+ "Bool8 data type mismatch, expected Int8, found {data_type}"
+ ))),
+ }
+ }
+
+ fn try_new(data_type: &DataType, _metadata: Self::Metadata) ->
Result<Self, ArrowError> {
+ Self.supports_data_type(data_type).map(|_| Self)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ #[cfg(feature = "canonical_extension_types")]
+ use crate::extension::CanonicalExtensionType;
+ use crate::{
+ extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
+ Field,
+ };
+
+ use super::*;
+
+ #[test]
+ fn valid() -> Result<(), ArrowError> {
+ let mut field = Field::new("", DataType::Int8, false);
+ field.try_with_extension_type(Bool8)?;
+ field.try_extension_type::<Bool8>()?;
+ #[cfg(feature = "canonical_extension_types")]
+ assert_eq!(
+ field.try_canonical_extension_type()?,
+ CanonicalExtensionType::Bool8(Bool8)
+ );
+
+ Ok(())
+ }
+
+ #[test]
+ #[should_panic(expected = "Field extension type name missing")]
+ fn missing_name() {
+ let field = Field::new("", DataType::Int8, false).with_metadata(
+ [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned())]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Bool8>();
+ }
+
+ #[test]
+ #[should_panic(expected = "expected Int8, found Boolean")]
+ fn invalid_type() {
+ Field::new("", DataType::Boolean, false).with_extension_type(Bool8);
+ }
+
+ #[test]
+ #[should_panic(expected = "Bool8 extension type expects an empty string as
metadata")]
+ fn missing_metadata() {
+ let field = Field::new("", DataType::Int8, false).with_metadata(
+ [(EXTENSION_TYPE_NAME_KEY.to_owned(), Bool8::NAME.to_owned())]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Bool8>();
+ }
+
+ #[test]
+ #[should_panic(expected = "Bool8 extension type expects an empty string as
metadata")]
+ fn invalid_metadata() {
+ let field = Field::new("", DataType::Int8, false).with_metadata(
+ [
+ (EXTENSION_TYPE_NAME_KEY.to_owned(), Bool8::NAME.to_owned()),
+ (
+ EXTENSION_TYPE_METADATA_KEY.to_owned(),
+ "non-empty".to_owned(),
+ ),
+ ]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Bool8>();
+ }
+}
diff --git a/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs
b/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs
new file mode 100644
index 0000000000..6fe94fba78
--- /dev/null
+++ b/arrow-schema/src/extension/canonical/fixed_shape_tensor.rs
@@ -0,0 +1,443 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! FixedShapeTensor
+//!
+//!
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#fixed-shape-tensor>
+
+use serde::{Deserialize, Serialize};
+
+use crate::{extension::ExtensionType, ArrowError, DataType};
+
+/// The extension type for fixed shape tensor.
+///
+/// Extension name: `arrow.fixed_shape_tensor`.
+///
+/// The storage type of the extension: `FixedSizeList` where:
+/// - `value_type` is the data type of individual tensor elements.
+/// - `list_size` is the product of all the elements in tensor shape.
+///
+/// Extension type parameters:
+/// - `value_type`: the Arrow data type of individual tensor elements.
+/// - `shape`: the physical shape of the contained tensors as an array.
+///
+/// Optional parameters describing the logical layout:
+/// - `dim_names`: explicit names to tensor dimensions as an array. The
+/// length of it should be equal to the shape length and equal to the
+/// number of dimensions.
+/// `dim_names` can be used if the dimensions have
+/// well-known names and they map to the physical layout (row-major).
+/// - `permutation`: indices of the desired ordering of the original
+/// dimensions, defined as an array.
+/// The indices contain a permutation of the values `[0, 1, .., N-1]`
+/// where `N` is the number of dimensions. The permutation indicates
+/// which dimension of the logical layout corresponds to which dimension
+/// of the physical tensor (the i-th dimension of the logical view
+/// corresponds to the dimension with number `permutations[i]` of the
+/// physical tensor).
+/// Permutation can be useful in case the logical order of the tensor is
+/// a permutation of the physical order (row-major).
+/// When logical and physical layout are equal, the permutation will
+/// always be `([0, 1, .., N-1])` and can therefore be left out.
+///
+/// Description of the serialization:
+/// The metadata must be a valid JSON object including shape of the
+/// contained tensors as an array with key `shape` plus optional
+/// dimension names with keys `dim_names` and ordering of the
+/// dimensions with key `permutation`.
+/// Example: `{ "shape": [2, 5]}`
+/// Example with `dim_names` metadata for NCHW ordered data:
+/// `{ "shape": [100, 200, 500], "dim_names": ["C", "H", "W"]}`
+/// Example of permuted 3-dimensional tensor:
+/// `{ "shape": [100, 200, 500], "permutation": [2, 0, 1]}`
+///
+/// This is the physical layout shape and the shape of the logical layout
+/// would in this case be `[500, 100, 200]`.
+///
+///
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#fixed-shape-tensor>
+#[derive(Debug, Clone, PartialEq)]
+pub struct FixedShapeTensor {
+ /// The data type of individual tensor elements.
+ value_type: DataType,
+
+ /// The metadata of this extension type.
+ metadata: FixedShapeTensorMetadata,
+}
+
+impl FixedShapeTensor {
+ /// Returns a new fixed shape tensor extension type.
+ ///
+ /// # Error
+ ///
+ /// Return an error if the provided dimension names or permutations are
+ /// invalid.
+ pub fn try_new(
+ value_type: DataType,
+ shape: impl IntoIterator<Item = usize>,
+ dimension_names: Option<Vec<String>>,
+ permutations: Option<Vec<usize>>,
+ ) -> Result<Self, ArrowError> {
+ // TODO: are all data types are suitable as value type?
+ FixedShapeTensorMetadata::try_new(shape, dimension_names,
permutations).map(|metadata| {
+ Self {
+ value_type,
+ metadata,
+ }
+ })
+ }
+
+ /// Returns the value type of the individual tensor elements.
+ pub fn value_type(&self) -> &DataType {
+ &self.value_type
+ }
+
+ /// Returns the product of all the elements in tensor shape.
+ pub fn list_size(&self) -> usize {
+ self.metadata.list_size()
+ }
+
+ /// Returns the number of dimensions in this fixed shape tensor.
+ pub fn dimensions(&self) -> usize {
+ self.metadata.dimensions()
+ }
+
+ /// Returns the names of the dimensions in this fixed shape tensor, if
+ /// set.
+ pub fn dimension_names(&self) -> Option<&[String]> {
+ self.metadata.dimension_names()
+ }
+
+ /// Returns the indices of the desired ordering of the original
+ /// dimensions, if set.
+ pub fn permutations(&self) -> Option<&[usize]> {
+ self.metadata.permutations()
+ }
+}
+
+/// Extension type metadata for [`FixedShapeTensor`].
+#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
+pub struct FixedShapeTensorMetadata {
+ /// The physical shape of the contained tensors.
+ shape: Vec<usize>,
+
+ /// Explicit names to tensor dimensions.
+ dim_names: Option<Vec<String>>,
+
+ /// Indices of the desired ordering of the original dimensions.
+ permutations: Option<Vec<usize>>,
+}
+
+impl FixedShapeTensorMetadata {
+ /// Returns metadata for a fixed shape tensor extension type.
+ ///
+ /// # Error
+ ///
+ /// Return an error if the provided dimension names or permutations are
+ /// invalid.
+ pub fn try_new(
+ shape: impl IntoIterator<Item = usize>,
+ dimension_names: Option<Vec<String>>,
+ permutations: Option<Vec<usize>>,
+ ) -> Result<Self, ArrowError> {
+ let shape = shape.into_iter().collect::<Vec<_>>();
+ let dimensions = shape.len();
+
+ let dim_names = dimension_names.map(|dimension_names| {
+ if dimension_names.len() != dimensions {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "FixedShapeTensor dimension names size mismatch, expected
{dimensions}, found {}", dimension_names.len()
+ )))
+ } else {
+ Ok(dimension_names)
+ }
+ }).transpose()?;
+
+ let permutations = permutations
+ .map(|permutations| {
+ if permutations.len() != dimensions {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "FixedShapeTensor permutations size mismatch, expected
{dimensions}, found {}",
+ permutations.len()
+ )))
+ } else {
+ let mut sorted_permutations = permutations.clone();
+ sorted_permutations.sort_unstable();
+ if (0..dimensions).zip(sorted_permutations).any(|(a, b)| a
!= b) {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "FixedShapeTensor permutations invalid, expected a
permutation of [0, 1, .., N-1], where N is the number of dimensions:
{dimensions}"
+ )))
+ } else {
+ Ok(permutations)
+ }
+ }
+ })
+ .transpose()?;
+
+ Ok(Self {
+ shape,
+ dim_names,
+ permutations,
+ })
+ }
+
+ /// Returns the product of all the elements in tensor shape.
+ pub fn list_size(&self) -> usize {
+ self.shape.iter().product()
+ }
+
+ /// Returns the number of dimensions in this fixed shape tensor.
+ pub fn dimensions(&self) -> usize {
+ self.shape.len()
+ }
+
+ /// Returns the names of the dimensions in this fixed shape tensor, if
+ /// set.
+ pub fn dimension_names(&self) -> Option<&[String]> {
+ self.dim_names.as_ref().map(AsRef::as_ref)
+ }
+
+ /// Returns the indices of the desired ordering of the original
+ /// dimensions, if set.
+ pub fn permutations(&self) -> Option<&[usize]> {
+ self.permutations.as_ref().map(AsRef::as_ref)
+ }
+}
+
+impl ExtensionType for FixedShapeTensor {
+ const NAME: &'static str = "arrow.fixed_shape_tensor";
+
+ type Metadata = FixedShapeTensorMetadata;
+
+ fn metadata(&self) -> &Self::Metadata {
+ &self.metadata
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ Some(serde_json::to_string(&self.metadata).expect("metadata
serialization"))
+ }
+
+ fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ metadata.map_or_else(
+ || {
+ Err(ArrowError::InvalidArgumentError(
+ "FixedShapeTensor extension types requires
metadata".to_owned(),
+ ))
+ },
+ |value| {
+ serde_json::from_str(value).map_err(|e| {
+ ArrowError::InvalidArgumentError(format!(
+ "FixedShapeTensor metadata deserialization failed: {e}"
+ ))
+ })
+ },
+ )
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ let expected = DataType::new_fixed_size_list(
+ self.value_type.clone(),
+ i32::try_from(self.list_size()).expect("overflow"),
+ false,
+ );
+ data_type
+ .equals_datatype(&expected)
+ .then_some(())
+ .ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "FixedShapeTensor data type mismatch, expected {expected},
found {data_type}"
+ ))
+ })
+ }
+
+ fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self,
ArrowError> {
+ match data_type {
+ DataType::FixedSizeList(field, list_size) if !field.is_nullable()
=> {
+ // Make sure the metadata is valid.
+ let metadata = FixedShapeTensorMetadata::try_new(
+ metadata.shape,
+ metadata.dim_names,
+ metadata.permutations,
+ )?;
+ // Make sure it is compatible with this data type.
+ let expected_size =
i32::try_from(metadata.list_size()).expect("overflow");
+ if *list_size != expected_size {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "FixedShapeTensor list size mismatch, expected
{expected_size} (metadata), found {list_size} (data type)"
+ )))
+ } else {
+ Ok(Self {
+ value_type: field.data_type().clone(),
+ metadata,
+ })
+ }
+ }
+ data_type => Err(ArrowError::InvalidArgumentError(format!(
+ "FixedShapeTensor data type mismatch, expected FixedSizeList
with non-nullable field, found {data_type}"
+ ))),
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ #[cfg(feature = "canonical_extension_types")]
+ use crate::extension::CanonicalExtensionType;
+ use crate::{
+ extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
+ Field,
+ };
+
+ use super::*;
+
+ #[test]
+ fn valid() -> Result<(), ArrowError> {
+ let fixed_shape_tensor = FixedShapeTensor::try_new(
+ DataType::Float32,
+ [100, 200, 500],
+ Some(vec!["C".to_owned(), "H".to_owned(), "W".to_owned()]),
+ Some(vec![2, 0, 1]),
+ )?;
+ let mut field = Field::new_fixed_size_list(
+ "",
+ Field::new("", DataType::Float32, false),
+ i32::try_from(fixed_shape_tensor.list_size()).expect("overflow"),
+ false,
+ );
+ field.try_with_extension_type(fixed_shape_tensor.clone())?;
+ assert_eq!(
+ field.try_extension_type::<FixedShapeTensor>()?,
+ fixed_shape_tensor
+ );
+ #[cfg(feature = "canonical_extension_types")]
+ assert_eq!(
+ field.try_canonical_extension_type()?,
+ CanonicalExtensionType::FixedShapeTensor(fixed_shape_tensor)
+ );
+ Ok(())
+ }
+
+ #[test]
+ #[should_panic(expected = "Field extension type name missing")]
+ fn missing_name() {
+ let field =
+ Field::new_fixed_size_list("", Field::new("", DataType::Float32,
false), 3, false)
+ .with_metadata(
+ [(
+ EXTENSION_TYPE_METADATA_KEY.to_owned(),
+ r#"{ "shape": [100, 200, 500], }"#.to_owned(),
+ )]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<FixedShapeTensor>();
+ }
+
+ #[test]
+ #[should_panic(expected = "FixedShapeTensor data type mismatch, expected
FixedSizeList")]
+ fn invalid_type() {
+ let fixed_shape_tensor =
+ FixedShapeTensor::try_new(DataType::Int32, [100, 200, 500], None,
None).unwrap();
+ let field = Field::new_fixed_size_list(
+ "",
+ Field::new("", DataType::Float32, false),
+ i32::try_from(fixed_shape_tensor.list_size()).expect("overflow"),
+ false,
+ );
+ field.with_extension_type(fixed_shape_tensor);
+ }
+
+ #[test]
+ #[should_panic(expected = "FixedShapeTensor extension types requires
metadata")]
+ fn missing_metadata() {
+ let field =
+ Field::new_fixed_size_list("", Field::new("", DataType::Float32,
false), 3, false)
+ .with_metadata(
+ [(
+ EXTENSION_TYPE_NAME_KEY.to_owned(),
+ FixedShapeTensor::NAME.to_owned(),
+ )]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<FixedShapeTensor>();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "FixedShapeTensor metadata deserialization failed: missing
field `shape`"
+ )]
+ fn invalid_metadata() {
+ let fixed_shape_tensor =
+ FixedShapeTensor::try_new(DataType::Float32, [100, 200, 500],
None, None).unwrap();
+ let field = Field::new_fixed_size_list(
+ "",
+ Field::new("", DataType::Float32, false),
+ i32::try_from(fixed_shape_tensor.list_size()).expect("overflow"),
+ false,
+ )
+ .with_metadata(
+ [
+ (
+ EXTENSION_TYPE_NAME_KEY.to_owned(),
+ FixedShapeTensor::NAME.to_owned(),
+ ),
+ (
+ EXTENSION_TYPE_METADATA_KEY.to_owned(),
+ r#"{ "not-shape": [] }"#.to_owned(),
+ ),
+ ]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<FixedShapeTensor>();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "FixedShapeTensor dimension names size mismatch, expected
3, found 2"
+ )]
+ fn invalid_metadata_dimension_names() {
+ FixedShapeTensor::try_new(
+ DataType::Float32,
+ [100, 200, 500],
+ Some(vec!["a".to_owned(), "b".to_owned()]),
+ None,
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "FixedShapeTensor permutations size mismatch,
expected 3, found 2")]
+ fn invalid_metadata_permutations_len() {
+ FixedShapeTensor::try_new(DataType::Float32, [100, 200, 500], None,
Some(vec![1, 0]))
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "FixedShapeTensor permutations invalid, expected a
permutation of [0, 1, .., N-1], where N is the number of dimensions: 3"
+ )]
+ fn invalid_metadata_permutations_values() {
+ FixedShapeTensor::try_new(
+ DataType::Float32,
+ [100, 200, 500],
+ None,
+ Some(vec![4, 3, 2]),
+ )
+ .unwrap();
+ }
+}
diff --git a/arrow-schema/src/extension/canonical/json.rs
b/arrow-schema/src/extension/canonical/json.rs
new file mode 100644
index 0000000000..0a8a1ae7e0
--- /dev/null
+++ b/arrow-schema/src/extension/canonical/json.rs
@@ -0,0 +1,198 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! JSON
+//!
+//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#json>
+
+use serde::{Deserialize, Serialize};
+
+use crate::{extension::ExtensionType, ArrowError, DataType};
+
+/// The extension type for `JSON`.
+///
+/// Extension name: `arrow.json`.
+///
+/// The storage type of this extension is `String` or `LargeString` or
+/// `StringView`. Only UTF-8 encoded JSON as specified in
[rfc8259](https://datatracker.ietf.org/doc/html/rfc8259)
+/// is supported.
+///
+/// This type does not have any parameters.
+///
+/// Metadata is either an empty string or a JSON string with an empty
+/// object. In the future, additional fields may be added, but they are not
+/// required to interpret the array.
+///
+/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#json>
+#[derive(Debug, Clone, Default, PartialEq)]
+pub struct Json(JsonMetadata);
+
+/// Empty object
+#[derive(Debug, Clone, Copy, PartialEq, Deserialize, Serialize)]
+#[serde(deny_unknown_fields)]
+struct Empty {}
+
+/// Extension type metadata for [`Json`].
+#[derive(Debug, Default, Clone, PartialEq)]
+pub struct JsonMetadata(Option<Empty>);
+
+impl ExtensionType for Json {
+ const NAME: &'static str = "arrow.json";
+
+ type Metadata = JsonMetadata;
+
+ fn metadata(&self) -> &Self::Metadata {
+ &self.0
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ Some(
+ self.metadata()
+ .0
+ .as_ref()
+ .map(serde_json::to_string)
+ .map(Result::unwrap)
+ .unwrap_or_else(|| "".to_owned()),
+ )
+ }
+
+ fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ const ERR: &str = "Json extension type metadata is either an empty
string or a JSON string with an empty object";
+ metadata
+ .map_or_else(
+ || Err(ArrowError::InvalidArgumentError(ERR.to_owned())),
+ |metadata| {
+ match metadata {
+ // Empty string
+ "" => Ok(None),
+ value => serde_json::from_str::<Empty>(value)
+ .map(Option::Some)
+ .map_err(|_|
ArrowError::InvalidArgumentError(ERR.to_owned())),
+ }
+ },
+ )
+ .map(JsonMetadata)
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ match data_type {
+ DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View =>
Ok(()),
+ data_type => Err(ArrowError::InvalidArgumentError(format!(
+ "Json data type mismatch, expected one of Utf8, LargeUtf8,
Utf8View, found {data_type}"
+ ))),
+ }
+ }
+
+ fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self,
ArrowError> {
+ let json = Self(metadata);
+ json.supports_data_type(data_type)?;
+ Ok(json)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ #[cfg(feature = "canonical_extension_types")]
+ use crate::extension::CanonicalExtensionType;
+ use crate::{
+ extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
+ Field,
+ };
+
+ use super::*;
+
+ #[test]
+ fn valid() -> Result<(), ArrowError> {
+ let mut field = Field::new("", DataType::Utf8, false);
+ field.try_with_extension_type(Json::default())?;
+ assert_eq!(
+ field.metadata().get(EXTENSION_TYPE_METADATA_KEY),
+ Some(&"".to_owned())
+ );
+ assert_eq!(
+ field.try_extension_type::<Json>()?,
+ Json(JsonMetadata(None))
+ );
+
+ let mut field = Field::new("", DataType::LargeUtf8, false);
+ field.try_with_extension_type(Json(JsonMetadata(Some(Empty {}))))?;
+ assert_eq!(
+ field.metadata().get(EXTENSION_TYPE_METADATA_KEY),
+ Some(&"{}".to_owned())
+ );
+ assert_eq!(
+ field.try_extension_type::<Json>()?,
+ Json(JsonMetadata(Some(Empty {})))
+ );
+
+ let mut field = Field::new("", DataType::Utf8View, false);
+ field.try_with_extension_type(Json::default())?;
+ field.try_extension_type::<Json>()?;
+ #[cfg(feature = "canonical_extension_types")]
+ assert_eq!(
+ field.try_canonical_extension_type()?,
+ CanonicalExtensionType::Json(Json::default())
+ );
+ Ok(())
+ }
+
+ #[test]
+ #[should_panic(expected = "Field extension type name missing")]
+ fn missing_name() {
+ let field = Field::new("", DataType::Int8, false).with_metadata(
+ [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "{}".to_owned())]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Json>();
+ }
+
+ #[test]
+ #[should_panic(expected = "expected one of Utf8, LargeUtf8, Utf8View,
found Null")]
+ fn invalid_type() {
+ Field::new("", DataType::Null,
false).with_extension_type(Json::default());
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Json extension type metadata is either an empty string or
a JSON string with an empty object"
+ )]
+ fn invalid_metadata() {
+ let field = Field::new("", DataType::Utf8, false).with_metadata(
+ [
+ (EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned()),
+ (EXTENSION_TYPE_METADATA_KEY.to_owned(), "1234".to_owned()),
+ ]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Json>();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Json extension type metadata is either an empty string or
a JSON string with an empty object"
+ )]
+ fn missing_metadata() {
+ let field = Field::new("", DataType::LargeUtf8, false).with_metadata(
+ [(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned())]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Json>();
+ }
+}
diff --git a/arrow-schema/src/extension/canonical/mod.rs
b/arrow-schema/src/extension/canonical/mod.rs
new file mode 100644
index 0000000000..3d66299ca8
--- /dev/null
+++ b/arrow-schema/src/extension/canonical/mod.rs
@@ -0,0 +1,142 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Canonical extension types.
+//!
+//! The Arrow columnar format allows defining extension types so as to extend
+//! standard Arrow data types with custom semantics. Often these semantics will
+//! be specific to a system or application. However, it is beneficial to share
+//! the definitions of well-known extension types so as to improve
+//! interoperability between different systems integrating Arrow columnar data.
+//!
+//!
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#format-canonical-extensions>
+
+mod bool8;
+pub use bool8::Bool8;
+mod fixed_shape_tensor;
+pub use fixed_shape_tensor::{FixedShapeTensor, FixedShapeTensorMetadata};
+mod json;
+pub use json::{Json, JsonMetadata};
+mod opaque;
+pub use opaque::{Opaque, OpaqueMetadata};
+mod uuid;
+pub use uuid::Uuid;
+mod variable_shape_tensor;
+pub use variable_shape_tensor::{VariableShapeTensor,
VariableShapeTensorMetadata};
+
+use crate::{ArrowError, Field};
+
+use super::ExtensionType;
+
+/// Canonical extension types.
+///
+///
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#format-canonical-extensions>
+#[non_exhaustive]
+#[derive(Debug, Clone, PartialEq)]
+pub enum CanonicalExtensionType {
+ /// The extension type for `FixedShapeTensor`.
+ ///
+ ///
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#fixed-shape-tensor>
+ FixedShapeTensor(FixedShapeTensor),
+
+ /// The extension type for `VariableShapeTensor`.
+ ///
+ ///
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#variable-shape-tensor>
+ VariableShapeTensor(VariableShapeTensor),
+
+ /// The extension type for 'JSON'.
+ ///
+ /// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#json>
+ Json(Json),
+
+ /// The extension type for `UUID`.
+ ///
+ /// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid>
+ Uuid(Uuid),
+
+ /// The extension type for `Opaque`.
+ ///
+ /// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#opaque>
+ Opaque(Opaque),
+
+ /// The extension type for `Bool8`.
+ ///
+ ///
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
+ Bool8(Bool8),
+}
+
+impl TryFrom<&Field> for CanonicalExtensionType {
+ type Error = ArrowError;
+
+ fn try_from(value: &Field) -> Result<Self, Self::Error> {
+ // Canonical extension type names start with `arrow.`
+ match value.extension_type_name() {
+ // An extension type name with an `arrow.` prefix
+ Some(name) if name.starts_with("arrow.") => match name {
+ FixedShapeTensor::NAME =>
value.try_extension_type::<FixedShapeTensor>().map(Into::into),
+ VariableShapeTensor::NAME =>
value.try_extension_type::<VariableShapeTensor>().map(Into::into),
+ Json::NAME =>
value.try_extension_type::<Json>().map(Into::into),
+ Uuid::NAME =>
value.try_extension_type::<Uuid>().map(Into::into),
+ Opaque::NAME =>
value.try_extension_type::<Opaque>().map(Into::into),
+ Bool8::NAME =>
value.try_extension_type::<Bool8>().map(Into::into),
+ _ => Err(ArrowError::InvalidArgumentError(format!("Unsupported
canonical extension type: {name}"))),
+ },
+ // Name missing the expected prefix
+ Some(name) => Err(ArrowError::InvalidArgumentError(format!(
+ "Field extension type name mismatch, expected a name with an
`arrow.` prefix, found {name}"
+ ))),
+ // Name missing
+ None => Err(ArrowError::InvalidArgumentError("Field extension type
name missing".to_owned())),
+ }
+ }
+}
+
+impl From<FixedShapeTensor> for CanonicalExtensionType {
+ fn from(value: FixedShapeTensor) -> Self {
+ CanonicalExtensionType::FixedShapeTensor(value)
+ }
+}
+
+impl From<VariableShapeTensor> for CanonicalExtensionType {
+ fn from(value: VariableShapeTensor) -> Self {
+ CanonicalExtensionType::VariableShapeTensor(value)
+ }
+}
+
+impl From<Json> for CanonicalExtensionType {
+ fn from(value: Json) -> Self {
+ CanonicalExtensionType::Json(value)
+ }
+}
+
+impl From<Uuid> for CanonicalExtensionType {
+ fn from(value: Uuid) -> Self {
+ CanonicalExtensionType::Uuid(value)
+ }
+}
+
+impl From<Opaque> for CanonicalExtensionType {
+ fn from(value: Opaque) -> Self {
+ CanonicalExtensionType::Opaque(value)
+ }
+}
+
+impl From<Bool8> for CanonicalExtensionType {
+ fn from(value: Bool8) -> Self {
+ CanonicalExtensionType::Bool8(value)
+ }
+}
diff --git a/arrow-schema/src/extension/canonical/opaque.rs
b/arrow-schema/src/extension/canonical/opaque.rs
new file mode 100644
index 0000000000..1db7265cfd
--- /dev/null
+++ b/arrow-schema/src/extension/canonical/opaque.rs
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Opaque
+//!
+//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#opaque>
+
+use serde::{Deserialize, Serialize};
+
+use crate::{extension::ExtensionType, ArrowError, DataType};
+
+/// The extension type for `Opaque`.
+///
+/// Extension name: `arrow.opaque`.
+///
+/// Opaque represents a type that an Arrow-based system received from an
+/// external (often non-Arrow) system, but that it cannot interpret. In this
+/// case, it can pass on Opaque to its clients to at least show that a field
+/// exists and preserve metadata about the type from the other system.
+///
+/// The storage type of this extension is any type. If there is no underlying
+/// data, the storage type should be Null.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Opaque(OpaqueMetadata);
+
+impl Opaque {
+ /// Returns a new `Opaque` extension type.
+ pub fn new(type_name: impl Into<String>, vendor_name: impl Into<String>)
-> Self {
+ Self(OpaqueMetadata::new(type_name, vendor_name))
+ }
+
+ /// Returns the name of the unknown type in the external system.
+ pub fn type_name(&self) -> &str {
+ self.0.type_name()
+ }
+
+ /// Returns the name of the external system.
+ pub fn vendor_name(&self) -> &str {
+ self.0.vendor_name()
+ }
+}
+
+impl From<OpaqueMetadata> for Opaque {
+ fn from(value: OpaqueMetadata) -> Self {
+ Self(value)
+ }
+}
+
+/// Extension type metadata for [`Opaque`].
+#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
+pub struct OpaqueMetadata {
+ /// Name of the unknown type in the external system.
+ type_name: String,
+
+ /// Name of the external system.
+ vendor_name: String,
+}
+
+impl OpaqueMetadata {
+ /// Returns a new `OpaqueMetadata`.
+ pub fn new(type_name: impl Into<String>, vendor_name: impl Into<String>)
-> Self {
+ OpaqueMetadata {
+ type_name: type_name.into(),
+ vendor_name: vendor_name.into(),
+ }
+ }
+
+ /// Returns the name of the unknown type in the external system.
+ pub fn type_name(&self) -> &str {
+ &self.type_name
+ }
+
+ /// Returns the name of the external system.
+ pub fn vendor_name(&self) -> &str {
+ &self.vendor_name
+ }
+}
+
+impl ExtensionType for Opaque {
+ const NAME: &'static str = "arrow.opaque";
+
+ type Metadata = OpaqueMetadata;
+
+ fn metadata(&self) -> &Self::Metadata {
+ &self.0
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ Some(serde_json::to_string(self.metadata()).expect("metadata
serialization"))
+ }
+
+ fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ metadata.map_or_else(
+ || {
+ Err(ArrowError::InvalidArgumentError(
+ "Opaque extension types requires metadata".to_owned(),
+ ))
+ },
+ |value| {
+ serde_json::from_str(value).map_err(|e| {
+ ArrowError::InvalidArgumentError(format!(
+ "Opaque metadata deserialization failed: {e}"
+ ))
+ })
+ },
+ )
+ }
+
+ fn supports_data_type(&self, _data_type: &DataType) -> Result<(),
ArrowError> {
+ // Any type
+ Ok(())
+ }
+
+ fn try_new(_data_type: &DataType, metadata: Self::Metadata) ->
Result<Self, ArrowError> {
+ Ok(Self::from(metadata))
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ #[cfg(feature = "canonical_extension_types")]
+ use crate::extension::CanonicalExtensionType;
+ use crate::{
+ extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
+ Field,
+ };
+
+ use super::*;
+
+ #[test]
+ fn valid() -> Result<(), ArrowError> {
+ let opaque = Opaque::new("name", "vendor");
+ let mut field = Field::new("", DataType::Null, false);
+ field.try_with_extension_type(opaque.clone())?;
+ assert_eq!(field.try_extension_type::<Opaque>()?, opaque);
+ #[cfg(feature = "canonical_extension_types")]
+ assert_eq!(
+ field.try_canonical_extension_type()?,
+ CanonicalExtensionType::Opaque(opaque)
+ );
+ Ok(())
+ }
+
+ #[test]
+ #[should_panic(expected = "Field extension type name missing")]
+ fn missing_name() {
+ let field = Field::new("", DataType::Null, false).with_metadata(
+ [(
+ EXTENSION_TYPE_METADATA_KEY.to_owned(),
+ r#"{ "type_name": "type", "vendor_name": "vendor"
}"#.to_owned(),
+ )]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Opaque>();
+ }
+
+ #[test]
+ #[should_panic(expected = "Opaque extension types requires metadata")]
+ fn missing_metadata() {
+ let field = Field::new("", DataType::Null, false).with_metadata(
+ [(EXTENSION_TYPE_NAME_KEY.to_owned(), Opaque::NAME.to_owned())]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Opaque>();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Opaque metadata deserialization failed: missing field
`vendor_name`"
+ )]
+ fn invalid_metadata() {
+ let field = Field::new("", DataType::Null, false).with_metadata(
+ [
+ (EXTENSION_TYPE_NAME_KEY.to_owned(), Opaque::NAME.to_owned()),
+ (
+ EXTENSION_TYPE_METADATA_KEY.to_owned(),
+ r#"{ "type_name": "no-vendor" }"#.to_owned(),
+ ),
+ ]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Opaque>();
+ }
+}
diff --git a/arrow-schema/src/extension/canonical/uuid.rs
b/arrow-schema/src/extension/canonical/uuid.rs
new file mode 100644
index 0000000000..8b2e71b7b5
--- /dev/null
+++ b/arrow-schema/src/extension/canonical/uuid.rs
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! UUID
+//!
+//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid>
+
+use crate::{extension::ExtensionType, ArrowError, DataType};
+
+/// The extension type for `UUID`.
+///
+/// Extension name: `arrow.uuid`.
+///
+/// The storage type of the extension is `FixedSizeBinary` with a length of
+/// 16 bytes.
+///
+/// Note:
+/// A specific UUID version is not required or guaranteed. This extension
+/// represents UUIDs as `FixedSizeBinary(16)` with big-endian notation and
+/// does not interpret the bytes in any way.
+///
+/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid>
+#[derive(Debug, Default, Clone, Copy, PartialEq)]
+pub struct Uuid;
+
+impl ExtensionType for Uuid {
+ const NAME: &'static str = "arrow.uuid";
+
+ type Metadata = ();
+
+ fn metadata(&self) -> &Self::Metadata {
+ &()
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ None
+ }
+
+ fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ metadata.map_or_else(
+ || Ok(()),
+ |_| {
+ Err(ArrowError::InvalidArgumentError(
+ "Uuid extension type expects no metadata".to_owned(),
+ ))
+ },
+ )
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ match data_type {
+ DataType::FixedSizeBinary(16) => Ok(()),
+ data_type => Err(ArrowError::InvalidArgumentError(format!(
+ "Uuid data type mismatch, expected FixedSizeBinary(16), found
{data_type}"
+ ))),
+ }
+ }
+
+ fn try_new(data_type: &DataType, _metadata: Self::Metadata) ->
Result<Self, ArrowError> {
+ Self.supports_data_type(data_type).map(|_| Self)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ #[cfg(feature = "canonical_extension_types")]
+ use crate::extension::CanonicalExtensionType;
+ use crate::{
+ extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
+ Field,
+ };
+
+ use super::*;
+
+ #[test]
+ fn valid() -> Result<(), ArrowError> {
+ let mut field = Field::new("", DataType::FixedSizeBinary(16), false);
+ field.try_with_extension_type(Uuid)?;
+ field.try_extension_type::<Uuid>()?;
+ #[cfg(feature = "canonical_extension_types")]
+ assert_eq!(
+ field.try_canonical_extension_type()?,
+ CanonicalExtensionType::Uuid(Uuid)
+ );
+ Ok(())
+ }
+
+ #[test]
+ #[should_panic(expected = "Field extension type name missing")]
+ fn missing_name() {
+ let field = Field::new("", DataType::FixedSizeBinary(16), false);
+ field.extension_type::<Uuid>();
+ }
+
+ #[test]
+ #[should_panic(expected = "expected FixedSizeBinary(16), found
FixedSizeBinary(8)")]
+ fn invalid_type() {
+ Field::new("", DataType::FixedSizeBinary(8),
false).with_extension_type(Uuid);
+ }
+
+ #[test]
+ #[should_panic(expected = "Uuid extension type expects no metadata")]
+ fn with_metadata() {
+ let field = Field::new("", DataType::FixedSizeBinary(16),
false).with_metadata(
+ [
+ (EXTENSION_TYPE_NAME_KEY.to_owned(), Uuid::NAME.to_owned()),
+ (EXTENSION_TYPE_METADATA_KEY.to_owned(), "".to_owned()),
+ ]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<Uuid>();
+ }
+}
diff --git a/arrow-schema/src/extension/canonical/variable_shape_tensor.rs
b/arrow-schema/src/extension/canonical/variable_shape_tensor.rs
new file mode 100644
index 0000000000..804591776b
--- /dev/null
+++ b/arrow-schema/src/extension/canonical/variable_shape_tensor.rs
@@ -0,0 +1,551 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! VariableShapeTensor
+//!
+//!
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#variable-shape-tensor>
+
+use serde::{Deserialize, Serialize};
+
+use crate::{extension::ExtensionType, ArrowError, DataType, Field};
+
+/// The extension type for `VariableShapeTensor`.
+///
+/// Extension name: `arrow.variable_shape_tensor`.
+///
+/// The storage type of the extension is: StructArray where struct is composed
+/// of data and shape fields describing a single tensor per row:
+/// - `data` is a List holding tensor elements (each list element is a single
+/// tensor). The List’s value type is the value type of the tensor, such as
+/// an integer or floating-point type.
+/// - `shape` is a `FixedSizeList<int32>[ndim]` of the tensor shape where the
+/// size of the list `ndim` is equal to the number of dimensions of the
+/// tensor.
+///
+/// Extension type parameters:
+/// `value_type`: the Arrow data type of individual tensor elements.
+///
+/// Optional parameters describing the logical layout:
+/// - `dim_names`: explicit names to tensor dimensions as an array. The length
+/// of it should be equal to the shape length and equal to the number of
+/// dimensions.
+/// `dim_names` can be used if the dimensions have well-known names and they
+/// map to the physical layout (row-major).
+/// - `permutation`: indices of the desired ordering of the original
+/// dimensions, defined as an array.
+/// The indices contain a permutation of the values `[0, 1, .., N-1]` where
+/// `N` is the number of dimensions. The permutation indicates which
+/// dimension of the logical layout corresponds to which dimension of the
+/// physical tensor (the i-th dimension of the logical view corresponds to
+/// the dimension with number `permutations[i]` of the physical tensor).
+/// Permutation can be useful in case the logical order of the tensor is a
+/// permutation of the physical order (row-major).
+/// When logical and physical layout are equal, the permutation will always
+/// be (`[0, 1, .., N-1]`) and can therefore be left out.
+/// - `uniform_shape`: sizes of individual tensor’s dimensions which are
+/// guaranteed to stay constant in uniform dimensions and can vary in non-
+/// uniform dimensions. This holds over all tensors in the array. Sizes in
+/// uniform dimensions are represented with int32 values, while sizes of the
+/// non-uniform dimensions are not known in advance and are represented with
+/// null. If `uniform_shape` is not provided it is assumed that all
+/// dimensions are non-uniform. An array containing a tensor with shape (2,
+/// 3, 4) and whose first and last dimensions are uniform would have
+/// `uniform_shape` (2, null, 4). This allows for interpreting the tensor
+/// correctly without accounting for uniform dimensions while still
+/// permitting optional optimizations that take advantage of the uniformity.
+///
+///
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#variable-shape-tensor>
+#[derive(Debug, Clone, PartialEq)]
+pub struct VariableShapeTensor {
+ /// The data type of individual tensor elements.
+ value_type: DataType,
+
+ /// The number of dimensions of the tensor.
+ dimensions: usize,
+
+ /// The metadata of this extension type.
+ metadata: VariableShapeTensorMetadata,
+}
+
+impl VariableShapeTensor {
+ /// Returns a new variable shape tensor extension type.
+ ///
+ /// # Error
+ ///
+ /// Return an error if the provided dimension names, permutations or
+ /// uniform shapes are invalid.
+ pub fn try_new(
+ value_type: DataType,
+ dimensions: usize,
+ dimension_names: Option<Vec<String>>,
+ permutations: Option<Vec<usize>>,
+ uniform_shapes: Option<Vec<Option<i32>>>,
+ ) -> Result<Self, ArrowError> {
+ // TODO: are all data types are suitable as value type?
+ VariableShapeTensorMetadata::try_new(
+ dimensions,
+ dimension_names,
+ permutations,
+ uniform_shapes,
+ )
+ .map(|metadata| Self {
+ value_type,
+ dimensions,
+ metadata,
+ })
+ }
+
+ /// Returns the value type of the individual tensor elements.
+ pub fn value_type(&self) -> &DataType {
+ &self.value_type
+ }
+
+ /// Returns the number of dimensions in this variable shape tensor.
+ pub fn dimensions(&self) -> usize {
+ self.dimensions
+ }
+
+ /// Returns the names of the dimensions in this variable shape tensor, if
+ /// set.
+ pub fn dimension_names(&self) -> Option<&[String]> {
+ self.metadata.dimension_names()
+ }
+
+ /// Returns the indices of the desired ordering of the original
+ /// dimensions, if set.
+ pub fn permutations(&self) -> Option<&[usize]> {
+ self.metadata.permutations()
+ }
+
+ /// Returns sizes of individual tensor’s dimensions which are guaranteed
+ /// to stay constant in uniform dimensions and can vary in non-uniform
+ /// dimensions.
+ pub fn uniform_shapes(&self) -> Option<&[Option<i32>]> {
+ self.metadata.uniform_shapes()
+ }
+}
+
+/// Extension type metadata for [`VariableShapeTensor`].
+#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
+pub struct VariableShapeTensorMetadata {
+ /// Explicit names to tensor dimensions.
+ dim_names: Option<Vec<String>>,
+
+ /// Indices of the desired ordering of the original dimensions.
+ permutations: Option<Vec<usize>>,
+
+ /// Sizes of individual tensor’s dimensions which are guaranteed to stay
+ /// constant in uniform dimensions and can vary in non-uniform dimensions.
+ uniform_shape: Option<Vec<Option<i32>>>,
+}
+
+impl VariableShapeTensorMetadata {
+ /// Returns metadata for a variable shape tensor extension type.
+ ///
+ /// # Error
+ ///
+ /// Return an error if the provided dimension names, permutations or
+ /// uniform shapes are invalid.
+ pub fn try_new(
+ dimensions: usize,
+ dimension_names: Option<Vec<String>>,
+ permutations: Option<Vec<usize>>,
+ uniform_shapes: Option<Vec<Option<i32>>>,
+ ) -> Result<Self, ArrowError> {
+ let dim_names = dimension_names.map(|dimension_names| {
+ if dimension_names.len() != dimensions {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "VariableShapeTensor dimension names size mismatch,
expected {dimensions}, found {}", dimension_names.len()
+ )))
+ } else {
+ Ok(dimension_names)
+ }
+ }).transpose()?;
+
+ let permutations = permutations
+ .map(|permutations| {
+ if permutations.len() != dimensions {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "VariableShapeTensor permutations size mismatch,
expected {dimensions}, found {}",
+ permutations.len()
+ )))
+ } else {
+ let mut sorted_permutations = permutations.clone();
+ sorted_permutations.sort_unstable();
+ if (0..dimensions).zip(sorted_permutations).any(|(a, b)| a
!= b) {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "VariableShapeTensor permutations invalid,
expected a permutation of [0, 1, .., N-1], where N is the number of dimensions:
{dimensions}"
+ )))
+ } else {
+ Ok(permutations)
+ }
+ }
+ })
+ .transpose()?;
+
+ let uniform_shape = uniform_shapes
+ .map(|uniform_shapes| {
+ if uniform_shapes.len() != dimensions {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "VariableShapeTensor uniform shapes size mismatch,
expected {dimensions}, found {}",
+ uniform_shapes.len()
+ )))
+ } else {
+ Ok(uniform_shapes)
+ }
+ })
+ .transpose()?;
+
+ Ok(Self {
+ dim_names,
+ permutations,
+ uniform_shape,
+ })
+ }
+
+ /// Returns the names of the dimensions in this variable shape tensor, if
+ /// set.
+ pub fn dimension_names(&self) -> Option<&[String]> {
+ self.dim_names.as_ref().map(AsRef::as_ref)
+ }
+
+ /// Returns the indices of the desired ordering of the original dimensions,
+ /// if set.
+ pub fn permutations(&self) -> Option<&[usize]> {
+ self.permutations.as_ref().map(AsRef::as_ref)
+ }
+
+ /// Returns sizes of individual tensor’s dimensions which are guaranteed
+ /// to stay constant in uniform dimensions and can vary in non-uniform
+ /// dimensions.
+ pub fn uniform_shapes(&self) -> Option<&[Option<i32>]> {
+ self.uniform_shape.as_ref().map(AsRef::as_ref)
+ }
+}
+
+impl ExtensionType for VariableShapeTensor {
+ const NAME: &'static str = "arrow.variable_shape_tensor";
+
+ type Metadata = VariableShapeTensorMetadata;
+
+ fn metadata(&self) -> &Self::Metadata {
+ &self.metadata
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ Some(serde_json::to_string(self.metadata()).expect("metadata
serialization"))
+ }
+
+ fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ metadata.map_or_else(
+ || {
+ Err(ArrowError::InvalidArgumentError(
+ "VariableShapeTensor extension types requires
metadata".to_owned(),
+ ))
+ },
+ |value| {
+ serde_json::from_str(value).map_err(|e| {
+ ArrowError::InvalidArgumentError(format!(
+ "VariableShapeTensor metadata deserialization failed:
{e}"
+ ))
+ })
+ },
+ )
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ let expected = DataType::Struct(
+ [
+ Field::new_list(
+ "data",
+ Field::new_list_field(self.value_type.clone(), false),
+ false,
+ ),
+ Field::new(
+ "shape",
+ DataType::new_fixed_size_list(
+ DataType::Int32,
+ i32::try_from(self.dimensions()).expect("overflow"),
+ false,
+ ),
+ false,
+ ),
+ ]
+ .into_iter()
+ .collect(),
+ );
+ data_type
+ .equals_datatype(&expected)
+ .then_some(())
+ .ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "VariableShapeTensor data type mismatch, expected
{expected}, found {data_type}"
+ ))
+ })
+ }
+
+ fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self,
ArrowError> {
+ match data_type {
+ DataType::Struct(fields)
+ if fields.len() == 2
+ && matches!(fields.find("data"), Some((0, _)))
+ && matches!(fields.find("shape"), Some((1, _))) =>
+ {
+ let shape_field = &fields[1];
+ match shape_field.data_type() {
+ DataType::FixedSizeList(_, list_size) => {
+ let dimensions =
usize::try_from(*list_size).expect("conversion failed");
+ // Make sure the metadata is valid.
+ let metadata =
VariableShapeTensorMetadata::try_new(dimensions, metadata.dim_names,
metadata.permutations, metadata.uniform_shape)?;
+ let data_field = &fields[0];
+ match data_field.data_type() {
+ DataType::List(field) => {
+ Ok(Self {
+ value_type: field.data_type().clone(),
+ dimensions,
+ metadata
+ })
+ }
+ data_type =>
Err(ArrowError::InvalidArgumentError(format!(
+ "VariableShapeTensor data type mismatch,
expected List for data field, found {data_type}"
+ ))),
+ }
+ }
+ data_type => Err(ArrowError::InvalidArgumentError(format!(
+ "VariableShapeTensor data type mismatch, expected
FixedSizeList for shape field, found {data_type}"
+ ))),
+ }
+ }
+ data_type => Err(ArrowError::InvalidArgumentError(format!(
+ "VariableShapeTensor data type mismatch, expected Struct with
2 fields (data and shape), found {data_type}"
+ ))),
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ #[cfg(feature = "canonical_extension_types")]
+ use crate::extension::CanonicalExtensionType;
+ use crate::{
+ extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
+ Field,
+ };
+
+ use super::*;
+
+ #[test]
+ fn valid() -> Result<(), ArrowError> {
+ let variable_shape_tensor = VariableShapeTensor::try_new(
+ DataType::Float32,
+ 3,
+ Some(vec!["C".to_owned(), "H".to_owned(), "W".to_owned()]),
+ Some(vec![2, 0, 1]),
+ Some(vec![Some(400), None, Some(3)]),
+ )?;
+ let mut field = Field::new_struct(
+ "",
+ vec![
+ Field::new_list(
+ "data",
+ Field::new_list_field(DataType::Float32, false),
+ false,
+ ),
+ Field::new_fixed_size_list(
+ "shape",
+ Field::new("", DataType::Int32, false),
+ 3,
+ false,
+ ),
+ ],
+ false,
+ );
+ field.try_with_extension_type(variable_shape_tensor.clone())?;
+ assert_eq!(
+ field.try_extension_type::<VariableShapeTensor>()?,
+ variable_shape_tensor
+ );
+ #[cfg(feature = "canonical_extension_types")]
+ assert_eq!(
+ field.try_canonical_extension_type()?,
+ CanonicalExtensionType::VariableShapeTensor(variable_shape_tensor)
+ );
+ Ok(())
+ }
+
+ #[test]
+ #[should_panic(expected = "Field extension type name missing")]
+ fn missing_name() {
+ let field = Field::new_struct(
+ "",
+ vec![
+ Field::new_list(
+ "data",
+ Field::new_list_field(DataType::Float32, false),
+ false,
+ ),
+ Field::new_fixed_size_list(
+ "shape",
+ Field::new("", DataType::Int32, false),
+ 3,
+ false,
+ ),
+ ],
+ false,
+ )
+ .with_metadata(
+ [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "{}".to_owned())]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<VariableShapeTensor>();
+ }
+
+ #[test]
+ #[should_panic(expected = "VariableShapeTensor data type mismatch,
expected Struct")]
+ fn invalid_type() {
+ let variable_shape_tensor =
+ VariableShapeTensor::try_new(DataType::Int32, 3, None, None,
None).unwrap();
+ let field = Field::new_struct(
+ "",
+ vec![
+ Field::new_list(
+ "data",
+ Field::new_list_field(DataType::Float32, false),
+ false,
+ ),
+ Field::new_fixed_size_list(
+ "shape",
+ Field::new("", DataType::Int32, false),
+ 3,
+ false,
+ ),
+ ],
+ false,
+ );
+ field.with_extension_type(variable_shape_tensor);
+ }
+
+ #[test]
+ #[should_panic(expected = "VariableShapeTensor extension types requires
metadata")]
+ fn missing_metadata() {
+ let field = Field::new_struct(
+ "",
+ vec![
+ Field::new_list(
+ "data",
+ Field::new_list_field(DataType::Float32, false),
+ false,
+ ),
+ Field::new_fixed_size_list(
+ "shape",
+ Field::new("", DataType::Int32, false),
+ 3,
+ false,
+ ),
+ ],
+ false,
+ )
+ .with_metadata(
+ [(
+ EXTENSION_TYPE_NAME_KEY.to_owned(),
+ VariableShapeTensor::NAME.to_owned(),
+ )]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<VariableShapeTensor>();
+ }
+
+ #[test]
+ #[should_panic(expected = "VariableShapeTensor metadata deserialization
failed: invalid type:")]
+ fn invalid_metadata() {
+ let field = Field::new_struct(
+ "",
+ vec![
+ Field::new_list(
+ "data",
+ Field::new_list_field(DataType::Float32, false),
+ false,
+ ),
+ Field::new_fixed_size_list(
+ "shape",
+ Field::new("", DataType::Int32, false),
+ 3,
+ false,
+ ),
+ ],
+ false,
+ )
+ .with_metadata(
+ [
+ (
+ EXTENSION_TYPE_NAME_KEY.to_owned(),
+ VariableShapeTensor::NAME.to_owned(),
+ ),
+ (
+ EXTENSION_TYPE_METADATA_KEY.to_owned(),
+ r#"{ "dim_names": [1, null, 3, 4] }"#.to_owned(),
+ ),
+ ]
+ .into_iter()
+ .collect(),
+ );
+ field.extension_type::<VariableShapeTensor>();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "VariableShapeTensor dimension names size mismatch,
expected 3, found 2"
+ )]
+ fn invalid_metadata_dimension_names() {
+ VariableShapeTensor::try_new(
+ DataType::Float32,
+ 3,
+ Some(vec!["a".to_owned(), "b".to_owned()]),
+ None,
+ None,
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "VariableShapeTensor permutations size mismatch, expected
3, found 2"
+ )]
+ fn invalid_metadata_permutations_len() {
+ VariableShapeTensor::try_new(DataType::Float32, 3, None, Some(vec![1,
0]), None).unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "VariableShapeTensor permutations invalid, expected a
permutation of [0, 1, .., N-1], where N is the number of dimensions: 3"
+ )]
+ fn invalid_metadata_permutations_values() {
+ VariableShapeTensor::try_new(DataType::Float32, 3, None, Some(vec![4,
3, 2]), None)
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "VariableShapeTensor uniform shapes size mismatch, expected
3, found 2"
+ )]
+ fn invalid_metadata_uniform_shapes() {
+ VariableShapeTensor::try_new(DataType::Float32, 3, None, None,
Some(vec![None, Some(1)]))
+ .unwrap();
+ }
+}
diff --git a/arrow-schema/src/extension/mod.rs
b/arrow-schema/src/extension/mod.rs
new file mode 100644
index 0000000000..c5119873af
--- /dev/null
+++ b/arrow-schema/src/extension/mod.rs
@@ -0,0 +1,260 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Extension types.
+//!
+//! <div class="warning">This module is experimental. There might be breaking
changes between minor releases.</div>
+
+#[cfg(feature = "canonical_extension_types")]
+mod canonical;
+#[cfg(feature = "canonical_extension_types")]
+pub use canonical::*;
+
+use crate::{ArrowError, DataType};
+
+/// The metadata key for the string name identifying an [`ExtensionType`].
+pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name";
+
+/// The metadata key for a serialized representation of the [`ExtensionType`]
+/// necessary to reconstruct the custom type.
+pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata";
+
+/// Extension types.
+///
+/// User-defined “extension” types can be defined setting certain key value
+/// pairs in the [`Field`] metadata structure. These extension keys are:
+/// - [`EXTENSION_TYPE_NAME_KEY`]
+/// - [`EXTENSION_TYPE_METADATA_KEY`]
+///
+/// Canonical extension types support in this crate requires the
+/// `canonical_extension_types` feature.
+///
+/// Extension types may or may not use the [`EXTENSION_TYPE_METADATA_KEY`]
+/// field.
+///
+/// # Example
+///
+/// The example below demonstrates how to implement this trait for a `Uuid`
+/// type. Note this is not the canonical extension type for `Uuid`, which does
+/// not include information about the `Uuid` version.
+///
+/// ```
+/// # use arrow_schema::ArrowError;
+/// # fn main() -> Result<(), ArrowError> {
+/// use arrow_schema::{DataType, extension::ExtensionType, Field};
+/// use std::{fmt, str::FromStr};
+///
+/// /// The different Uuid versions.
+/// #[derive(Clone, Copy, Debug, PartialEq)]
+/// enum UuidVersion {
+/// V1,
+/// V2,
+/// V3,
+/// V4,
+/// V5,
+/// V6,
+/// V7,
+/// V8,
+/// }
+///
+/// // We'll use `Display` to serialize.
+/// impl fmt::Display for UuidVersion {
+/// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+/// write!(
+/// f,
+/// "{}",
+/// match self {
+/// Self::V1 => "V1",
+/// Self::V2 => "V2",
+/// Self::V3 => "V3",
+/// Self::V4 => "V4",
+/// Self::V5 => "V5",
+/// Self::V6 => "V6",
+/// Self::V7 => "V7",
+/// Self::V8 => "V8",
+/// }
+/// )
+/// }
+/// }
+///
+/// // And `FromStr` to deserialize.
+/// impl FromStr for UuidVersion {
+/// type Err = ArrowError;
+///
+/// fn from_str(s: &str) -> Result<Self, Self::Err> {
+/// match s {
+/// "V1" => Ok(Self::V1),
+/// "V2" => Ok(Self::V2),
+/// "V3" => Ok(Self::V3),
+/// "V4" => Ok(Self::V4),
+/// "V5" => Ok(Self::V5),
+/// "V6" => Ok(Self::V6),
+/// "V7" => Ok(Self::V7),
+/// "V8" => Ok(Self::V8),
+/// _ => Err(ArrowError::ParseError("Invalid
UuidVersion".to_owned())),
+/// }
+/// }
+/// }
+///
+/// /// This is the extension type, not the container for Uuid values. It
+/// /// stores the Uuid version (this is the metadata of this extension type).
+/// #[derive(Clone, Copy, Debug, PartialEq)]
+/// struct Uuid(UuidVersion);
+///
+/// impl ExtensionType for Uuid {
+/// // We use a namespace as suggested by the specification.
+/// const NAME: &'static str = "myorg.example.uuid";
+///
+/// // The metadata type is the Uuid version.
+/// type Metadata = UuidVersion;
+///
+/// // We just return a reference to the Uuid version.
+/// fn metadata(&self) -> &Self::Metadata {
+/// &self.0
+/// }
+///
+/// // We use the `Display` implementation to serialize the Uuid
+/// // version.
+/// fn serialize_metadata(&self) -> Option<String> {
+/// Some(self.0.to_string())
+/// }
+///
+/// // We use the `FromStr` implementation to deserialize the Uuid
+/// // version.
+/// fn deserialize_metadata(metadata: Option<&str>) ->
Result<Self::Metadata, ArrowError> {
+/// metadata.map_or_else(
+/// || {
+/// Err(ArrowError::InvalidArgumentError(
+/// "Uuid extension type metadata missing".to_owned(),
+/// ))
+/// },
+/// str::parse,
+/// )
+/// }
+///
+/// // The only supported data type is `FixedSizeBinary(16)`.
+/// fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+/// match data_type {
+/// DataType::FixedSizeBinary(16) => Ok(()),
+/// data_type => Err(ArrowError::InvalidArgumentError(format!(
+/// "Uuid data type mismatch, expected FixedSizeBinary(16),
found {data_type}"
+/// ))),
+/// }
+/// }
+///
+/// // We should always check if the data type is supported before
+/// // constructing the extension type.
+/// fn try_new(data_type: &DataType, metadata: Self::Metadata) ->
Result<Self, ArrowError> {
+/// let uuid = Self(metadata);
+/// uuid.supports_data_type(data_type)?;
+/// Ok(uuid)
+/// }
+/// }
+///
+/// // We can now construct the extension type.
+/// let uuid_v1 = Uuid(UuidVersion::V1);
+///
+/// // And add it to a field.
+/// let mut field =
+/// Field::new("", DataType::FixedSizeBinary(16),
false).with_extension_type(uuid_v1);
+///
+/// // And extract it from this field.
+/// assert_eq!(field.try_extension_type::<Uuid>()?, uuid_v1);
+///
+/// // When we try to add this to a field with an unsupported data type we
+/// // get an error.
+/// let result = Field::new("", DataType::Null,
false).try_with_extension_type(uuid_v1);
+/// assert!(result.is_err());
+/// # Ok(()) }
+/// ```
+///
+/// <https://arrow.apache.org/docs/format/Columnar.html#extension-types>
+///
+/// [`Field`]: crate::Field
+pub trait ExtensionType: Sized {
+ /// The name identifying this extension type.
+ ///
+ /// This is the string value that is used for the
+ /// [`EXTENSION_TYPE_NAME_KEY`] in the [`Field::metadata`] of a [`Field`]
+ /// to identify this extension type.
+ ///
+ /// We recommend that you use a “namespace”-style prefix for extension
+ /// type names to minimize the possibility of conflicts with multiple Arrow
+ /// readers and writers in the same application. For example, use
+ /// `myorg.name_of_type` instead of simply `name_of_type`.
+ ///
+ /// Extension names beginning with `arrow.` are reserved for canonical
+ /// extension types, they should not be used for third-party extension
+ /// types.
+ ///
+ /// Extension names are case-sensitive.
+ ///
+ /// [`Field`]: crate::Field
+ /// [`Field::metadata`]: crate::Field::metadata
+ const NAME: &'static str;
+
+ /// The metadata type of this extension type.
+ ///
+ /// Implementations can use strongly or loosly typed data structures here
+ /// depending on the complexity of the metadata.
+ ///
+ /// Implementations can also use `Self` here if the extension type can be
+ /// constructed directly from its metadata.
+ ///
+ /// If an extension type defines no metadata it should use `()` to indicate
+ /// this.
+ type Metadata;
+
+ /// Returns a reference to the metadata of this extension type, or `&()` if
+ /// if this extension type defines no metadata (`Self::Metadata=()`).
+ fn metadata(&self) -> &Self::Metadata;
+
+ /// Returns the serialized representation of the metadata of this extension
+ /// type, or `None` if this extension type defines no metadata
+ /// (`Self::Metadata=()`).
+ ///
+ /// This is string value that is used for the
+ /// [`EXTENSION_TYPE_METADATA_KEY`] in the [`Field::metadata`] of a
+ /// [`Field`].
+ ///
+ /// [`Field`]: crate::Field
+ /// [`Field::metadata`]: crate::Field::metadata
+ fn serialize_metadata(&self) -> Option<String>;
+
+ /// Deserialize the metadata of this extension type from the serialized
+ /// representation of the metadata. An extension type that defines no
+ /// metadata should expect `None` for the serialized metadata and return
+ /// `Ok(())`.
+ ///
+ /// This function should return an error when
+ /// - expected metadata is missing (for extensions types with non-optional
+ /// metadata)
+ /// - unexpected metadata is set (for extension types without metadata)
+ /// - deserialization of metadata fails
+ fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError>;
+
+ /// Returns `OK())` iff the given data type is supported by this extension
+ /// type.
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError>;
+
+ /// Construct this extension type for a field with the given data type and
+ /// metadata.
+ ///
+ /// This should return an error if the given data type is not supported by
+ /// this extension type.
+ fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self,
ArrowError>;
+}
diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs
index 13bb7abf51..dbd671a62a 100644
--- a/arrow-schema/src/field.rs
+++ b/arrow-schema/src/field.rs
@@ -22,8 +22,13 @@ use std::hash::{Hash, Hasher};
use std::sync::Arc;
use crate::datatype::DataType;
+#[cfg(feature = "canonical_extension_types")]
+use crate::extension::CanonicalExtensionType;
use crate::schema::SchemaBuilder;
-use crate::{Fields, UnionFields, UnionMode};
+use crate::{
+ extension::{ExtensionType, EXTENSION_TYPE_METADATA_KEY,
EXTENSION_TYPE_NAME_KEY},
+ Fields, UnionFields, UnionMode,
+};
/// A reference counted [`Field`]
pub type FieldRef = Arc<Field>;
@@ -350,6 +355,167 @@ impl Field {
self
}
+ /// Returns the extension type name of this [`Field`], if set.
+ ///
+ /// This returns the value of [`EXTENSION_TYPE_NAME_KEY`], if set in
+ /// [`Field::metadata`]. If the key is missing, there is no extension type
+ /// name and this returns `None`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_NAME_KEY,
Field};
+ ///
+ /// let field = Field::new("", DataType::Null, false);
+ /// assert_eq!(field.extension_type_name(), None);
+ ///
+ /// let field = Field::new("", DataType::Null, false).with_metadata(
+ /// [(EXTENSION_TYPE_NAME_KEY.to_owned(), "example".to_owned())]
+ /// .into_iter()
+ /// .collect(),
+ /// );
+ /// assert_eq!(field.extension_type_name(), Some("example"));
+ /// ```
+ pub fn extension_type_name(&self) -> Option<&str> {
+ self.metadata()
+ .get(EXTENSION_TYPE_NAME_KEY)
+ .map(String::as_ref)
+ }
+
+ /// Returns the extension type metadata of this [`Field`], if set.
+ ///
+ /// This returns the value of [`EXTENSION_TYPE_METADATA_KEY`], if set in
+ /// [`Field::metadata`]. If the key is missing, there is no extension type
+ /// metadata and this returns `None`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_METADATA_KEY,
Field};
+ ///
+ /// let field = Field::new("", DataType::Null, false);
+ /// assert_eq!(field.extension_type_metadata(), None);
+ ///
+ /// let field = Field::new("", DataType::Null, false).with_metadata(
+ /// [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "example".to_owned())]
+ /// .into_iter()
+ /// .collect(),
+ /// );
+ /// assert_eq!(field.extension_type_metadata(), Some("example"));
+ /// ```
+ pub fn extension_type_metadata(&self) -> Option<&str> {
+ self.metadata()
+ .get(EXTENSION_TYPE_METADATA_KEY)
+ .map(String::as_ref)
+ }
+
+ /// Returns an instance of the given [`ExtensionType`] of this [`Field`],
+ /// if set in the [`Field::metadata`].
+ ///
+ /// # Error
+ ///
+ /// Returns an error if
+ /// - this field does not have the name of this extension type
+ /// ([`ExtensionType::NAME`]) in the [`Field::metadata`] (mismatch or
+ /// missing)
+ /// - the deserialization of the metadata
+ /// ([`ExtensionType::deserialize_metadata`]) fails
+ /// - the construction of the extension type ([`ExtensionType::try_new`])
+ /// fail (for example when the [`Field::data_type`] is not supported by
+ /// the extension type ([`ExtensionType::supports_data_type`]))
+ pub fn try_extension_type<E: ExtensionType>(&self) -> Result<E,
ArrowError> {
+ // Check the extension name in the metadata
+ match self.extension_type_name() {
+ // It should match the name of the given extension type
+ Some(name) if name == E::NAME => {
+ // Deserialize the metadata and try to construct the extension
+ // type
+ E::deserialize_metadata(self.extension_type_metadata())
+ .and_then(|metadata| E::try_new(self.data_type(),
metadata))
+ }
+ // Name mismatch
+ Some(name) => Err(ArrowError::InvalidArgumentError(format!(
+ "Field extension type name mismatch, expected {}, found
{name}",
+ E::NAME
+ ))),
+ // Name missing
+ None => Err(ArrowError::InvalidArgumentError(
+ "Field extension type name missing".to_owned(),
+ )),
+ }
+ }
+
+ /// Returns an instance of the given [`ExtensionType`] of this [`Field`],
+ /// panics if this [`Field`] does not have this extension type.
+ ///
+ /// # Panic
+ ///
+ /// This calls [`Field::try_extension_type`] and panics when it returns an
+ /// error.
+ pub fn extension_type<E: ExtensionType>(&self) -> E {
+ self.try_extension_type::<E>()
+ .unwrap_or_else(|e| panic!("{e}"))
+ }
+
+ /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
+ /// and [`ExtensionType::metadata`] of the given [`ExtensionType`], if the
+ /// given extension type supports the [`Field::data_type`] of this field
+ /// ([`ExtensionType::supports_data_type`]).
+ ///
+ /// If the given extension type defines no metadata, a previously set
+ /// value of [`EXTENSION_TYPE_METADATA_KEY`] is cleared.
+ ///
+ /// # Error
+ ///
+ /// This functions returns an error if the data type of this field does not
+ /// match any of the supported storage types of the given extension type.
+ pub fn try_with_extension_type<E: ExtensionType>(
+ &mut self,
+ extension_type: E,
+ ) -> Result<(), ArrowError> {
+ // Make sure the data type of this field is supported
+ extension_type.supports_data_type(&self.data_type)?;
+
+ self.metadata
+ .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned());
+ match extension_type.serialize_metadata() {
+ Some(metadata) => self
+ .metadata
+ .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata),
+ // If this extension type has no metadata, we make sure to
+ // clear previously set metadata.
+ None => self.metadata.remove(EXTENSION_TYPE_METADATA_KEY),
+ };
+
+ Ok(())
+ }
+
+ /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
+ /// and [`ExtensionType::metadata`] of the given [`ExtensionType`].
+ ///
+ /// # Panics
+ ///
+ /// This calls [`Field::try_with_extension_type`] and panics when it
+ /// returns an error.
+ pub fn with_extension_type<E: ExtensionType>(mut self, extension_type: E)
-> Self {
+ self.try_with_extension_type(extension_type)
+ .unwrap_or_else(|e| panic!("{e}"));
+ self
+ }
+
+ /// Returns the [`CanonicalExtensionType`] of this [`Field`], if set.
+ ///
+ /// # Error
+ ///
+ /// Returns an error if
+ /// - this field does have a canonical extension type (mismatch or missing)
+ /// - the canonical extension is not supported
+ /// - the construction of the extension type fails
+ #[cfg(feature = "canonical_extension_types")]
+ pub fn try_canonical_extension_type(&self) ->
Result<CanonicalExtensionType, ArrowError> {
+ CanonicalExtensionType::try_from(self)
+ }
+
/// Indicates whether this [`Field`] supports null values.
#[inline]
pub const fn is_nullable(&self) -> bool {
diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs
index d06382fbcd..a83e23e275 100644
--- a/arrow-schema/src/lib.rs
+++ b/arrow-schema/src/lib.rs
@@ -25,6 +25,7 @@ use std::fmt::Display;
mod datatype_parse;
mod error;
pub use error::*;
+pub mod extension;
mod field;
pub use field::*;
mod fields;
diff --git a/arrow-select/src/dictionary.rs b/arrow-select/src/dictionary.rs
index 2a532600b6..c363b99920 100644
--- a/arrow-select/src/dictionary.rs
+++ b/arrow-select/src/dictionary.rs
@@ -315,7 +315,7 @@ mod tests {
assert_eq!(merged.values.as_ref(), &expected);
assert_eq!(merged.key_mappings.len(), 2);
assert_eq!(&merged.key_mappings[0], &[0, 0, 0, 1, 0]);
- assert_eq!(&merged.key_mappings[1], &[]);
+ assert_eq!(&merged.key_mappings[1], &[] as &[i32; 0]);
}
#[test]
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index 1b01dcd25e..88231b7f61 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -80,6 +80,7 @@ force_validate = ["arrow-array/force_validate",
"arrow-data/force_validate"]
# Enable ffi support
ffi = ["arrow-schema/ffi", "arrow-data/ffi", "arrow-array/ffi"]
chrono-tz = ["arrow-array/chrono-tz"]
+canonical_extension_types = ["arrow-schema/canonical_extension_types"]
[dev-dependencies]
chrono = { workspace = true }
diff --git a/arrow/README.md b/arrow/README.md
index 79aefaae90..64d9eb980e 100644
--- a/arrow/README.md
+++ b/arrow/README.md
@@ -61,6 +61,7 @@ The `arrow` crate provides the following features which may
be enabled in your `
- `chrono-tz` - support of parsing timezone using
[chrono-tz](https://docs.rs/chrono-tz/0.6.0/chrono_tz/)
- `ffi` - bindings for the Arrow C [C Data
Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
- `pyarrow` - bindings for pyo3 to call arrow-rs from python
+- `canonical_extension_types` - definitions for [canonical extension
types](https://arrow.apache.org/docs/format/CanonicalExtensions.html#format-canonical-extensions)
## Arrow Feature Status
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 54992d864d..00d4c5b750 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -103,6 +103,8 @@ default = ["arrow", "snap", "brotli", "flate2", "lz4",
"zstd", "base64", "simdut
lz4 = ["lz4_flex"]
# Enable arrow reader/writer APIs
arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data",
"arrow-schema", "arrow-select", "arrow-ipc"]
+# Enable support for arrow canonical extension types
+arrow_canonical_extension_types = ["arrow-schema?/canonical_extension_types"]
# Enable CLI tools
cli = ["json", "base64", "clap", "arrow-csv", "serde"]
# Enable JSON APIs
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 8be2439002..8b3e92251b 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -23,6 +23,8 @@ use std::collections::HashMap;
use std::sync::Arc;
use arrow_ipc::writer;
+#[cfg(feature = "arrow_canonical_extension_types")]
+use arrow_schema::extension::{Json, Uuid};
use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit};
use crate::basic::{
@@ -380,12 +382,26 @@ pub fn parquet_to_arrow_field(parquet_column:
&ColumnDescriptor) -> Result<Field
let mut ret = Field::new(parquet_column.name(), field.arrow_type,
field.nullable);
let basic_info = parquet_column.self_type().get_basic_info();
+ let mut meta = HashMap::with_capacity(if cfg!(feature =
"arrow_canonical_extension_types") {
+ 2
+ } else {
+ 1
+ });
if basic_info.has_id() {
- let mut meta = HashMap::with_capacity(1);
meta.insert(
PARQUET_FIELD_ID_META_KEY.to_string(),
basic_info.id().to_string(),
);
+ }
+ #[cfg(feature = "arrow_canonical_extension_types")]
+ if let Some(logical_type) = basic_info.logical_type() {
+ match logical_type {
+ LogicalType::Uuid => ret.try_with_extension_type(Uuid)?,
+ LogicalType::Json => ret.try_with_extension_type(Json::default())?,
+ _ => {}
+ }
+ }
+ if !meta.is_empty() {
ret.set_metadata(meta);
}
@@ -590,6 +606,16 @@ fn arrow_to_parquet_type(field: &Field, coerce_types:
bool) -> Result<Type> {
.with_repetition(repetition)
.with_id(id)
.with_length(*length)
+ .with_logical_type(
+ #[cfg(feature = "arrow_canonical_extension_types")]
+ // If set, map arrow uuid extension type to parquet uuid
logical type.
+ field
+ .try_extension_type::<Uuid>()
+ .ok()
+ .map(|_| LogicalType::Uuid),
+ #[cfg(not(feature = "arrow_canonical_extension_types"))]
+ None,
+ )
.build()
}
DataType::BinaryView => Type::primitive_type_builder(name,
PhysicalType::BYTE_ARRAY)
@@ -623,13 +649,35 @@ fn arrow_to_parquet_type(field: &Field, coerce_types:
bool) -> Result<Type> {
}
DataType::Utf8 | DataType::LargeUtf8 => {
Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY)
- .with_logical_type(Some(LogicalType::String))
+ .with_logical_type({
+ #[cfg(feature = "arrow_canonical_extension_types")]
+ {
+ // Use the Json logical type if the canonical Json
+ // extension type is set on this field.
+ field
+ .try_extension_type::<Json>()
+ .map_or(Some(LogicalType::String), |_|
Some(LogicalType::Json))
+ }
+ #[cfg(not(feature = "arrow_canonical_extension_types"))]
+ Some(LogicalType::String)
+ })
.with_repetition(repetition)
.with_id(id)
.build()
}
DataType::Utf8View => Type::primitive_type_builder(name,
PhysicalType::BYTE_ARRAY)
- .with_logical_type(Some(LogicalType::String))
+ .with_logical_type({
+ #[cfg(feature = "arrow_canonical_extension_types")]
+ {
+ // Use the Json logical type if the canonical Json
+ // extension type is set on this field.
+ field
+ .try_extension_type::<Json>()
+ .map_or(Some(LogicalType::String), |_|
Some(LogicalType::Json))
+ }
+ #[cfg(not(feature = "arrow_canonical_extension_types"))]
+ Some(LogicalType::String)
+ })
.with_repetition(repetition)
.with_id(id)
.build(),
@@ -2163,4 +2211,52 @@ mod tests {
fn test_get_arrow_schema_from_metadata() {
assert!(get_arrow_schema_from_metadata("").is_err());
}
+
+ #[test]
+ #[cfg(feature = "arrow_canonical_extension_types")]
+ fn arrow_uuid_to_parquet_uuid() -> Result<()> {
+ let arrow_schema = Schema::new(vec![Field::new(
+ "uuid",
+ DataType::FixedSizeBinary(16),
+ false,
+ )
+ .with_extension_type(Uuid)]);
+
+ let parquet_schema =
ArrowSchemaConverter::new().convert(&arrow_schema)?;
+
+ assert_eq!(
+ parquet_schema.column(0).logical_type(),
+ Some(LogicalType::Uuid)
+ );
+
+ // TODO: roundtrip
+ // let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?;
+ // assert_eq!(arrow_schema.field(0).try_extension_type::<Uuid>()?,
Uuid);
+
+ Ok(())
+ }
+
+ #[test]
+ #[cfg(feature = "arrow_canonical_extension_types")]
+ fn arrow_json_to_parquet_json() -> Result<()> {
+ let arrow_schema = Schema::new(vec![
+ Field::new("json", DataType::Utf8,
false).with_extension_type(Json::default())
+ ]);
+
+ let parquet_schema =
ArrowSchemaConverter::new().convert(&arrow_schema)?;
+
+ assert_eq!(
+ parquet_schema.column(0).logical_type(),
+ Some(LogicalType::Json)
+ );
+
+ // TODO: roundtrip
+ // let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?;
+ // assert_eq!(
+ // arrow_schema.field(0).try_extension_type::<Json>()?,
+ // Json::default()
+ // );
+
+ Ok(())
+ }
}