This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d9bb013ba1 Adds ExtensionType for Parquet geospatial WKB arrays (#8943)
d9bb013ba1 is described below
commit d9bb013ba163e460d5f50781be58d418bde24ca0
Author: Blake Orth <[email protected]>
AuthorDate: Mon Dec 15 11:11:50 2025 -0700
Adds ExtensionType for Parquet geospatial WKB arrays (#8943)
# Which issue does this PR close?
This does not fully close the issue, but is an initial component of:
- https://github.com/apache/arrow-rs/issues/8717
# Rationale for this change
To keep PR size digestible, this implements the bi-directional extension
type metadata parsing for Parquet geospatial arrays. An actual array
type that allows users to easily interact with the data can come as a
follow-on PR.
# What changes are included in this PR?
- Implements the ExtensionType for Parquet geospatial logical types
- Adds GeoArrow compatible Metadata type to hold geospatial metadata
- Adds basic tests around geospatial metadata
serialization/deserialization
- Integrates geospatial logical type parsing into schema extensions
# Are these changes tested?
Yes. Targeted unit tests for the metadata and associated parsing have
been implemented. Higher level tests that show a full round-trip to and
from arrays can be included in the array implementation PR.
# Are there any user-facing changes?
Yes. All new public items have been documented, and no API breaking
changes have been made.
cc @alamb @paleolimbot @kylebarron
---
parquet-geospatial/Cargo.toml | 2 +
parquet-geospatial/src/lib.rs | 7 +
parquet-geospatial/src/types.rs | 407 ++++++++++++++++++++++++++++++++++
parquet/src/arrow/schema/extension.rs | 52 +++++
parquet/src/arrow/schema/mod.rs | 7 +-
parquet/src/basic.rs | 82 +++++++
parquet/tests/geospatial.rs | 263 +++++++++++-----------
7 files changed, 689 insertions(+), 131 deletions(-)
diff --git a/parquet-geospatial/Cargo.toml b/parquet-geospatial/Cargo.toml
index 36c12edf0b..471b355dc6 100644
--- a/parquet-geospatial/Cargo.toml
+++ b/parquet-geospatial/Cargo.toml
@@ -31,6 +31,8 @@ rust-version = { workspace = true }
[dependencies]
arrow-schema = { workspace = true }
geo-traits = { version = "0.3" }
+serde = { version = "1.0", default-features = false, features = ["derive"]}
+serde_json = { version = "1.0", default-features = false, features = ["std"]}
wkb = { version = "0.9.1" }
[dev-dependencies]
diff --git a/parquet-geospatial/src/lib.rs b/parquet-geospatial/src/lib.rs
index 2ec60c44cc..7b2b6166a4 100644
--- a/parquet-geospatial/src/lib.rs
+++ b/parquet-geospatial/src/lib.rs
@@ -30,3 +30,10 @@
pub mod bounding;
pub mod interval;
pub mod testing;
+
+mod types;
+
+pub use types::Edges as WkbEdges;
+pub use types::Hint as WkbTypeHint;
+pub use types::Metadata as WkbMetadata;
+pub use types::WkbType;
diff --git a/parquet-geospatial/src/types.rs b/parquet-geospatial/src/types.rs
new file mode 100644
index 0000000000..f19911ad05
--- /dev/null
+++ b/parquet-geospatial/src/types.rs
@@ -0,0 +1,407 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_schema::{ArrowError, DataType, extension::ExtensionType};
+use serde::{Deserialize, Serialize};
+
+/// Hints at the likely Parquet geospatial logical type represented by a
[`Metadata`].
+///
+/// Based on the `algorithm` field:
+/// - [`Hint::Geometry`]: WKB format with linear/planar edge interpolation
+/// - [`Hint::Geography`]: WKB format with explicit non-linear/non-planar edge
interpolation
+///
+/// See the [Parquet Geospatial
specification](https://github.com/apache/parquet-format/blob/master/Geospatial.md)
+/// for more details.
+#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
+pub enum Hint {
+ /// Geospatial features in WKB format with linear/planar edge interpolation
+ Geometry,
+ /// Geospatial features in WKB format with explicit non-linear/non-planar
edge interpolation
+ Geography,
+}
+
+/// The edge interpolation algorithms used with `GEOMETRY` logical types.
+#[derive(Default, Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum Edges {
+ /// Edges are interpolated as geodesics on a sphere.
+ #[default]
+ Spherical,
+ /// <https://en.wikipedia.org/wiki/Vincenty%27s_formulae>
+ Vincenty,
+ /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local
geometry. US Naval Oceanographic Office, 1970
+ Thomas,
+ /// Thomas, Paul D. Mathematical models for navigation systems. US Naval
Oceanographic Office, 1965.
+ Andoyer,
+ /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87
(2013): 43-55
+ Karney,
+}
+
+/// The metadata associated with a [`WkbType`].
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
+pub struct Metadata {
+ /// The Coordinate Reference System (CRS) of the [`WkbType`], if present.
+ ///
+ /// This may be a raw string value (e.g., "EPSG:3857") or a JSON object
(e.g., PROJJSON).
+ /// Note: Common lon/lat CRS representations (EPSG:4326, OGC:CRS84) are
canonicalized
+ /// to `None` during serialization to match Parquet conventions.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub crs: Option<serde_json::Value>,
+ /// The edge interpolation algorithm of the [`WkbType`], if present.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub algorithm: Option<Edges>,
+}
+
+impl Metadata {
+ /// Constructs a new [`Metadata`] with the given CRS and algorithm.
+ ///
+ /// If a CRS is provided, and can be parsed as JSON, it will be stored as
a JSON object instead
+ /// of its string representation.
+ pub fn new(crs: Option<&str>, algorithm: Option<Edges>) -> Self {
+ let crs = crs.map(|c| match serde_json::from_str(c) {
+ Ok(crs) => crs,
+ Err(_) => serde_json::Value::String(c.to_string()),
+ });
+
+ Self { crs, algorithm }
+ }
+
+ /// Returns a [`Hint`] to the likely underlying Logical Type that this
[`Metadata`] represents.
+ pub fn type_hint(&self) -> Hint {
+ match &self.algorithm {
+ Some(_) => Hint::Geography,
+ None => Hint::Geometry,
+ }
+ }
+
+ /// Detect if the CRS is a common representation of lon/lat on the
standard WGS84 ellipsoid
+ fn crs_is_lon_lat(&self) -> bool {
+ use serde_json::Value;
+
+ let Some(crs) = &self.crs else {
+ return false;
+ };
+
+ match crs {
+ Value::String(s) if s == "EPSG:4326" || s == "OGC:CRS84" => true,
+ Value::Object(_) => match (&crs["id"]["authority"],
&crs["id"]["code"]) {
+ (Value::String(auth), Value::String(code)) if auth == "OGC" &&
code == "CRS84" => {
+ true
+ }
+ (Value::String(auth), Value::String(code)) if auth == "EPSG"
&& code == "4326" => {
+ true
+ }
+ (Value::String(auth), Value::Number(code))
+ if auth == "EPSG" && code.as_i64() == Some(4326) =>
+ {
+ true
+ }
+ _ => false,
+ },
+ _ => false,
+ }
+ }
+}
+
+/// Well-Known Binary (WKB) [`ExtensionType`] for geospatial data.
+///
+/// Represents the canonical Arrow Extension Type for storing
+/// [GeoArrow](https://github.com/geoarrow/geoarrow) data.
+#[derive(Debug, Default)]
+pub struct WkbType(Metadata);
+
+impl WkbType {
+ /// Constructs a new [`WkbType`] with the given [`Metadata`].
+ ///
+ /// If `None` is provided, default (empty) metadata is used.
+ pub fn new(metadata: Option<Metadata>) -> Self {
+ Self(metadata.unwrap_or_default())
+ }
+}
+
+type ArrowResult<T> = Result<T, ArrowError>;
+impl ExtensionType for WkbType {
+ const NAME: &'static str = "geoarrow.wkb";
+
+ type Metadata = Metadata;
+
+ fn metadata(&self) -> &Self::Metadata {
+ &self.0
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ let md = if self.0.crs_is_lon_lat() {
+ &Metadata {
+ crs: None, // lon/lat CRS is canonicalized as omitted (None)
for Parquet
+ algorithm: self.0.algorithm,
+ }
+ } else {
+ &self.0
+ };
+
+ serde_json::to_string(md).ok()
+ }
+
+ fn deserialize_metadata(metadata: Option<&str>) ->
ArrowResult<Self::Metadata> {
+ let Some(metadata) = metadata else {
+ return Ok(Self::Metadata::default());
+ };
+
+ serde_json::from_str(metadata).map_err(|e|
ArrowError::JsonError(e.to_string()))
+ }
+
+ fn supports_data_type(&self, data_type: &arrow_schema::DataType) ->
ArrowResult<()> {
+ match data_type {
+ DataType::Binary | DataType::LargeBinary | DataType::BinaryView =>
Ok(()),
+ dt => Err(ArrowError::InvalidArgumentError(format!(
+ "Geometry data type mismatch, expected one of Binary,
LargeBinary, BinaryView. Found {dt}"
+ ))),
+ }
+ }
+
+ fn try_new(data_type: &arrow_schema::DataType, metadata: Self::Metadata)
-> ArrowResult<Self> {
+ let wkb = Self(metadata);
+ wkb.supports_data_type(data_type)?;
+ Ok(wkb)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use arrow_schema::Field;
+
+ /// Test metadata serialization and deserialization with empty/default
metadata
+ #[test]
+ fn test_metadata_empty_roundtrip() -> ArrowResult<()> {
+ let metadata = Metadata::default();
+ let wkb = WkbType::new(Some(metadata));
+
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, "{}");
+
+ let deserialized = WkbType::deserialize_metadata(Some(&serialized))?;
+ assert!(deserialized.crs.is_none());
+ assert!(deserialized.algorithm.is_none());
+
+ Ok(())
+ }
+
+ /// Test metadata serialization with CRS as a simple string
+ #[test]
+ fn test_metadata_crs_string_roundtrip() -> ArrowResult<()> {
+ let metadata = Metadata::new(Some("srid:1234"), None);
+ let wkb = WkbType::new(Some(metadata));
+
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, r#"{"crs":"srid:1234"}"#);
+
+ let deserialized = WkbType::deserialize_metadata(Some(&serialized))?;
+ assert_eq!(
+ deserialized.crs.unwrap(),
+ serde_json::Value::String(String::from("srid:1234"))
+ );
+ assert!(deserialized.algorithm.is_none());
+
+ Ok(())
+ }
+
+ /// Test metadata serialization with CRS as a JSON object
+ #[test]
+ fn test_metadata_crs_json_object_roundtrip() -> ArrowResult<()> {
+ let crs_json =
r#"{"type":"custom_json","properties":{"name":"EPSG:4326"}}"#;
+ let metadata = Metadata::new(Some(crs_json), None);
+ let wkb = WkbType::new(Some(metadata));
+
+ let serialized = wkb.serialize_metadata().unwrap();
+ // Validate by parsing the JSON and checking structure (field order is
not guaranteed)
+ let parsed: serde_json::Value =
serde_json::from_str(&serialized).unwrap();
+ assert_eq!(parsed["crs"]["type"], "custom_json");
+ assert_eq!(parsed["crs"]["properties"]["name"], "EPSG:4326");
+
+ let deserialized = WkbType::deserialize_metadata(Some(&serialized))?;
+
+ // Verify it's a JSON object with expected structure
+ let crs = deserialized.crs.unwrap();
+ assert!(crs.is_object());
+ assert_eq!(crs["type"], "custom_json");
+ assert_eq!(crs["properties"]["name"], "EPSG:4326");
+
+ Ok(())
+ }
+
+ /// Test metadata serialization with algorithm field
+ #[test]
+ fn test_metadata_algorithm_roundtrip() -> ArrowResult<()> {
+ let metadata = Metadata::new(None, Some(Edges::Spherical));
+ let wkb = WkbType::new(Some(metadata));
+
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, r#"{"algorithm":"spherical"}"#);
+
+ let deserialized = WkbType::deserialize_metadata(Some(&serialized))?;
+ assert!(deserialized.crs.is_none());
+ assert_eq!(deserialized.algorithm, Some(Edges::Spherical));
+
+ Ok(())
+ }
+
+ /// Test metadata serialization with both CRS and algorithm
+ #[test]
+ fn test_metadata_full_roundtrip() -> ArrowResult<()> {
+ let metadata = Metadata::new(Some("srid:1234"),
Some(Edges::Spherical));
+ let wkb = WkbType::new(Some(metadata));
+
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized,
r#"{"crs":"srid:1234","algorithm":"spherical"}"#);
+
+ let deserialized = WkbType::deserialize_metadata(Some(&serialized))?;
+ assert_eq!(
+ deserialized.crs.unwrap(),
+ serde_json::Value::String("srid:1234".to_string())
+ );
+ assert_eq!(deserialized.algorithm, Some(Edges::Spherical));
+
+ Ok(())
+ }
+
+ /// Test deserialization of None metadata
+ #[test]
+ fn test_metadata_deserialize_none() -> ArrowResult<()> {
+ let deserialized = WkbType::deserialize_metadata(None)?;
+ assert!(deserialized.crs.is_none());
+ assert!(deserialized.algorithm.is_none());
+ Ok(())
+ }
+
+ /// Test deserialization of invalid JSON
+ #[test]
+ fn test_metadata_deserialize_invalid_json() {
+ let result = WkbType::deserialize_metadata(Some("not valid json {"));
+ assert!(matches!(result, Err(ArrowError::JsonError(_))));
+ }
+
+ /// Test metadata that results in a Geometry type hint
+ #[test]
+ fn test_type_hint_geometry() {
+ let metadata = Metadata::new(None, None);
+ assert!(matches!(metadata.type_hint(), Hint::Geometry));
+ }
+
+ /// Test metadata that results in a Geography type hint
+ #[test]
+ fn test_type_hint_edges_is_geography() {
+ let algorithms = vec![
+ Edges::Spherical,
+ Edges::Vincenty,
+ Edges::Thomas,
+ Edges::Andoyer,
+ Edges::Karney,
+ ];
+ for algo in algorithms {
+ let metadata = Metadata::new(None, Some(algo));
+ assert!(matches!(metadata.type_hint(), Hint::Geography));
+ }
+ }
+
+ /// Test extension type integration using a Field
+ #[test]
+ fn test_extension_type_with_field() -> ArrowResult<()> {
+ let metadata = Metadata::new(Some("srid:1234"), None);
+ let wkb_type = WkbType::new(Some(metadata));
+
+ let mut field = Field::new("geometry", DataType::Binary, false);
+ field.try_with_extension_type(wkb_type)?;
+
+ // Verify we can extract the extension type back
+ let extracted = field.try_extension_type::<WkbType>()?;
+ assert_eq!(
+ extracted.metadata().crs.as_ref().unwrap(),
+ &serde_json::Value::String(String::from("srid:1234"))
+ );
+
+ Ok(())
+ }
+
+ /// Test extension type DataType support
+ #[test]
+ fn test_extension_type_support() -> ArrowResult<()> {
+ let wkb = WkbType::default();
+ // supported types
+ wkb.supports_data_type(&DataType::Binary)?;
+ wkb.supports_data_type(&DataType::LargeBinary)?;
+ wkb.supports_data_type(&DataType::BinaryView)?;
+
+ // reject unsupported types with an error
+ let result = wkb.supports_data_type(&DataType::Utf8);
+ assert!(matches!(result, Err(ArrowError::InvalidArgumentError(_))));
+
+ Ok(())
+ }
+
+ /// Test CRS canonicalization logic for common lon/lat representations
+ #[test]
+ fn test_crs_canonicalization() -> ArrowResult<()> {
+ // EPSG:4326 as string should be omitted
+ let metadata = Metadata::new(Some("EPSG:4326"), None);
+ let wkb = WkbType::new(Some(metadata));
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, "{}");
+
+ // OGC:CRS84 as string should be omitted
+ let metadata = Metadata::new(Some("OGC:CRS84"), None);
+ let wkb = WkbType::new(Some(metadata));
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, "{}");
+
+ // A JSON object that reasonably looks like PROJJSON for EPSG:4326
should be omitted
+ // detect "4326" as a string
+ let crs_json = r#"{"id":{"authority":"EPSG","code":"4326"}}"#;
+ let metadata = Metadata::new(Some(crs_json), None);
+ let wkb = WkbType::new(Some(metadata));
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, "{}");
+
+ // detect 4326 as a number
+ let crs_json = r#"{"id":{"authority":"EPSG","code":4326}}"#;
+ let metadata = Metadata::new(Some(crs_json), None);
+ let wkb = WkbType::new(Some(metadata));
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, "{}");
+
+ // A JSON object that reasonably looks like PROJJSON for OGC:CRS84
should be omitted
+ let crs_json = r#"{"id":{"authority":"OGC","code":"CRS84"}}"#;
+ let metadata = Metadata::new(Some(crs_json), None);
+ let wkb = WkbType::new(Some(metadata));
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, "{}");
+
+ // Other input types should be preserved
+ let metadata = Metadata::new(Some("srid:1234"), None);
+ let wkb = WkbType::new(Some(metadata));
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, r#"{"crs":"srid:1234"}"#);
+
+ // Canonicalization should work with algorithm field
+ let metadata = Metadata::new(Some("EPSG:4326"),
Some(Edges::Spherical));
+ let wkb = WkbType::new(Some(metadata));
+ let serialized = wkb.serialize_metadata().unwrap();
+ assert_eq!(serialized, r#"{"algorithm":"spherical"}"#);
+
+ Ok(())
+ }
+}
diff --git a/parquet/src/arrow/schema/extension.rs
b/parquet/src/arrow/schema/extension.rs
index fe3e856a6c..2471019646 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -55,6 +55,17 @@ pub(crate) fn try_add_extension_type(
LogicalType::Json => {
arrow_field.try_with_extension_type(arrow_schema::extension::Json::default())?;
}
+ #[cfg(feature = "geospatial")]
+ LogicalType::Geometry { crs } => {
+ let md = parquet_geospatial::WkbMetadata::new(crs.as_deref(),
None);
+
arrow_field.try_with_extension_type(parquet_geospatial::WkbType::new(Some(md)))?;
+ }
+ #[cfg(feature = "geospatial")]
+ LogicalType::Geography { crs, algorithm } => {
+ let algorithm = algorithm.map(|a| a.try_as_edges()).transpose()?;
+ let md = parquet_geospatial::WkbMetadata::new(crs.as_deref(),
algorithm);
+
arrow_field.try_with_extension_type(parquet_geospatial::WkbType::new(Some(md)))?;
+ }
_ => {}
};
Ok(arrow_field)
@@ -75,6 +86,10 @@ pub(crate) fn has_extension_type(parquet_type: &Type) ->
bool {
LogicalType::Uuid => true,
#[cfg(feature = "arrow_canonical_extension_types")]
LogicalType::Json => true,
+ #[cfg(feature = "geospatial")]
+ LogicalType::Geometry { .. } => true,
+ #[cfg(feature = "geospatial")]
+ LogicalType::Geography { .. } => true,
_ => false,
}
}
@@ -133,3 +148,40 @@ pub(crate) fn logical_type_for_string(field: &Field) ->
Option<LogicalType> {
pub(crate) fn logical_type_for_string(_field: &Field) -> Option<LogicalType> {
Some(LogicalType::String)
}
+
+#[cfg(feature = "geospatial")]
+pub(crate) fn logical_type_for_binary(field: &Field) -> Option<LogicalType> {
+ use parquet_geospatial::WkbType;
+ use parquet_geospatial::WkbTypeHint;
+
+ match field.extension_type_name() {
+ Some(n) if n == WkbType::NAME => match
field.try_extension_type::<WkbType>() {
+ Ok(wkb_type) => match wkb_type.metadata().type_hint() {
+ WkbTypeHint::Geometry => Some(LogicalType::Geometry {
+ crs: wkb_type.metadata().crs.as_ref().map(|c|
c.to_string()),
+ }),
+ WkbTypeHint::Geography => Some(LogicalType::Geography {
+ crs: wkb_type.metadata().crs.as_ref().map(|c|
c.to_string()),
+ algorithm: wkb_type.metadata().algorithm.map(|a| a.into()),
+ }),
+ },
+ Err(_e) => None,
+ },
+ _ => None,
+ }
+}
+
+#[cfg(not(feature = "geospatial"))]
+pub(crate) fn logical_type_for_binary(field: &Field) -> Option<LogicalType> {
+ None
+}
+
+#[cfg(feature = "geospatial")]
+pub(crate) fn logical_type_for_binary_view(field: &Field) ->
Option<LogicalType> {
+ logical_type_for_binary(field)
+}
+
+#[cfg(not(feature = "geospatial"))]
+pub(crate) fn logical_type_for_binary_view(field: &Field) ->
Option<LogicalType> {
+ None
+}
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index b4a9ba7b7f..b33f9c14dd 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -40,8 +40,9 @@ pub mod virtual_type;
use super::PARQUET_FIELD_ID_META_KEY;
use crate::arrow::ProjectionMask;
use crate::arrow::schema::extension::{
- has_extension_type, logical_type_for_fixed_size_binary,
logical_type_for_string,
- logical_type_for_struct, try_add_extension_type,
+ has_extension_type, logical_type_for_binary, logical_type_for_binary_view,
+ logical_type_for_fixed_size_binary, logical_type_for_string,
logical_type_for_struct,
+ try_add_extension_type,
};
pub(crate) use complex::{ParquetField, ParquetFieldType, VirtualColumnType};
@@ -712,6 +713,7 @@ fn arrow_to_parquet_type(field: &Field, coerce_types: bool)
-> Result<Type> {
Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY)
.with_repetition(repetition)
.with_id(id)
+ .with_logical_type(logical_type_for_binary(field))
.build()
}
DataType::FixedSizeBinary(length) => {
@@ -725,6 +727,7 @@ fn arrow_to_parquet_type(field: &Field, coerce_types: bool)
-> Result<Type> {
DataType::BinaryView => Type::primitive_type_builder(name,
PhysicalType::BYTE_ARRAY)
.with_repetition(repetition)
.with_id(id)
+ .with_logical_type(logical_type_for_binary_view(field))
.build(),
DataType::Decimal32(precision, scale)
| DataType::Decimal64(precision, scale)
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index f06cfdb53f..ba8ffc2e92 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -1016,12 +1016,65 @@ pub enum EdgeInterpolationAlgorithm {
_Unknown(i32),
}
+#[cfg(feature = "geospatial")]
+impl EdgeInterpolationAlgorithm {
+ /// Converts an [`EdgeInterpolationAlgorithm`] into its corresponding
algorithm defined by
+ /// [`parquet_geospatial::WkbEdges`].
+ ///
+ /// This method will only return an Err if the
[`EdgeInterpolationAlgorithm`] is the `_Unknown`
+ /// variant.
+ pub fn try_as_edges(&self) -> Result<parquet_geospatial::WkbEdges> {
+ match &self {
+ Self::SPHERICAL => Ok(parquet_geospatial::WkbEdges::Spherical),
+ Self::VINCENTY => Ok(parquet_geospatial::WkbEdges::Vincenty),
+ Self::THOMAS => Ok(parquet_geospatial::WkbEdges::Thomas),
+ Self::ANDOYER => Ok(parquet_geospatial::WkbEdges::Andoyer),
+ Self::KARNEY => Ok(parquet_geospatial::WkbEdges::Karney),
+ unknown => Err(general_err!(
+ "Unknown edge interpolation algorithm: {}",
+ unknown
+ )),
+ }
+ }
+}
+
impl fmt::Display for EdgeInterpolationAlgorithm {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_fmt(format_args!("{0:?}", self))
}
}
+#[cfg(feature = "geospatial")]
+impl From<parquet_geospatial::WkbEdges> for EdgeInterpolationAlgorithm {
+ fn from(value: parquet_geospatial::WkbEdges) -> Self {
+ match value {
+ parquet_geospatial::WkbEdges::Spherical => Self::SPHERICAL,
+ parquet_geospatial::WkbEdges::Vincenty => Self::VINCENTY,
+ parquet_geospatial::WkbEdges::Thomas => Self::THOMAS,
+ parquet_geospatial::WkbEdges::Andoyer => Self::ANDOYER,
+ parquet_geospatial::WkbEdges::Karney => Self::KARNEY,
+ }
+ }
+}
+
+impl FromStr for EdgeInterpolationAlgorithm {
+ type Err = ParquetError;
+
+ fn from_str(s: &str) -> Result<Self> {
+ match s.to_ascii_uppercase().as_str() {
+ "SPHERICAL" => Ok(EdgeInterpolationAlgorithm::SPHERICAL),
+ "VINCENTY" => Ok(EdgeInterpolationAlgorithm::VINCENTY),
+ "THOMAS" => Ok(EdgeInterpolationAlgorithm::THOMAS),
+ "ANDOYER" => Ok(EdgeInterpolationAlgorithm::ANDOYER),
+ "KARNEY" => Ok(EdgeInterpolationAlgorithm::KARNEY),
+ unknown => Err(general_err!(
+ "Unknown edge interpolation algorithm: {}",
+ unknown
+ )),
+ }
+ }
+}
+
impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for
EdgeInterpolationAlgorithm {
fn read_thrift(prot: &mut R) -> Result<Self> {
let val = prot.read_i32()?;
@@ -2461,6 +2514,35 @@ mod tests {
assert_eq!(EdgeInterpolationAlgorithm::KARNEY.to_string(), "KARNEY");
}
+ #[test]
+ fn test_from_str_edge_algo() {
+ assert_eq!(
+ "spHErical".parse::<EdgeInterpolationAlgorithm>().unwrap(),
+ EdgeInterpolationAlgorithm::SPHERICAL
+ );
+ assert_eq!(
+ "vinceNTY".parse::<EdgeInterpolationAlgorithm>().unwrap(),
+ EdgeInterpolationAlgorithm::VINCENTY
+ );
+ assert_eq!(
+ "tHOmas".parse::<EdgeInterpolationAlgorithm>().unwrap(),
+ EdgeInterpolationAlgorithm::THOMAS
+ );
+ assert_eq!(
+ "anDOYEr".parse::<EdgeInterpolationAlgorithm>().unwrap(),
+ EdgeInterpolationAlgorithm::ANDOYER
+ );
+ assert_eq!(
+ "kaRNey".parse::<EdgeInterpolationAlgorithm>().unwrap(),
+ EdgeInterpolationAlgorithm::KARNEY
+ );
+ assert!(
+ "does not exist"
+ .parse::<EdgeInterpolationAlgorithm>()
+ .is_err()
+ );
+ }
+
fn encodings_roundtrip(mut encodings: Vec<Encoding>) {
encodings.sort();
let mask = EncodingMask::new_from_encodings(encodings.iter());
diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs
index b8c623a2a8..4f449df920 100644
--- a/parquet/tests/geospatial.rs
+++ b/parquet/tests/geospatial.rs
@@ -15,133 +15,145 @@
// specific language governing permissions and limitations
// under the License.
-//! Tests for Geometry and Geography logical types
-
-use parquet::{
- basic::{EdgeInterpolationAlgorithm, LogicalType},
- file::{
- metadata::ParquetMetaData,
- reader::{FileReader, SerializedFileReader},
- },
- geospatial::bounding_box::BoundingBox,
-};
-use serde_json::Value;
-use std::fs::File;
-
-fn read_metadata(geospatial_test_file: &str) -> ParquetMetaData {
- let path = format!(
- "{}/geospatial/{geospatial_test_file}",
- arrow::util::test_util::parquet_test_data(),
- );
- let file = File::open(path).unwrap();
- let reader = SerializedFileReader::try_from(file).unwrap();
- reader.metadata().clone()
-}
-
-#[test]
-fn test_read_logical_type() {
- // Some crs values are short strings
- let expected_logical_type = [
- ("crs-default.parquet", LogicalType::Geometry { crs: None }),
- (
- "crs-srid.parquet",
- LogicalType::Geometry {
- crs: Some("srid:5070".to_string()),
- },
- ),
- (
- "crs-projjson.parquet",
- LogicalType::Geometry {
- crs: Some("projjson:projjson_epsg_5070".to_string()),
- },
- ),
- (
- "crs-geography.parquet",
- LogicalType::Geography {
- crs: None,
- algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
- },
- ),
- ];
-
- for (geospatial_file, expected_type) in expected_logical_type {
- let metadata = read_metadata(geospatial_file);
- let column_descr = metadata.file_metadata().schema_descr().column(1);
- let logical_type = column_descr.logical_type_ref().unwrap();
-
- assert_eq!(logical_type, &expected_type);
- }
-
- // The crs value may also contain arbitrary values (in this case some JSON
- // a bit too lengthy to type out)
- let metadata = read_metadata("crs-arbitrary-value.parquet");
- let column_descr = metadata.file_metadata().schema_descr().column(1);
- let logical_type = column_descr.logical_type_ref().unwrap();
-
- if let LogicalType::Geometry { crs } = logical_type {
- let crs = crs.as_ref();
- let crs_parsed: Value = serde_json::from_str(crs.unwrap()).unwrap();
- assert_eq!(crs_parsed.get("id").unwrap().get("code").unwrap(), 5070);
- } else {
- panic!("Expected geometry type but got {logical_type:?}");
- }
-}
-
-#[test]
-fn test_read_geospatial_statistics() {
- let metadata = read_metadata("geospatial.parquet");
-
- // geospatial.parquet schema:
- // optional binary field_id=-1 group (String);
- // optional binary field_id=-1 wkt (String);
- // optional binary field_id=-1 geometry (Geometry(crs=));
- let fields = metadata.file_metadata().schema().get_fields();
- let logical_type = fields[2].get_basic_info().logical_type_ref().unwrap();
- assert_eq!(logical_type, &LogicalType::Geometry { crs: None });
-
- let geo_statistics = metadata.row_group(0).column(2).geo_statistics();
- assert!(geo_statistics.is_some());
-
- let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0)
- .with_zrange(30.0, 80.0)
- .with_mrange(200.0, 1600.0);
- let expected_geospatial_types = vec![
- 1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 2001,
2002, 2003, 2004,
- 2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007,
- ];
- assert_eq!(
- geo_statistics.unwrap().geospatial_types(),
- Some(&expected_geospatial_types)
- );
- assert_eq!(geo_statistics.unwrap().bounding_box(), Some(&expected_bbox));
-}
-
#[cfg(all(feature = "arrow", feature = "geospatial"))]
mod test {
//! Tests for Geometry and Geography logical types that require the arrow
//! and/or geospatial features enabled
- use super::*;
-
- use std::{iter::zip, sync::Arc};
+ use std::{fs::File, iter::zip, sync::Arc};
use arrow_array::{ArrayRef, BinaryArray, RecordBatch, create_array};
- use arrow_schema::{DataType, Field, Schema};
+ use arrow_schema::{DataType, Field, Schema, SchemaRef,
extension::ExtensionType as _};
use bytes::Bytes;
use parquet::{
- arrow::{ArrowWriter, arrow_writer::ArrowWriterOptions},
+ arrow::{
+ ArrowSchemaConverter, ArrowWriter,
arrow_reader::ParquetRecordBatchReaderBuilder,
+ arrow_writer::ArrowWriterOptions,
+ },
+ basic::{EdgeInterpolationAlgorithm, LogicalType},
column::reader::ColumnReader,
data_type::{ByteArray, ByteArrayType},
file::{
- metadata::RowGroupMetaData,
+ metadata::{ParquetMetaData, RowGroupMetaData},
properties::{EnabledStatistics, WriterProperties},
- reader::FileReader,
+ reader::{FileReader, SerializedFileReader},
writer::SerializedFileWriter,
},
- geospatial::statistics::GeospatialStatistics,
- schema::types::{SchemaDescriptor, Type},
+ geospatial::{bounding_box::BoundingBox,
statistics::GeospatialStatistics},
+ schema::types::SchemaDescriptor,
};
- use parquet_geospatial::testing::wkb_point_xy;
+ use parquet_geospatial::{WkbEdges, WkbMetadata, WkbType,
testing::wkb_point_xy};
+ use serde_json::Value;
+
+ fn read_metadata(geospatial_test_file: &str) -> (Arc<ParquetMetaData>,
SchemaRef) {
+ let path = format!(
+ "{}/geospatial/{geospatial_test_file}",
+ arrow::util::test_util::parquet_test_data(),
+ );
+ let file = File::open(path).unwrap();
+ let reader = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+
+ (reader.metadata().clone(), reader.schema().clone())
+ }
+
+ #[test]
+ fn test_read_logical_type() {
+ // Some crs values are short strings
+ let expected_metadata = [
+ (
+ "crs-default.parquet",
+ LogicalType::Geometry { crs: None },
+ WkbMetadata::new(None, None),
+ ),
+ (
+ "crs-srid.parquet",
+ LogicalType::Geometry {
+ crs: Some("srid:5070".to_string()),
+ },
+ WkbMetadata::new(Some("srid:5070"), None),
+ ),
+ (
+ "crs-projjson.parquet",
+ LogicalType::Geometry {
+ crs: Some("projjson:projjson_epsg_5070".to_string()),
+ },
+ WkbMetadata::new(Some("projjson:projjson_epsg_5070"), None),
+ ),
+ (
+ "crs-geography.parquet",
+ LogicalType::Geography {
+ crs: None,
+ algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
+ },
+ WkbMetadata::new(None, Some(WkbEdges::Spherical)),
+ ),
+ ];
+
+ for (geospatial_file, expected_type, expected_field_meta) in
expected_metadata {
+ let (metadata, schema) = read_metadata(geospatial_file);
+ let column_descr =
metadata.file_metadata().schema_descr().column(1);
+ let logical_type = column_descr.logical_type_ref().unwrap();
+
+ assert_eq!(logical_type, &expected_type);
+
+ let field = schema.field(1);
+ let wkb_type = field.try_extension_type::<WkbType>().unwrap();
+
+ assert_eq!(wkb_type.metadata().crs, expected_field_meta.crs);
+ assert_eq!(wkb_type.metadata().algorithm,
expected_field_meta.algorithm);
+ }
+
+ // The crs value may also contain arbitrary values (in this case some
JSON
+ // a bit too lengthy to type out)
+ let (metadata, schema) = read_metadata("crs-arbitrary-value.parquet");
+ let column_descr = metadata.file_metadata().schema_descr().column(1);
+ let logical_type = column_descr.logical_type_ref().unwrap();
+
+ if let LogicalType::Geometry { crs } = logical_type {
+ let crs = crs.as_ref();
+ let crs_parsed: Value =
serde_json::from_str(crs.unwrap()).unwrap();
+ assert_eq!(crs_parsed.get("id").unwrap().get("code").unwrap(),
5070);
+ } else {
+ panic!("Expected geometry type but got {logical_type:?}");
+ }
+
+ let field = schema.field(1);
+ let wkb_type = field.try_extension_type::<WkbType>().unwrap();
+ assert_eq!(
+ wkb_type.metadata().crs.as_ref().unwrap()["id"]["code"],
+ 5070
+ );
+ assert_eq!(wkb_type.metadata().algorithm, None);
+ }
+
+ #[test]
+ fn test_read_geospatial_statistics() {
+ let (metadata, _) = read_metadata("geospatial.parquet");
+
+ // geospatial.parquet schema:
+ // optional binary field_id=-1 group (String);
+ // optional binary field_id=-1 wkt (String);
+ // optional binary field_id=-1 geometry (Geometry(crs=));
+ let fields = metadata.file_metadata().schema().get_fields();
+ let logical_type =
fields[2].get_basic_info().logical_type_ref().unwrap();
+ assert_eq!(logical_type, &LogicalType::Geometry { crs: None });
+
+ let geo_statistics = metadata.row_group(0).column(2).geo_statistics();
+ assert!(geo_statistics.is_some());
+
+ let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0)
+ .with_zrange(30.0, 80.0)
+ .with_mrange(200.0, 1600.0);
+ let expected_geospatial_types = vec![
+ 1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
2001, 2002, 2003, 2004,
+ 2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007,
+ ];
+ assert_eq!(
+ geo_statistics.unwrap().geospatial_types(),
+ Some(&expected_geospatial_types)
+ );
+ assert_eq!(geo_statistics.unwrap().bounding_box(),
Some(&expected_bbox));
+ }
fn read_row_group_metadata(b: Bytes) -> Vec<RowGroupMetaData> {
let reader = SerializedFileReader::new(b).unwrap();
@@ -177,8 +189,7 @@ mod test {
None,
];
- let root = parquet_schema_geometry();
- let schema = SchemaDescriptor::new(root.into());
+ let schema = parquet_schema_geometry();
let props = WriterProperties::builder()
.set_statistics_enabled(EnabledStatistics::Chunk)
.build();
@@ -267,9 +278,7 @@ mod test {
None,
];
- let root = parquet_schema_geometry();
- let schema = SchemaDescriptor::new(root.into());
-
+ let schema = parquet_schema_geometry();
let props = WriterProperties::builder()
.set_statistics_enabled(EnabledStatistics::Chunk)
.build();
@@ -360,8 +369,7 @@ mod test {
let mut values = Vec::new();
let mut def_levels = Vec::new();
- let root = parquet_schema_geometry();
- let schema = SchemaDescriptor::new(root.into());
+ let schema = parquet_schema_geometry();
let props = WriterProperties::builder()
.set_statistics_enabled(EnabledStatistics::Chunk)
.build();
@@ -406,17 +414,14 @@ mod test {
}
}
- fn parquet_schema_geometry() -> Type {
- Type::group_type_builder("root")
- .with_fields(vec![
- Type::primitive_type_builder("geo",
parquet::basic::Type::BYTE_ARRAY)
- .with_logical_type(Some(LogicalType::Geometry { crs: None
}))
- .build()
- .unwrap()
- .into(),
- ])
- .build()
- .unwrap()
+ fn parquet_schema_geometry() -> SchemaDescriptor {
+ let wkb_meta = WkbMetadata::new(None, None);
+ let wkb_type = WkbType::new(Some(wkb_meta));
+
+ let field = Field::new("geo", DataType::Binary,
true).with_extension_type(wkb_type);
+ let schema = Schema::new(vec![field]);
+
+ ArrowSchemaConverter::new().convert(&schema).unwrap()
}
fn wkb_array_xy(coords: impl IntoIterator<Item = Option<(f64, f64)>>) ->
ArrayRef {