paleolimbot commented on code in PR #8801:
URL: https://github.com/apache/arrow-rs/pull/8801#discussion_r2501504161


##########
parquet-geospatial/src/crs.rs:
##########
@@ -0,0 +1,97 @@
+use std::{collections::HashMap, sync::Arc};
+
+use arrow_schema::{Schema, SchemaBuilder};
+use serde_json::{Value, json};
+
+#[derive(Debug)]
+pub enum Crs {
+    Projjson(serde_json::Value),
+    Srid(u64),
+    Other(String),
+}
+
+impl Crs {
+    // TODO: make fallible
+    fn try_from_parquet_str(crs: &str, metadata: &HashMap<String, String>) -> 
Self {
+        let de: Value = serde_json::from_str(crs).unwrap();
+
+        // A CRS that does not exist or is empty defaults to 4326
+        // TODO: http link to parquet geospatial doc
+        let Some(crs) = de["crs"].as_str() else {
+            return Crs::Srid(4326);
+        };

Review Comment:
   This would be clearer as `Crs::LonLat` (neither GeoArrow nor Parquet 
specifications guarantee that an SRID is an EPSG code).



##########
parquet/src/arrow/schema/extension.rs:
##########
@@ -133,3 +146,41 @@ pub(crate) fn logical_type_for_string(field: &Field) -> 
Option<LogicalType> {
 pub(crate) fn logical_type_for_string(_field: &Field) -> Option<LogicalType> {
     Some(LogicalType::String)
 }
+
+#[cfg(feature = "geospatial")]
+pub(crate) fn logical_type_for_binary(field: &Field) -> Option<LogicalType> {
+    use parquet_geospatial::{GeographyType, GeometryType};
+
+    match field.extension_type_name() {
+        Some(n) if n == GeometryType::NAME => match 
field.try_extension_type::<GeometryType>() {
+            Ok(GeometryType) => Some(LogicalType::Geometry { crs: todo!() }),
+            Err(_e) => None,
+        },
+        Some(n) if n == GeographyType::NAME => match 
field.try_extension_type::<GeographyType>() {
+            Ok(GeographyType) => Some(LogicalType::Geography {
+                crs: todo!(),
+                algorithm: todo!(),
+            }),
+            Err(_e) => None,
+        },
+        _ => return None,
+    };
+
+    None
+}
+
+#[cfg(not(feature = "geospatial"))]
+pub(crate) fn logical_type_for_binary(field: &Field) -> Option<LogicalType> {
+    None
+}
+
+#[cfg(feature = "geospatial")]
+pub(crate) fn logical_type_for_binary_view(field: &Field) -> 
Option<LogicalType> {
+    // TODO: make this better-er
+    logical_type_for_binary(field)
+}
+
+#[cfg(not(feature = "geospatial"))]
+pub(crate) fn logical_type_for_binary_view(field: &Field) -> 
Option<LogicalType> {
+    None
+}

Review Comment:
   I'm not sure anybody is using `large_binary` on purpose, but if it's easy to 
do it may be worth supporting that here.



##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -76,16 +77,18 @@ pub(crate) fn parquet_to_arrow_schema_and_fields(
     key_value_metadata: Option<&Vec<KeyValue>>,
 ) -> Result<(Schema, Option<ParquetField>)> {
     let mut metadata = 
parse_key_value_metadata(key_value_metadata).unwrap_or_default();
-    let maybe_schema = metadata
+    let mut maybe_schema = metadata
         .remove(super::ARROW_SCHEMA_META_KEY)
         .map(|value| get_arrow_schema_from_metadata(&value))
         .transpose()?;
 
     // Add the Arrow metadata to the Parquet metadata skipping keys that 
collide
-    if let Some(arrow_schema) = &maybe_schema {
+    if let Some(arrow_schema) = maybe_schema.as_mut() {
         arrow_schema.metadata().iter().for_each(|(k, v)| {
             metadata.entry(k.clone()).or_insert_with(|| v.clone());
         });
+        #[cfg(feature = "geospatial")]
+        parquet_geospatial::crs::parquet_to_arrow(arrow_schema, &metadata)

Review Comment:
   I'm not qualified to comment on the elegantness/level of hack (it doesn't 
look out of place to me, but I'm new here!). In the Arrow C++ implementation I 
had to modify the signatures of a few functions to pass the key/value metadata 
reference down into the type conversions. In C++ we have overloads so it wasn't 
a breaking change to modify the signatures and nobody objected. If it's 
reasonable to support it would be great to make it happen while we're all here 
looking at this.



##########
parquet-geospatial/src/crs.rs:
##########
@@ -0,0 +1,97 @@
+use std::{collections::HashMap, sync::Arc};
+
+use arrow_schema::{Schema, SchemaBuilder};
+use serde_json::{Value, json};
+
+#[derive(Debug)]
+pub enum Crs {
+    Projjson(serde_json::Value),
+    Srid(u64),
+    Other(String),
+}
+
+impl Crs {
+    // TODO: make fallible
+    fn try_from_parquet_str(crs: &str, metadata: &HashMap<String, String>) -> 
Self {
+        let de: Value = serde_json::from_str(crs).unwrap();
+
+        // A CRS that does not exist or is empty defaults to 4326
+        // TODO: http link to parquet geospatial doc
+        let Some(crs) = de["crs"].as_str() else {
+            return Crs::Srid(4326);
+        };
+
+        if let Some(key) = crs.strip_prefix("projjson:") {
+            let Some(proj_meta) = metadata.get(key) else {
+                panic!("Failed to find key in meta: {:?}", metadata)
+            };
+
+            Self::Projjson(serde_json::from_str(proj_meta).unwrap())
+        } else if let Some(srid) = crs.strip_prefix("srid:") {
+            Self::Srid(srid.parse().unwrap())
+        } else {
+            if crs.is_empty() {
+                return Self::Srid(4326);
+            }
+
+            Self::Other(crs.to_owned())
+        }
+    }
+
+    // TODO: make fallible
+    pub(super) fn try_from_arrow_str(crs: &str) -> Self {
+        let de: Value = serde_json::from_str(crs).unwrap();
+
+        let crs = match de["crs"] {
+            Value::Null => panic!("CRS must be specified. Inputs crs {crs}"),
+            _ => &de["crs"],
+        };
+
+        match de["crs_type"].as_str() {
+            Some("projjson") => Crs::Projjson(crs.clone()),
+            Some("srid") => 
Crs::Srid(crs.as_number().unwrap().as_u64().unwrap()),
+            _ => Crs::Other(crs.to_string())

Review Comment:
   Because the `crs_type` is optional (more recently added to the GeoArrow 
spec), it's common to have the crs be PROJJSON without the `crs_type` letting 
us know it's the case. There is also a fun case where GeoPandas exported JSON 
as an escaped JSON string for a few versions (it's been fixed now but it was 
fresh when I did the Arrow C++ implementation so I handle it there).



##########
parquet-geospatial/src/types.rs:
##########
@@ -0,0 +1,87 @@
+use arrow_schema::{Schema, SchemaBuilder, extension::ExtensionType};
+
+use crate::crs::Crs;
+
+pub struct Geometry {
+    crs: Crs,
+}
+
+impl Geometry {

Review Comment:
   It will probably help you out more to have just one `struct Wkb { crs: Crs, 
edges: String }` that implements `ExtensionType`. Then your 
`try_extension_type::<Wkb>()` below will parse the edges for you and let you 
choose geometry or geography for the Parquet type.



##########
parquet-geospatial/src/crs.rs:
##########
@@ -0,0 +1,97 @@
+use std::{collections::HashMap, sync::Arc};
+
+use arrow_schema::{Schema, SchemaBuilder};
+use serde_json::{Value, json};
+
+#[derive(Debug)]
+pub enum Crs {
+    Projjson(serde_json::Value),
+    Srid(u64),
+    Other(String),
+}

Review Comment:
   It may help to clarify if this is intended to be a GeoArrow CRS or a Parquet 
CRS (it seems like this is intended to match the Parquet CRS model)



##########
parquet/src/arrow/schema/extension.rs:
##########
@@ -133,3 +146,41 @@ pub(crate) fn logical_type_for_string(field: &Field) -> 
Option<LogicalType> {
 pub(crate) fn logical_type_for_string(_field: &Field) -> Option<LogicalType> {
     Some(LogicalType::String)
 }
+
+#[cfg(feature = "geospatial")]
+pub(crate) fn logical_type_for_binary(field: &Field) -> Option<LogicalType> {
+    use parquet_geospatial::{GeographyType, GeometryType};
+
+    match field.extension_type_name() {
+        Some(n) if n == GeometryType::NAME => match 
field.try_extension_type::<GeometryType>() {
+            Ok(GeometryType) => Some(LogicalType::Geometry { crs: todo!() }),
+            Err(_e) => None,
+        },
+        Some(n) if n == GeographyType::NAME => match 
field.try_extension_type::<GeographyType>() {

Review Comment:
   As I'm sure you've figured out, there's an awkward situation here where we 
have one `::NAME` (`geoarrow.wkb` and two Parquet logical types. You can figure 
this out from parsing the `"edges"` field of the JSON in the 
`"ARROW:extension:metadata"` metadata key (if it's not present or it's 
`"planar"`, it's Geometry, otherwise the algorithm name can be mapped to the 
enum in the Parquet crate).



##########
parquet-geospatial/src/crs.rs:
##########
@@ -0,0 +1,97 @@
+use std::{collections::HashMap, sync::Arc};
+
+use arrow_schema::{Schema, SchemaBuilder};
+use serde_json::{Value, json};
+
+#[derive(Debug)]
+pub enum Crs {

Review Comment:
   I wonder if `struct Crs { value: String, type: Option<String> }` might help 
unify this between Parquet and GeoArrow. It's hard to abstract Crses and I'm 
not sure the `enum` here has made the conversion code more clear.



##########
parquet-geospatial/src/crs.rs:
##########
@@ -0,0 +1,97 @@
+use std::{collections::HashMap, sync::Arc};
+
+use arrow_schema::{Schema, SchemaBuilder};
+use serde_json::{Value, json};
+
+#[derive(Debug)]
+pub enum Crs {
+    Projjson(serde_json::Value),
+    Srid(u64),
+    Other(String),
+}
+
+impl Crs {
+    // TODO: make fallible
+    fn try_from_parquet_str(crs: &str, metadata: &HashMap<String, String>) -> 
Self {
+        let de: Value = serde_json::from_str(crs).unwrap();
+
+        // A CRS that does not exist or is empty defaults to 4326
+        // TODO: http link to parquet geospatial doc
+        let Some(crs) = de["crs"].as_str() else {
+            return Crs::Srid(4326);
+        };
+
+        if let Some(key) = crs.strip_prefix("projjson:") {
+            let Some(proj_meta) = metadata.get(key) else {
+                panic!("Failed to find key in meta: {:?}", metadata)
+            };
+
+            Self::Projjson(serde_json::from_str(proj_meta).unwrap())
+        } else if let Some(srid) = crs.strip_prefix("srid:") {
+            Self::Srid(srid.parse().unwrap())
+        } else {

Review Comment:
   An important case to handle here is the one where the `crs` can be parsed as 
JSON (`Self::Projjson(...)`), which is how most CRSes will arrive here with 
current implementations.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to