paleolimbot commented on code in PR #171:
URL: https://github.com/apache/sedona-db/pull/171#discussion_r2404810052


##########
rust/sedona-geometry/src/wkb_header.rs:
##########
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::GeometryTypeId;
+use datafusion_common::error::{DataFusionError, Result};
+use geo_traits::Dimensions;
+use sedona_common::sedona_internal_err;
+
+/// Fast-path WKB header parser
+/// Performs operations lazily and caches them after the first computation
+pub struct WkbHeader<'a> {
+    buf: &'a [u8],
+    geometry_type_id: Option<GeometryTypeId>,
+    dimensions: Option<Dimensions>,
+}
+
+impl<'a> WkbHeader<'a> {
+    /// Creates a new [WkbHeader] from a buffer
+    pub fn new(buf: &'a [u8]) -> Result<Self> {
+        Ok(Self {
+            buf,
+            geometry_type_id: None,
+            dimensions: None,
+        })
+    }
+
+    /// Returns the geometry type id of the WKB by only parsing the header 
instead of the entire WKB
+    /// 1 -> Point
+    /// 2 -> LineString
+    /// 3 -> Polygon
+    /// 4 -> MultiPoint
+    /// 5 -> MultiLineString
+    /// 6 -> MultiPolygon
+    /// 7 -> GeometryCollection
+    ///
+    /// Spec: https://libgeos.org/specifications/wkb/
+    pub fn geometry_type_id(mut self) -> Result<GeometryTypeId> {
+        if self.geometry_type_id.is_none() {
+            self.geometry_type_id = Some(parse_geometry_type_id(self.buf)?);
+        }
+        self.geometry_type_id.ok_or_else(|| {
+            DataFusionError::External("Unexpected internal state in 
WkbHeader".into())
+        })
+    }
+
+    /// Returns the dimension of the WKB by only parsing what's minimally 
necessary instead of the entire WKB
+    pub fn dimension(mut self) -> Result<Dimensions> {
+        // Calculate the dimension if we haven't already
+        if self.dimensions.is_none() {
+            self.dimensions = Some(parse_dimension(self.buf)?);
+        }
+        self.dimensions.ok_or_else(|| {
+            DataFusionError::External("Unexpected internal state in 
WkbHeader".into())
+        })
+    }
+}
+
+fn parse_dimension(buf: &[u8]) -> Result<Dimensions> {
+    if buf.len() < 5 {
+        return sedona_internal_err!("Invalid WKB: buffer too small ({} 
bytes)", buf.len());
+    }
+
+    let byte_order = buf[0];
+
+    let code = match byte_order {
+        0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
+        1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
+        other => return sedona_internal_err!("Unexpected byte order: {other}"),
+    };
+
+    // 0000 -> xy or unspecified
+    // 1000 -> xyz
+    // 2000 -> xym
+    // 3000 -> xyzm
+    match code / 1000 {
+        // If xy, it's possible we need to infer the dimension
+        0 => {}
+        1 => return Ok(Dimensions::Xyz),
+        2 => return Ok(Dimensions::Xym),
+        3 => return Ok(Dimensions::Xyzm),
+        _ => return sedona_internal_err!("Unexpected code: {code}"),
+    };
+
+    // Try to infer dimension
+    // If geometry is a collection (MULTIPOINT, ... GEOMETRYCOLLECTION, code 
4-7), we need to check the dimension of the first geometry
+    if code & 0x7 >= 4 {
+        // The next 4 bytes are the number of geometries in the collection
+        let num_geometries = match byte_order {
+            0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]),
+            1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]),
+            other => return sedona_internal_err!("Unexpected byte order: 
{other}"),
+        };
+        // Check the dimension of the first geometry since they all have to be 
the same dimension
+        // Note: Attempting to create the following geometries error and are 
thus not possible to create:
+        // - Nested geometry dimension doesn't match the **specified** geom 
collection z-dimension
+        //   - GEOMETRYCOLLECTION M (POINT Z (1 1 1))
+        // - Nested geometry doesn't have the specified dimension
+        //   - GEOMETRYCOLLECTION Z (POINT (1 1))
+        // - Nested geometries have different dimensions
+        //   - GEOMETRYCOLLECTION (POINT Z (1 1 1), POINT (1 1))

Review Comment:
   Got it! How about two methods:
   
   - `dimensions(&self)` (top-level dimensions as declared by WKB)
   - `first_coord_dimensions(&self)`
   
   Which one of those you want mostly depends why you're asking if a geometry 
has a Z component or what previous information you have (our implementation of 
st_z, for example, would have no need for the second version).
   
   These are also *both* approximations...there's nothing stopping somebody 
from putting a Z value in in the *second* collection item (does PostGIS only 
check the first one?). Since neither are truly correct I don't think the 
`WkbHeader` should take sides...just provide information. If somebody really 
does need to wrangle badly written data from SQL there are other tools at their 
disposal (`st_dump()` maybe)...if a particular algorithm must know if if there 
are Z values, it should probably check the entire collection.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to