kylebarron commented on code in PR #8414:
URL: https://github.com/apache/arrow-rs/pull/8414#discussion_r2383671532


##########
parquet-geospatial/src/bounding.rs:
##########
@@ -0,0 +1,602 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashSet;
+
+use arrow_schema::ArrowError;
+use geo_traits::{
+    CoordTrait, Dimensions, GeometryCollectionTrait, GeometryTrait, 
GeometryType, LineStringTrait,
+    MultiLineStringTrait, MultiPointTrait, MultiPolygonTrait, PointTrait, 
PolygonTrait,
+};
+use wkb::reader::Wkb;
+
+use crate::interval::{Interval, IntervalTrait, WraparoundInterval};
+
+/// Geometry bounder
+///
+/// Utility to accumulate statistics for geometries as they are written.
+/// This bounder is designed to output statistics accumulated according
+/// to the Parquet specification such that the output can be written to
+/// Parquet statistics with minimal modification.
+///
+/// See the [IntervalTrait] for an in-depth discussion of wraparound bounding
+/// (which adds some complexity to this implementation).
+#[derive(Debug)]
+pub struct GeometryBounder {
+    /// Union of all contiguous x intervals to the left of the wraparound 
midpoint
+    x_left: Interval,
+    /// Union of all contiguous x intervals that intersect the wraparound 
midpoint
+    x_mid: Interval,
+    /// Union of all contiguous x intervals to the right of the wraparound 
midpoint
+    x_right: Interval,
+    /// Union of all y intervals
+    y: Interval,
+    /// Union of all z intervals
+    z: Interval,
+    /// Union of all m intervals
+    m: Interval,
+    /// Unique geometry type codes encountered by the bounder
+    ///
+    /// The integer codes are identical to the ISO WKB geometry type codes and
+    /// are documented as part of the Parquet specification:
+    /// 
<https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types>
+    geometry_types: HashSet<i32>,
+    wraparound_hint: Interval,
+}
+
+impl GeometryBounder {
+    /// Create a new, empty bounder that represents empty input
+    pub fn empty() -> Self {
+        Self {
+            x_left: Interval::empty(),
+            x_mid: Interval::empty(),
+            x_right: Interval::empty(),
+            y: Interval::empty(),
+            z: Interval::empty(),
+            m: Interval::empty(),
+            geometry_types: HashSet::<i32>::default(),
+            wraparound_hint: Interval::empty(),
+        }
+    }
+
+    /// Set the hint to use for generation of potential wraparound xmin/xmax 
output
+    ///
+    /// Usually this value should be set to (-180, 180), as wraparound is 
primarily
+    /// targeted at lon/lat coordinate systems where collections of features 
with
+    /// components at the very far left and very far right of the coordinate 
system
+    /// are actually very close to each other.
+    ///
+    /// It is safe to set this value even when the actual coordinate system of 
the
+    /// input is unknown: if the input has coordinate values that are outside 
the
+    /// range of the wraparound hint, wraparound xmin/xmax values will not be
+    /// generated. If the input has coordinate values that are well inside of 
the
+    /// range of the wraparound hint, the wraparound xmin/xmax value will be
+    /// substantially wider than the non-wraparound version and will not be 
returned.
+    pub fn with_wraparound_hint(self, wraparound_hint: impl Into<Interval>) -> 
Self {
+        Self {
+            wraparound_hint: wraparound_hint.into(),
+            ..self
+        }
+    }
+
+    /// Calculate the final xmin and xmax for geometries encountered by this 
bounder
+    ///
+    /// The interval returned may wraparound if a hint was set and the input
+    /// encountered by this bounder were exclusively at the far left and far 
right
+    /// of the input range. See [IntervalTrait] for an in-depth description of
+    /// wraparound intervals.
+    pub fn x(&self) -> WraparoundInterval {
+        let out_all = Interval::empty()
+            .merge_interval(&self.x_left)
+            .merge_interval(&self.x_mid)
+            .merge_interval(&self.x_right);
+
+        // Check if this even makes sense: if anything is covering the midpoint
+        // of the wraparound hint or the bounds don't make sense for the 
provided
+        // wraparound hint, just return the Cartesian bounds.
+        if !self.x_mid.is_empty() || 
!self.wraparound_hint.contains_interval(&out_all) {
+            return out_all.into();
+        }
+
+        // Check if our wraparound bounds are any better than our Cartesian 
bounds
+        // If the Cartesian bounds are tighter, return them.
+        let out_width = (self.x_left.hi() - self.wraparound_hint.lo())
+            + (self.wraparound_hint.hi() - self.x_right.hi());
+        if out_all.width() < out_width {
+            return out_all.into();
+        }
+
+        // Wraparound!
+        WraparoundInterval::new(self.x_right.lo(), self.x_left.hi())
+    }
+
+    /// Calculate the final ymin and ymax for geometries encountered by this 
bounder
+    pub fn y(&self) -> Interval {
+        self.y
+    }
+
+    /// Calculate the final zmin and zmax for geometries encountered by this 
bounder
+    pub fn z(&self) -> Interval {
+        self.z
+    }
+
+    /// Calculate the final mmin and mmax values for geometries encountered by 
this bounder
+    pub fn m(&self) -> Interval {
+        self.m
+    }
+
+    /// Calculate the final geometry type set
+    ///
+    /// Returns a copy of the unique geometry type/dimension combinations 
encountered
+    /// by this bounder. These identifiers are ISO WKB identifiers (e.g., 1001
+    /// for PointZ). The output is always returned sorted.
+    pub fn geometry_types(&self) -> Vec<i32> {
+        let mut out = self.geometry_types.iter().cloned().collect::<Vec<_>>();

Review Comment:
   ```suggestion
           let mut out = self.geometry_types.iter().copied().collect();
   ```



##########
parquet-geospatial/src/bounding.rs:
##########
@@ -0,0 +1,580 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashSet;
+
+use arrow_schema::ArrowError;
+use geo_traits::{
+    CoordTrait, Dimensions, GeometryCollectionTrait, GeometryTrait, 
GeometryType, LineStringTrait,
+    MultiLineStringTrait, MultiPointTrait, MultiPolygonTrait, PointTrait, 
PolygonTrait,
+};
+use wkb::reader::Wkb;
+
+use crate::interval::{Interval, IntervalTrait, WraparoundInterval};
+
+/// Geometry bounder
+///
+/// Utility to accumulate statistics for geometries as they are written.
+/// This bounder is designed to output statistics accumulated according
+/// to the Parquet specification such that the output can be written to
+/// Parquet statistics with minimal modification.
+#[derive(Debug)]
+pub struct GeometryBounder {
+    x_left: Interval,
+    x_mid: Interval,
+    x_right: Interval,
+    y: Interval,
+    z: Interval,
+    m: Interval,
+    geometry_types: HashSet<i32>,
+    wraparound_hint: Interval,
+}
+
+impl GeometryBounder {
+    /// Create a new, empty bounder that represents empty input
+    pub fn empty() -> Self {
+        Self {
+            x_left: Interval::empty(),
+            x_mid: Interval::empty(),
+            x_right: Interval::empty(),
+            y: Interval::empty(),
+            z: Interval::empty(),
+            m: Interval::empty(),
+            geometry_types: HashSet::<i32>::default(),
+            wraparound_hint: Interval::empty(),
+        }
+    }
+
+    /// Set the hint to use for generation of potential wraparound xmin/xmax 
output
+    ///
+    /// Usually this value should be set to (-180, 180), as wraparound is 
primarily
+    /// targeted at lon/lat coordinate systems where collections of features 
with
+    /// components at the very far left and very far right of the coordinate 
system
+    /// are actually very close to each other.
+    ///
+    /// It is safe to set this value even when the actual coordinate system of 
the
+    /// input is unknown: if the input has coordinate values that are outside 
the
+    /// range of the wraparound hint, wraparound xmin/xmax values will not be
+    /// generated. If the input has coordinate values that are well inside of 
the
+    /// range of the wraparound hint, the wraparound xmin/xmax value will be
+    /// substantially wider than the non-wraparound version and will not be 
returned.
+    pub fn with_wraparound_hint(self, wraparound_hint: impl Into<Interval>) -> 
Self {
+        Self {
+            wraparound_hint: wraparound_hint.into(),
+            ..self
+        }
+    }
+
+    /// Calculate the final xmin and xmax for geometries encountered by this 
bounder
+    ///
+    /// The interval returned may wraparound if a hint was set and the input
+    /// encountered by this bounder were exclusively at the far left and far 
right
+    /// of the input range. See [IntervalTrait] for an in-depth description of
+    /// wraparound intervals.
+    pub fn x(&self) -> WraparoundInterval {
+        let out_all = Interval::empty()
+            .merge_interval(&self.x_left)
+            .merge_interval(&self.x_mid)
+            .merge_interval(&self.x_right);
+
+        // Check if this even makes sense: if anything is covering the midpoint
+        // of the wraparound hint or the bounds don't make sense for the 
provided
+        // wraparound hint, just return the Cartesian bounds.
+        if !self.x_mid.is_empty() || 
!self.wraparound_hint.contains_interval(&out_all) {
+            return out_all.into();
+        }
+
+        // Check if our wraparound bounds are any better than our Cartesian 
bounds
+        // If the Cartesian bounds are tighter, return them.
+        let out_width = (self.x_left.hi() - self.wraparound_hint.lo())
+            + (self.wraparound_hint.hi() - self.x_right.hi());
+        if out_all.width() < out_width {
+            return out_all.into();
+        }
+
+        // Wraparound!
+        WraparoundInterval::new(self.x_right.lo(), self.x_left.hi())
+    }
+
+    /// Calculate the final ymin and ymax for geometries encountered by this 
bounder
+    pub fn y(&self) -> Interval {
+        self.y
+    }
+
+    /// Calculate the final zmin and zmax for geometries encountered by this 
bounder
+    pub fn z(&self) -> Interval {
+        self.z
+    }
+
+    /// Calculate the final mmin and mmax values for geometries encountered by 
this bounder
+    pub fn m(&self) -> Interval {
+        self.m
+    }
+
+    /// Calculate the final geometry type set
+    ///
+    /// Returns a copy of the unique geometry type/dimension combinations 
encountered
+    /// by this bounder. These identifiers are ISO WKB identifiers (e.g., 1001
+    /// for PointZ). The output is always returned sorted.
+    pub fn geometry_types(&self) -> Vec<i32> {
+        let mut out = self.geometry_types.iter().cloned().collect::<Vec<_>>();
+        out.sort();
+        out
+    }
+
+    /// Update this bounder with one WKB-encoded geometry
+    ///
+    /// Parses and accumulates the bounds of one WKB-encoded geometry. This 
function
+    /// will error for invalid WKB input; however, clients may wish to ignore 
such
+    /// an error for the purposes of writing statistics.
+    pub fn update_wkb(&mut self, wkb: &[u8]) -> Result<(), ArrowError> {
+        let wkb = Wkb::try_new(wkb).map_err(|e| 
ArrowError::ExternalError(Box::new(e)))?;
+        self.update_geometry(&wkb)?;
+        Ok(())
+    }
+
+    fn update_geometry(&mut self, geom: &impl GeometryTrait<T = f64>) -> 
Result<(), ArrowError> {
+        let geometry_type = geometry_type(geom)?;
+        self.geometry_types.insert(geometry_type);
+
+        visit_intervals(geom, 'x', &mut |x| self.update_x(&x))?;
+        visit_intervals(geom, 'y', &mut |y| self.y.update_interval(&y))?;
+        visit_intervals(geom, 'z', &mut |z| self.z.update_interval(&z))?;
+        visit_intervals(geom, 'm', &mut |m| self.m.update_interval(&m))?;
+
+        Ok(())
+    }
+
+    fn update_x(&mut self, x: &Interval) {
+        if x.hi() < self.wraparound_hint.mid() {
+            // If the x interval is completely to the left of the midpoint, 
merge it
+            // with x_left
+            self.x_left.update_interval(x);
+        } else if x.lo() > self.wraparound_hint.mid() {
+            // If the x interval is completely to the right of the midpoint, 
merge it
+            // with x_right
+            self.x_right.update_interval(x);
+        } else {
+            // Otherwise, merge it with x_mid
+            self.x_mid.update_interval(x);
+        }
+    }
+}
+
+/// Visit contiguous intervals for a given dimension within a [GeometryTrait]
+///
+/// Here, contiguous intervals refers to intervals that must not be separated
+/// by wraparound bounding. Point components of a geometry are visited as
+/// degenerate intervals of a single value; linestring or polygon ring 
components
+/// are visited as single intervals.
+fn visit_intervals(
+    geom: &impl GeometryTrait<T = f64>,
+    dimension: char,
+    func: &mut impl FnMut(Interval),
+) -> Result<(), ArrowError> {
+    let n = if let Some(n) = dimension_index(geom.dim(), dimension) {
+        n
+    } else {
+        return Ok(());
+    };
+
+    match geom.as_type() {
+        GeometryType::Point(pt) => {
+            if let Some(coord) = PointTrait::coord(pt) {
+                visit_point(coord, n, func);
+            }
+        }
+        GeometryType::LineString(ls) => {
+            visit_sequence(ls.coords(), n, func);
+        }
+        GeometryType::Polygon(pl) => {
+            if let Some(exterior) = pl.exterior() {
+                visit_sequence(exterior.coords(), n, func);
+            }
+
+            for interior in pl.interiors() {
+                visit_sequence(interior.coords(), n, func);
+            }
+        }
+        GeometryType::MultiPoint(multi_pt) => {
+            visit_collection(multi_pt.points(), dimension, func)?;
+        }
+        GeometryType::MultiLineString(multi_ls) => {
+            visit_collection(multi_ls.line_strings(), dimension, func)?;
+        }
+        GeometryType::MultiPolygon(multi_pl) => {
+            visit_collection(multi_pl.polygons(), dimension, func)?;
+        }
+        GeometryType::GeometryCollection(collection) => {
+            visit_collection(collection.geometries(), dimension, func)?;
+        }
+        _ => {
+            return Err(ArrowError::InvalidArgumentError(
+                "GeometryType not supported for dimension bounds".to_string(),
+            ))
+        }
+    }
+
+    Ok(())
+}
+
+/// Visit a point
+///
+/// Points can be separated by wraparound bounding even if they occur within
+/// the same feature, so we visit them as individual degenerate intervals.
+fn visit_point(coord: impl CoordTrait<T = f64>, n: usize, func: &mut impl 
FnMut(Interval)) {
+    let val = unsafe { coord.nth_unchecked(n) };
+    func((val, val).into());
+}
+
+/// Visit contiguous sequences
+///
+/// Sequences (e.g., linestrings or polygon rings) must always be considered
+/// together (i.e., are never separated by wraparound bounding).
+fn visit_sequence(
+    coords: impl IntoIterator<Item = impl CoordTrait<T = f64>>,
+    n: usize,
+    func: &mut impl FnMut(Interval),
+) {
+    let mut interval = Interval::empty();
+    for coord in coords {
+        interval.update_value(unsafe { coord.nth_unchecked(n) });
+    }
+
+    func(interval);
+}
+
+/// Visit intervals in a collection of geometries
+fn visit_collection(
+    collection: impl IntoIterator<Item = impl GeometryTrait<T = f64>>,
+    target: char,
+    func: &mut impl FnMut(Interval),
+) -> Result<(), ArrowError> {
+    for geom in collection {
+        visit_intervals(&geom, target, func)?;
+    }
+
+    Ok(())
+}
+
+fn geometry_type(geom: &impl GeometryTrait<T = f64>) -> Result<i32, 
ArrowError> {

Review Comment:
   It would be _nice_ to have an enum here. I get that it could make 
inter-crate workings more complex, so we can revisit it in the future



##########
parquet-geospatial/src/bounding.rs:
##########
@@ -0,0 +1,602 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashSet;
+
+use arrow_schema::ArrowError;
+use geo_traits::{
+    CoordTrait, Dimensions, GeometryCollectionTrait, GeometryTrait, 
GeometryType, LineStringTrait,
+    MultiLineStringTrait, MultiPointTrait, MultiPolygonTrait, PointTrait, 
PolygonTrait,
+};
+use wkb::reader::Wkb;
+
+use crate::interval::{Interval, IntervalTrait, WraparoundInterval};
+
+/// Geometry bounder
+///
+/// Utility to accumulate statistics for geometries as they are written.
+/// This bounder is designed to output statistics accumulated according
+/// to the Parquet specification such that the output can be written to
+/// Parquet statistics with minimal modification.
+///
+/// See the [IntervalTrait] for an in-depth discussion of wraparound bounding
+/// (which adds some complexity to this implementation).
+#[derive(Debug)]
+pub struct GeometryBounder {
+    /// Union of all contiguous x intervals to the left of the wraparound 
midpoint
+    x_left: Interval,
+    /// Union of all contiguous x intervals that intersect the wraparound 
midpoint
+    x_mid: Interval,
+    /// Union of all contiguous x intervals to the right of the wraparound 
midpoint
+    x_right: Interval,
+    /// Union of all y intervals
+    y: Interval,
+    /// Union of all z intervals
+    z: Interval,
+    /// Union of all m intervals
+    m: Interval,
+    /// Unique geometry type codes encountered by the bounder
+    ///
+    /// The integer codes are identical to the ISO WKB geometry type codes and
+    /// are documented as part of the Parquet specification:
+    /// 
<https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types>
+    geometry_types: HashSet<i32>,
+    wraparound_hint: Interval,
+}
+
+impl GeometryBounder {
+    /// Create a new, empty bounder that represents empty input
+    pub fn empty() -> Self {
+        Self {
+            x_left: Interval::empty(),
+            x_mid: Interval::empty(),
+            x_right: Interval::empty(),
+            y: Interval::empty(),
+            z: Interval::empty(),
+            m: Interval::empty(),
+            geometry_types: HashSet::<i32>::default(),
+            wraparound_hint: Interval::empty(),
+        }
+    }
+
+    /// Set the hint to use for generation of potential wraparound xmin/xmax 
output
+    ///
+    /// Usually this value should be set to (-180, 180), as wraparound is 
primarily
+    /// targeted at lon/lat coordinate systems where collections of features 
with
+    /// components at the very far left and very far right of the coordinate 
system
+    /// are actually very close to each other.
+    ///
+    /// It is safe to set this value even when the actual coordinate system of 
the
+    /// input is unknown: if the input has coordinate values that are outside 
the
+    /// range of the wraparound hint, wraparound xmin/xmax values will not be
+    /// generated. If the input has coordinate values that are well inside of 
the
+    /// range of the wraparound hint, the wraparound xmin/xmax value will be
+    /// substantially wider than the non-wraparound version and will not be 
returned.
+    pub fn with_wraparound_hint(self, wraparound_hint: impl Into<Interval>) -> 
Self {
+        Self {
+            wraparound_hint: wraparound_hint.into(),
+            ..self
+        }
+    }
+
+    /// Calculate the final xmin and xmax for geometries encountered by this 
bounder
+    ///
+    /// The interval returned may wraparound if a hint was set and the input
+    /// encountered by this bounder were exclusively at the far left and far 
right
+    /// of the input range. See [IntervalTrait] for an in-depth description of
+    /// wraparound intervals.
+    pub fn x(&self) -> WraparoundInterval {
+        let out_all = Interval::empty()
+            .merge_interval(&self.x_left)
+            .merge_interval(&self.x_mid)
+            .merge_interval(&self.x_right);
+
+        // Check if this even makes sense: if anything is covering the midpoint
+        // of the wraparound hint or the bounds don't make sense for the 
provided
+        // wraparound hint, just return the Cartesian bounds.
+        if !self.x_mid.is_empty() || 
!self.wraparound_hint.contains_interval(&out_all) {
+            return out_all.into();
+        }
+
+        // Check if our wraparound bounds are any better than our Cartesian 
bounds
+        // If the Cartesian bounds are tighter, return them.
+        let out_width = (self.x_left.hi() - self.wraparound_hint.lo())
+            + (self.wraparound_hint.hi() - self.x_right.hi());
+        if out_all.width() < out_width {
+            return out_all.into();
+        }
+
+        // Wraparound!
+        WraparoundInterval::new(self.x_right.lo(), self.x_left.hi())
+    }
+
+    /// Calculate the final ymin and ymax for geometries encountered by this 
bounder
+    pub fn y(&self) -> Interval {
+        self.y
+    }
+
+    /// Calculate the final zmin and zmax for geometries encountered by this 
bounder
+    pub fn z(&self) -> Interval {
+        self.z
+    }
+
+    /// Calculate the final mmin and mmax values for geometries encountered by 
this bounder
+    pub fn m(&self) -> Interval {
+        self.m
+    }
+
+    /// Calculate the final geometry type set
+    ///
+    /// Returns a copy of the unique geometry type/dimension combinations 
encountered
+    /// by this bounder. These identifiers are ISO WKB identifiers (e.g., 1001
+    /// for PointZ). The output is always returned sorted.
+    pub fn geometry_types(&self) -> Vec<i32> {
+        let mut out = self.geometry_types.iter().cloned().collect::<Vec<_>>();
+        out.sort();
+        out
+    }
+
+    /// Update this bounder with one WKB-encoded geometry

Review Comment:
   Question for the future: will it ever make sense to be able to update the 
bounder before values have been encoded to WKB? It makes sense that it's 
simplest at this step of the process to intercept the WKB, though it's a small 
amount of overhead to have to parse the unaligned floats here, right? Maybe 
that amount of overhead isn't worth optimizing



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to