wgtmac commented on code in PR #2971:
URL: https://github.com/apache/parquet-java/pull/2971#discussion_r1720600186


##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/BoundingBox.java:
##########
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import org.apache.parquet.Preconditions;
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.geom.Geometry;
+
+public class BoundingBox {
+
+  private double xMin = Double.POSITIVE_INFINITY;
+  private double xMax = Double.NEGATIVE_INFINITY;
+  private double yMin = Double.POSITIVE_INFINITY;
+  private double yMax = Double.NEGATIVE_INFINITY;
+  private double zMin = Double.POSITIVE_INFINITY;
+  private double zMax = Double.NEGATIVE_INFINITY;
+  private double mMin = Double.POSITIVE_INFINITY;
+  private double mMax = Double.NEGATIVE_INFINITY;
+
+  public BoundingBox(
+      double xMin, double xMax, double yMin, double yMax, double zMin, double 
zMax, double mMin, double mMax) {
+    this.xMin = xMin;
+    this.xMax = xMax;
+    this.yMin = yMin;
+    this.yMax = yMax;
+    this.zMin = zMin;
+    this.zMax = zMax;
+    this.mMin = mMin;
+    this.mMax = mMax;
+  }
+
+  public BoundingBox() {}
+
+  public double getXMin() {
+    return xMin;
+  }
+
+  public double getXMax() {
+    return xMax;
+  }
+
+  public double getYMin() {
+    return yMin;
+  }
+
+  public double getYMax() {
+    return yMax;
+  }
+
+  public double getZMin() {
+    return zMin;
+  }
+
+  public double getZMax() {
+    return zMax;
+  }
+
+  public double getMMin() {
+    return mMin;
+  }
+
+  public double getMMax() {
+    return mMax;
+  }
+
+  void update(double minX, double maxX, double minY, double maxY, double minZ, 
double maxZ) {
+    xMin = Math.min(xMin, minX);
+    yMin = Math.min(yMin, minY);
+    xMax = Math.max(xMax, maxX);
+    yMax = Math.max(yMax, maxY);
+    zMin = Math.min(zMin, minZ);
+    zMax = Math.max(zMax, maxZ);
+  }
+
+  void update(Geometry geometry) {
+    GeometryUtils.normalizeLongitude(geometry);
+    Envelope envelope = geometry.getEnvelopeInternal();
+    double minX = envelope.getMinX();
+    double minY = envelope.getMinY();
+    double maxX = envelope.getMaxX();
+    double maxY = envelope.getMaxY();
+
+    // JTS (Java Topology Suite) does not handle Z-coordinates directly in the 
Envelope class
+    // because it's primarily used for 2D geometries. However, we can iterate 
through the
+    // coordinates of the geometry to find the minimum and maximum Z values.
+    double minZ = Double.POSITIVE_INFINITY;
+    double maxZ = Double.NEGATIVE_INFINITY;
+
+    Coordinate[] coordinates = geometry.getCoordinates();
+    for (Coordinate coord : coordinates) {
+      if (!Double.isNaN(coord.getZ())) {
+        // Update zMin and zMax by iterating through the coordinates.
+        minZ = Math.min(minZ, coord.getZ());
+        maxZ = Math.max(maxZ, coord.getZ());
+      }
+    }
+
+    update(minX, maxX, minY, maxY, minZ, maxZ);
+  }
+
+  // Method to merge a Geometry object into this bounding box
+  public void merge(Geometry geometry) {
+    Preconditions.checkArgument(geometry != null, "Cannot merge with null 
geometry");
+    GeometryUtils.normalizeLongitude(geometry);

Review Comment:
   This is redundant because it will be called in `update(geometry)` again.



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/Covering.java:
##########
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import java.nio.ByteBuffer;
+import org.apache.parquet.Preconditions;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBReader;
+
+public class Covering {
+  public static final String DEFAULT_COVERING_KIND = "WKB";
+
+  protected final String kind;
+  protected ByteBuffer value;
+
+  public Covering(ByteBuffer value, String kind) {
+    Preconditions.checkArgument(kind != null, "kind cannot be null");
+    Preconditions.checkArgument(kind.equalsIgnoreCase(DEFAULT_COVERING_KIND), 
"kind only accepts WKB");
+    Preconditions.checkArgument(value != null, "value cannot be null");
+    this.value = value;
+    this.kind = kind;
+  }
+
+  public ByteBuffer getValue() {
+    return value;
+  }
+
+  public String getKind() {
+    return kind;
+  }
+
+  void update(Geometry geom) {
+    throw new UnsupportedOperationException(
+        "Update is not supported for " + this.getClass().getSimpleName());
+  }
+
+  public void merge(Covering other) {
+    throw new UnsupportedOperationException(
+        "Merge is not supported for " + this.getClass().getSimpleName());
+  }
+
+  public void reset() {
+    throw new UnsupportedOperationException(
+        "Reset is not supported for " + this.getClass().getSimpleName());
+  }
+
+  public void abort() {
+    throw new UnsupportedOperationException(
+        "Abort is not supported for " + this.getClass().getSimpleName());
+  }
+
+  public Covering copy() {
+    return new Covering(value.duplicate(), kind);
+  }
+
+  @Override
+  public String toString() {

Review Comment:
   Originally the spec was designed to support WKB encoding only, so I 
implemented `toString()` here by assuming the binary data is WKB. Now we need 
to move it to the subclass as it may accept other encoding kind in the future.



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/EnvelopeCovering.java:
##########
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.geom.GeometryFactory;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBReader;
+import org.locationtech.jts.io.WKBWriter;
+
+public class EnvelopeCovering extends Covering {
+
+  // The POC only supports EPSG:3857 and EPSG:4326 at the moment
+  private static final List<String> SUPPORTED_CRS = Arrays.asList("EPSG:3857", 
"EPSG:4326");
+
+  private static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[0]);
+  private final WKBReader reader = new WKBReader();
+  private final WKBWriter writer = new WKBWriter();
+  private final GeometryFactory factory = new GeometryFactory();
+  private final LogicalTypeAnnotation.Edges edges;
+  private final String crs;
+
+  public EnvelopeCovering(LogicalTypeAnnotation.Edges edges, String crs) {
+    super(EMPTY, DEFAULT_COVERING_KIND);
+    this.edges = edges;
+    this.crs = crs;
+    validateSupportedCrs(crs);

Review Comment:
   Is it better to disable building the covering for unsupported CRS instead of 
throwing exception? The covering is an optional feature any way. We can add a 
warning log in this case.



##########
parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java:
##########
@@ -534,9 +584,8 @@ private void addRowGroup(
     int columnOrdinal = -1;
     ByteArrayOutputStream tempOutStream = null;
     for (ColumnChunkMetaData columnMetaData : columns) {
-      // There is no ColumnMetaData written after the chunk data, so set the 
ColumnChunk
-      // file_offset to 0
-      ColumnChunk columnChunk = new ColumnChunk(0);
+      ColumnChunk columnChunk =

Review Comment:
   It seems that you have not rebased on the latest master branch. Please 
revert these two lines.



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/BoundingBox.java:
##########
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import org.apache.parquet.Preconditions;
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.geom.Geometry;
+
+public class BoundingBox {
+
+  private double xMin = Double.POSITIVE_INFINITY;
+  private double xMax = Double.NEGATIVE_INFINITY;
+  private double yMin = Double.POSITIVE_INFINITY;
+  private double yMax = Double.NEGATIVE_INFINITY;
+  private double zMin = Double.POSITIVE_INFINITY;
+  private double zMax = Double.NEGATIVE_INFINITY;
+  private double mMin = Double.POSITIVE_INFINITY;
+  private double mMax = Double.NEGATIVE_INFINITY;
+
+  public BoundingBox(
+      double xMin, double xMax, double yMin, double yMax, double zMin, double 
zMax, double mMin, double mMax) {
+    this.xMin = xMin;
+    this.xMax = xMax;
+    this.yMin = yMin;
+    this.yMax = yMax;
+    this.zMin = zMin;
+    this.zMax = zMax;
+    this.mMin = mMin;
+    this.mMax = mMax;
+  }
+
+  public BoundingBox() {}
+
+  public double getXMin() {
+    return xMin;
+  }
+
+  public double getXMax() {
+    return xMax;
+  }
+
+  public double getYMin() {
+    return yMin;
+  }
+
+  public double getYMax() {
+    return yMax;
+  }
+
+  public double getZMin() {
+    return zMin;
+  }
+
+  public double getZMax() {
+    return zMax;
+  }
+
+  public double getMMin() {
+    return mMin;
+  }
+
+  public double getMMax() {
+    return mMax;
+  }
+
+  void update(double minX, double maxX, double minY, double maxY, double minZ, 
double maxZ) {
+    xMin = Math.min(xMin, minX);
+    yMin = Math.min(yMin, minY);
+    xMax = Math.max(xMax, maxX);
+    yMax = Math.max(yMax, maxY);
+    zMin = Math.min(zMin, minZ);
+    zMax = Math.max(zMax, maxZ);
+  }
+
+  void update(Geometry geometry) {
+    GeometryUtils.normalizeLongitude(geometry);
+    Envelope envelope = geometry.getEnvelopeInternal();
+    double minX = envelope.getMinX();
+    double minY = envelope.getMinY();
+    double maxX = envelope.getMaxX();
+    double maxY = envelope.getMaxY();
+
+    // JTS (Java Topology Suite) does not handle Z-coordinates directly in the 
Envelope class
+    // because it's primarily used for 2D geometries. However, we can iterate 
through the
+    // coordinates of the geometry to find the minimum and maximum Z values.
+    double minZ = Double.POSITIVE_INFINITY;
+    double maxZ = Double.NEGATIVE_INFINITY;
+
+    Coordinate[] coordinates = geometry.getCoordinates();
+    for (Coordinate coord : coordinates) {
+      if (!Double.isNaN(coord.getZ())) {
+        // Update zMin and zMax by iterating through the coordinates.
+        minZ = Math.min(minZ, coord.getZ());
+        maxZ = Math.max(maxZ, coord.getZ());
+      }
+    }
+
+    update(minX, maxX, minY, maxY, minZ, maxZ);
+  }
+
+  // Method to merge a Geometry object into this bounding box
+  public void merge(Geometry geometry) {

Review Comment:
   ```suggestion
     void merge(Geometry geometry) {
   ```
   
   Please do not expose method with JTS Geometry as public API.



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/EnvelopeCovering.java:
##########
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.geom.GeometryFactory;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBReader;
+import org.locationtech.jts.io.WKBWriter;
+
+public class EnvelopeCovering extends Covering {
+
+  // The POC only supports EPSG:3857 and EPSG:4326 at the moment
+  private static final List<String> SUPPORTED_CRS = Arrays.asList("EPSG:3857", 
"EPSG:4326");
+
+  private static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[0]);
+  private final WKBReader reader = new WKBReader();
+  private final WKBWriter writer = new WKBWriter();
+  private final GeometryFactory factory = new GeometryFactory();
+  private final LogicalTypeAnnotation.Edges edges;
+  private final String crs;
+
+  public EnvelopeCovering(LogicalTypeAnnotation.Edges edges, String crs) {
+    super(EMPTY, DEFAULT_COVERING_KIND);
+    this.edges = edges;
+    this.crs = crs;
+    validateSupportedCrs(crs);
+  }
+
+  private void validateSupportedCrs(String crs) {

Review Comment:
   nit: make it static



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/GeometryStatistics.java:
##########
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBReader;
+
+public class GeometryStatistics {
+
+  // Metadata that may impact the statistics calculation
+  private final LogicalTypeAnnotation.Edges edges;
+  private final String crs;
+  private final ByteBuffer metadata;
+
+  private final BoundingBox boundingBox;
+  private final List<Covering> coverings;
+  private final GeometryTypes geometryTypes;
+  private final WKBReader reader = new WKBReader();
+
+  public GeometryStatistics(
+      LogicalTypeAnnotation.Edges edges,
+      String crs,
+      ByteBuffer metadata,
+      BoundingBox boundingBox,
+      List<Covering> coverings,
+      GeometryTypes geometryTypes) {
+    this.edges = edges;
+    this.crs = crs;
+    this.metadata = metadata;
+    this.boundingBox = supportsBoundingBox() ? boundingBox : null;
+    this.coverings = supportsCovering() ? coverings : null;
+    this.geometryTypes = geometryTypes;
+  }
+
+  public GeometryStatistics(LogicalTypeAnnotation.Edges edges, String crs, 
ByteBuffer metadata) {
+    this(
+        edges,
+        crs,
+        metadata,
+        new BoundingBox(),
+        Arrays.asList(new EnvelopeCovering(edges, crs)),
+        new GeometryTypes());
+  }
+
+  public BoundingBox getBoundingBox() {
+    return boundingBox;
+  }
+
+  public List<Covering> getCoverings() {
+    return coverings;
+  }
+
+  public GeometryTypes getGeometryTypes() {
+    return geometryTypes;
+  }
+
+  public void update(Binary value) {
+    if (value == null) {
+      return;
+    }
+    try {
+      Geometry geom = reader.read(value.getBytes());
+      update(geom);
+    } catch (ParseException e) {
+      abort();
+    }
+  }
+
+  private void update(Geometry geom) {
+    if (supportsBoundingBox()) {
+      boundingBox.update(geom);
+    }
+    if (supportsCovering()) {
+      coverings.stream().forEach(c -> c.update(geom));
+    }
+    geometryTypes.update(geom);
+  }
+
+  /**
+   * A bounding box is a rectangular region defined by two points, the lower 
left
+   * and upper right corners. It is used to represent the minimum and maximum
+   * coordinates of a geometry. Only planar geometries can have a bounding box.
+   */
+  private boolean supportsBoundingBox() {
+    // Only planar geometries can have a bounding box
+    // based on the current specification
+    return edges == LogicalTypeAnnotation.Edges.PLANAR;
+  }
+
+  /**
+   * A custom WKB-encoded polygon or multi-polygon to represent a covering of
+   * geometries. For example, it may be a bounding box, or an envelope of 
geometries
+   * when a bounding box cannot be built (e.g. a geometry has spherical edges, 
or if
+   * an edge of geographic coordinates crosses the antimeridian). In addition, 
it can
+   * also be used to provide vendor-agnostic coverings like S2 or H3 grids.
+   */
+  private boolean supportsCovering() {
+    // This version (POC) assumes only build coverings for planar edges
+    // In case of spherical edges, no coverings are built
+    //     return edges == LogicalTypeAnnotation.Edges.PLANAR;
+    return true;
+  }
+
+  public void merge(GeometryStatistics other) {
+    Preconditions.checkArgument(other != null, "Cannot merge with null 
GeometryStatistics");
+    Preconditions.checkArgument(coverings.size() == other.coverings.size(), 
"Coverings size must be the same");
+
+    boundingBox.merge(other.boundingBox);
+    for (int i = 0; i < coverings.size(); i++) {
+      coverings.get(i).merge(other.coverings.get(i));

Review Comment:
   This is hacky. Should we organize the coverings as `Map<String, Covering>` 
whose key is the encoding kind?
   
   The `merge` method will be used by `ParquetRewriter` which merges several 
different parquet files into a single one. Those parquet files may be written 
by other parquet implementations so we cannot assume these coverings are in 
order.



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/Covering.java:
##########
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import java.nio.ByteBuffer;
+import org.apache.parquet.Preconditions;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBReader;
+
+public class Covering {
+  public static final String DEFAULT_COVERING_KIND = "WKB";
+
+  protected final String kind;
+  protected ByteBuffer value;
+
+  public Covering(ByteBuffer value, String kind) {
+    Preconditions.checkArgument(kind != null, "kind cannot be null");
+    Preconditions.checkArgument(kind.equalsIgnoreCase(DEFAULT_COVERING_KIND), 
"kind only accepts WKB");

Review Comment:
   Is this better to move this check to the subclass?



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/GeometryUtils.java:
##########
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.column.statistics.geometry;
+
+import org.locationtech.jts.geom.CoordinateSequence;
+import org.locationtech.jts.geom.CoordinateSequenceFilter;
+import org.locationtech.jts.geom.Geometry;
+
+public class GeometryUtils {

Review Comment:
   Should we make it package private? We have to guarantee API compatibility so 
it is good to control the visibility as much as possible.



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/BoundingBox.java:
##########
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import org.apache.parquet.Preconditions;
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.geom.Geometry;
+
+public class BoundingBox {
+
+  private double xMin = Double.POSITIVE_INFINITY;
+  private double xMax = Double.NEGATIVE_INFINITY;
+  private double yMin = Double.POSITIVE_INFINITY;
+  private double yMax = Double.NEGATIVE_INFINITY;
+  private double zMin = Double.POSITIVE_INFINITY;
+  private double zMax = Double.NEGATIVE_INFINITY;
+  private double mMin = Double.POSITIVE_INFINITY;
+  private double mMax = Double.NEGATIVE_INFINITY;
+
+  public BoundingBox(
+      double xMin, double xMax, double yMin, double yMax, double zMin, double 
zMax, double mMin, double mMax) {
+    this.xMin = xMin;
+    this.xMax = xMax;
+    this.yMin = yMin;
+    this.yMax = yMax;
+    this.zMin = zMin;
+    this.zMax = zMax;
+    this.mMin = mMin;
+    this.mMax = mMax;
+  }
+
+  public BoundingBox() {}
+
+  public double getXMin() {
+    return xMin;
+  }
+
+  public double getXMax() {
+    return xMax;
+  }
+
+  public double getYMin() {
+    return yMin;
+  }
+
+  public double getYMax() {
+    return yMax;
+  }
+
+  public double getZMin() {
+    return zMin;
+  }
+
+  public double getZMax() {
+    return zMax;
+  }
+
+  public double getMMin() {
+    return mMin;
+  }
+
+  public double getMMax() {
+    return mMax;
+  }
+
+  void update(double minX, double maxX, double minY, double maxY, double minZ, 
double maxZ) {
+    xMin = Math.min(xMin, minX);
+    yMin = Math.min(yMin, minY);
+    xMax = Math.max(xMax, maxX);
+    yMax = Math.max(yMax, maxY);
+    zMin = Math.min(zMin, minZ);
+    zMax = Math.max(zMax, maxZ);
+  }
+
+  void update(Geometry geometry) {

Review Comment:
   nit: add a comment to say that `geometry` will be changed by this method



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/GeometryStatistics.java:
##########
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBReader;
+
+public class GeometryStatistics {
+
+  // Metadata that may impact the statistics calculation
+  private final LogicalTypeAnnotation.Edges edges;
+  private final String crs;
+  private final ByteBuffer metadata;
+
+  private final BoundingBox boundingBox;
+  private final List<Covering> coverings;
+  private final GeometryTypes geometryTypes;
+  private final WKBReader reader = new WKBReader();
+
+  public GeometryStatistics(
+      LogicalTypeAnnotation.Edges edges,
+      String crs,
+      ByteBuffer metadata,
+      BoundingBox boundingBox,
+      List<Covering> coverings,
+      GeometryTypes geometryTypes) {
+    this.edges = edges;
+    this.crs = crs;
+    this.metadata = metadata;
+    this.boundingBox = supportsBoundingBox() ? boundingBox : null;
+    this.coverings = supportsCovering() ? coverings : null;
+    this.geometryTypes = geometryTypes;
+  }
+
+  public GeometryStatistics(LogicalTypeAnnotation.Edges edges, String crs, 
ByteBuffer metadata) {
+    this(
+        edges,
+        crs,
+        metadata,
+        new BoundingBox(),
+        Arrays.asList(new EnvelopeCovering(edges, crs)),
+        new GeometryTypes());
+  }
+
+  public BoundingBox getBoundingBox() {
+    return boundingBox;
+  }
+
+  public List<Covering> getCoverings() {
+    return coverings;
+  }
+
+  public GeometryTypes getGeometryTypes() {
+    return geometryTypes;
+  }
+
+  public void update(Binary value) {
+    if (value == null) {
+      return;
+    }
+    try {
+      Geometry geom = reader.read(value.getBytes());
+      update(geom);
+    } catch (ParseException e) {
+      abort();
+    }
+  }
+
+  private void update(Geometry geom) {
+    if (supportsBoundingBox()) {
+      boundingBox.update(geom);
+    }
+    if (supportsCovering()) {
+      coverings.stream().forEach(c -> c.update(geom));
+    }
+    geometryTypes.update(geom);
+  }
+
+  /**
+   * A bounding box is a rectangular region defined by two points, the lower 
left
+   * and upper right corners. It is used to represent the minimum and maximum
+   * coordinates of a geometry. Only planar geometries can have a bounding box.
+   */
+  private boolean supportsBoundingBox() {
+    // Only planar geometries can have a bounding box
+    // based on the current specification
+    return edges == LogicalTypeAnnotation.Edges.PLANAR;
+  }
+
+  /**
+   * A custom WKB-encoded polygon or multi-polygon to represent a covering of
+   * geometries. For example, it may be a bounding box, or an envelope of 
geometries
+   * when a bounding box cannot be built (e.g. a geometry has spherical edges, 
or if
+   * an edge of geographic coordinates crosses the antimeridian). In addition, 
it can
+   * also be used to provide vendor-agnostic coverings like S2 or H3 grids.
+   */
+  private boolean supportsCovering() {
+    // This version (POC) assumes only build coverings for planar edges
+    // In case of spherical edges, no coverings are built
+    //     return edges == LogicalTypeAnnotation.Edges.PLANAR;
+    return true;

Review Comment:
   As I have commented in the `EnvelopeCovering`, we'd better disable building 
covering for spherical edges. Therefore we cannot always return true here.



##########
parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/GeometryStatistics.java:
##########
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.statistics.geometry;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBReader;
+
+public class GeometryStatistics {
+
+  // Metadata that may impact the statistics calculation
+  private final LogicalTypeAnnotation.Edges edges;
+  private final String crs;
+  private final ByteBuffer metadata;
+
+  private final BoundingBox boundingBox;
+  private final List<Covering> coverings;
+  private final GeometryTypes geometryTypes;
+  private final WKBReader reader = new WKBReader();
+
+  public GeometryStatistics(
+      LogicalTypeAnnotation.Edges edges,
+      String crs,
+      ByteBuffer metadata,
+      BoundingBox boundingBox,
+      List<Covering> coverings,
+      GeometryTypes geometryTypes) {
+    this.edges = edges;
+    this.crs = crs;
+    this.metadata = metadata;
+    this.boundingBox = supportsBoundingBox() ? boundingBox : null;
+    this.coverings = supportsCovering() ? coverings : null;
+    this.geometryTypes = geometryTypes;
+  }
+
+  public GeometryStatistics(LogicalTypeAnnotation.Edges edges, String crs, 
ByteBuffer metadata) {
+    this(
+        edges,
+        crs,
+        metadata,
+        new BoundingBox(),
+        Arrays.asList(new EnvelopeCovering(edges, crs)),
+        new GeometryTypes());
+  }
+
+  public BoundingBox getBoundingBox() {
+    return boundingBox;
+  }
+
+  public List<Covering> getCoverings() {
+    return coverings;
+  }
+
+  public GeometryTypes getGeometryTypes() {
+    return geometryTypes;
+  }
+
+  public void update(Binary value) {
+    if (value == null) {
+      return;
+    }
+    try {
+      Geometry geom = reader.read(value.getBytes());
+      update(geom);
+    } catch (ParseException e) {
+      abort();
+    }
+  }
+
+  private void update(Geometry geom) {
+    if (supportsBoundingBox()) {
+      boundingBox.update(geom);
+    }
+    if (supportsCovering()) {
+      coverings.stream().forEach(c -> c.update(geom));
+    }
+    geometryTypes.update(geom);
+  }
+
+  /**
+   * A bounding box is a rectangular region defined by two points, the lower 
left
+   * and upper right corners. It is used to represent the minimum and maximum
+   * coordinates of a geometry. Only planar geometries can have a bounding box.
+   */
+  private boolean supportsBoundingBox() {
+    // Only planar geometries can have a bounding box
+    // based on the current specification
+    return edges == LogicalTypeAnnotation.Edges.PLANAR;
+  }
+
+  /**
+   * A custom WKB-encoded polygon or multi-polygon to represent a covering of
+   * geometries. For example, it may be a bounding box, or an envelope of 
geometries
+   * when a bounding box cannot be built (e.g. a geometry has spherical edges, 
or if
+   * an edge of geographic coordinates crosses the antimeridian). In addition, 
it can
+   * also be used to provide vendor-agnostic coverings like S2 or H3 grids.
+   */
+  private boolean supportsCovering() {
+    // This version (POC) assumes only build coverings for planar edges
+    // In case of spherical edges, no coverings are built
+    //     return edges == LogicalTypeAnnotation.Edges.PLANAR;
+    return true;
+  }
+
+  public void merge(GeometryStatistics other) {
+    Preconditions.checkArgument(other != null, "Cannot merge with null 
GeometryStatistics");
+    Preconditions.checkArgument(coverings.size() == other.coverings.size(), 
"Coverings size must be the same");
+
+    boundingBox.merge(other.boundingBox);
+    for (int i = 0; i < coverings.size(); i++) {
+      coverings.get(i).merge(other.coverings.get(i));
+    }
+    geometryTypes.merge(other.geometryTypes);
+  }
+
+  public void reset() {
+    boundingBox.reset();
+    coverings.stream().forEach(c -> c.reset());
+    geometryTypes.reset();
+  }
+
+  public void abort() {
+    boundingBox.abort();
+    coverings.stream().forEach(c -> c.abort());
+    geometryTypes.abort();
+  }
+
+  // Copy the statistics
+  public GeometryStatistics copy() {
+    return new GeometryStatistics(
+        edges,
+        crs,
+        metadata,
+        boundingBox != null ? boundingBox.copy() : null,
+        coverings != null ? coverings : null,
+        geometryTypes != null ? geometryTypes.copy() : null);
+  }
+
+  @Override
+  public String toString() {
+    return "GeometryStatistics{" + "boundingBox="
+        + boundingBox + ", coverings="
+        + coverings + ", geometryTypes="

Review Comment:
   The default toString() for `coverings` is not human-readable. We need to 
print them one by one.



##########
parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestGeometryTypeRoundTrip.java:
##########
@@ -0,0 +1,235 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ */
+package org.apache.parquet.statistics;
+
+import static org.apache.parquet.schema.LogicalTypeAnnotation.geometryType;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
+
+import java.io.*;
+import java.nio.file.Path;
+import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.column.statistics.BinaryStatistics;
+import org.apache.parquet.column.statistics.geometry.GeometryStatistics;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.example.data.GroupFactory;
+import org.apache.parquet.example.data.simple.SimpleGroupFactory;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.example.ExampleParquetWriter;
+import org.apache.parquet.hadoop.example.GroupWriteSupport;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.internal.column.columnindex.ColumnIndex;
+import org.apache.parquet.io.LocalInputFile;
+import org.apache.parquet.io.LocalOutputFile;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.LogicalTypeAnnotation.Edges;
+import org.apache.parquet.schema.LogicalTypeAnnotation.GeometryEncoding;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.Types;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.GeometryFactory;
+import org.locationtech.jts.io.WKBWriter;
+
+public class TestGeometryTypeRoundTrip {
+
+  @Rule
+  public TemporaryFolder temp = new TemporaryFolder();
+
+  private Path newTempPath() throws IOException {
+    File file = temp.newFile();
+    Preconditions.checkArgument(file.delete(), "Could not remove temp file");
+    return file.toPath();
+  }
+
+  @Test
+  public void testEPSG4326BasicReadWriteGeometryValue() throws Exception {

Review Comment:
   Thanks for adding these tests! 
   
   I think we are missing tests in following cases:
   - verify geometry type metadata is well preserved.
   - verify all kinds of geometry stats are preserved, including bbox, covering 
and geometry types.
   - verify geo stats in the column index have been generated.
   
   I can do these later.



##########
parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java:
##########
@@ -1091,6 +1113,128 @@ public int hashCode() {
     }
   }
 
+  /**
+   * Allowed for physical type: BYTE_ARRAY.
+   *
+   * Well-known binary (WKB) representations of geometries. It supports 2D or
+   * 3D geometries of the standard geometry types (Point, LineString, Polygon,
+   * MultiPoint, MultiLineString, MultiPolygon, and GeometryCollection). This
+   * is the preferred option for maximum portability.
+   *
+   * This encoding enables GeometryStatistics to be set in the column chunk
+   * and page index.
+   */
+  public enum GeometryEncoding {
+    WKB
+  }
+
+  /**
+   * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge
+   * between points represent a straight cartesian line or the shortest line on
+   * the sphere. Please note that it only applies to polygons.
+   */
+  public enum Edges {
+    PLANAR,
+    SPHERICAL
+  }
+
+  public static class GeometryLogicalTypeAnnotation extends 
LogicalTypeAnnotation {
+    public static final String CRS_ENCODING_DEFAULT = "PROJJSON";
+    private final GeometryEncoding encoding;
+    private final Edges edges;
+    private final String crs;
+    private final String crs_encoding;
+    private final ByteBuffer metadata;
+
+    private GeometryLogicalTypeAnnotation(
+        GeometryEncoding encoding, Edges edges, String crs, String 
crs_encoding, ByteBuffer metadata) {
+      Preconditions.checkArgument(encoding != null, "Geometry encoding is 
required");
+      Preconditions.checkArgument(edges != null, "Geometry edges is required");
+      this.encoding = encoding;
+      this.edges = edges;
+      this.crs = crs;
+      this.crs_encoding = crs_encoding;
+      this.metadata = metadata;
+    }
+
+    @Override
+    @Deprecated
+    public OriginalType toOriginalType() {
+      return null;
+    }
+
+    @Override
+    public <T> Optional<T> accept(LogicalTypeAnnotationVisitor<T> 
logicalTypeAnnotationVisitor) {
+      return logicalTypeAnnotationVisitor.visit(this);
+    }
+
+    @Override
+    LogicalTypeToken getType() {
+      return LogicalTypeToken.GEOMETRY;
+    }
+
+    @Override
+    protected String typeParametersAsString() {
+      StringBuilder sb = new StringBuilder();
+      sb.append("(");
+      sb.append(encoding);
+      sb.append(",");
+      sb.append(edges);
+      if (crs != null && !crs.isEmpty()) {
+        sb.append(",");
+        sb.append(crs);
+      }
+      if (metadata != null) {
+        sb.append(",");
+        sb.append(metadata);

Review Comment:
   My original PR got a review comment from @Kontinuation: 
https://github.com/apache/parquet-java/pull/1379/files#r1650324839
   
   Perhaps we should add a TODO item if we do not have a good answer?



##########
parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestGeometryTypeRoundTrip.java:
##########
@@ -0,0 +1,235 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ */
+package org.apache.parquet.statistics;
+
+import static org.apache.parquet.schema.LogicalTypeAnnotation.geometryType;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
+
+import java.io.*;

Review Comment:
   Please avoid import star.



##########
parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java:
##########
@@ -551,7 +600,7 @@ private void addRowGroup(
       ColumnMetaData metaData = new ColumnMetaData(
           getType(columnMetaData.getType()),
           toFormatEncodings(columnMetaData.getEncodings()),
-          columnMetaData.getPath().toList(),
+          Arrays.asList(columnMetaData.getPath().toArray()),

Review Comment:
   ditto



##########
parquet-hadoop/pom.xml:
##########
@@ -203,6 +209,12 @@
       <version>${commons-lang3.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.locationtech.jts</groupId>
+      <artifactId>jts-core</artifactId>
+      <version>${jts.version}</version>
+      <scope>test</scope>

Review Comment:
   Why jts-core has been added twice with different scopes?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to