paleolimbot commented on code in PR #240:
URL: https://github.com/apache/parquet-format/pull/240#discussion_r1622564106
##########
src/main/thrift/parquet.thrift:
##########
@@ -237,36 +237,97 @@ struct SizeStatistics {
3: optional list<i64> definition_level_histogram;
}
+/**
+ * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge
+ * between points represent a straight cartesian line or the shortest line on
+ * the sphere.
+ */
+enum Edges {
+ PLANAR = 0;
+ SPHERICAL = 1;
+}
+
+/**
+ * A custom WKB-encoded geometry data to be used in geometry statistics.
+ * The geometry may be a polygon to encode an s2 or h3 covering to provide
+ * vendor-agnostic coverings, or an evelope of geometries when a bounding
+ * box cannot be built (e.g. a geometry has spherical edges, or if an edge
+ * of geographic coordinates crosses the antimeridian).
+ */
+struct Geometry {
+ /** Bytes of a WKB-encoded geometry */
+ 1: required binary geometry;
+ /**
+ * Edges of the geometry if it is a polygon. It may be different to the
+ * edges attribute from the GEOMETRY logical type.
+ */
+ 2: optional Edges edges;
+}
+
/**
* Bounding box of geometries in the representation of min/max value pair of
* coordinates from each axis. Values of Z and M are omitted for 2D geometries.
*/
struct BoundingBox {
- 1: optional double x_min;
- 2: optional double x_max;
- 3: optional double y_min;
- 4: optional double y_max;
- 5: optional double z_min;
- 6: optional double z_max;
- 7: optional double m_min;
- 8: optional double m_max;
+ 1: required double xmin;
+ 2: required double xmax;
+ 3: required double ymin;
+ 4: required double ymax;
+ 5: optional double zmin;
+ 6: optional double zmax;
+ 7: optional double mmin;
+ 8: optional double mmax;
}
-/** Statistics specific to GEOMETRY logical type */
-struct GeometryStatistics {
- /** Bounding box of geometries */
- 1: optional BoundingBox bbox;
+union Envelope {
+ 1: BoundingBox bbox // A bounding box of geometries if it can be built.
+ 2: Geometry covering // A covering polygon of geometries if bbox is
unavailable.
+}
+
+/** S2 spatial index: http://s2geometry.io/ */
+struct S2Index {
+ /** Level of S2 cell ids. valid range is [0, 30] */
+ 1: required i32 level;
Review Comment:
It may be worth removing this since it's no longer in the statistics?
Also: To my knowledge, `level` is embedded in the S2/H3 cell ID, and not all
cell IDs in a covering need to have the same level (this is a useful property).
##########
src/main/thrift/parquet.thrift:
##########
@@ -237,36 +237,97 @@ struct SizeStatistics {
3: optional list<i64> definition_level_histogram;
}
+/**
+ * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge
+ * between points represent a straight cartesian line or the shortest line on
+ * the sphere.
+ */
+enum Edges {
+ PLANAR = 0;
+ SPHERICAL = 1;
+}
+
+/**
+ * A custom WKB-encoded geometry data to be used in geometry statistics.
+ * The geometry may be a polygon to encode an s2 or h3 covering to provide
+ * vendor-agnostic coverings, or an evelope of geometries when a bounding
+ * box cannot be built (e.g. a geometry has spherical edges, or if an edge
+ * of geographic coordinates crosses the antimeridian).
+ */
+struct Geometry {
+ /** Bytes of a WKB-encoded geometry */
+ 1: required binary geometry;
+ /**
+ * Edges of the geometry if it is a polygon. It may be different to the
+ * edges attribute from the GEOMETRY logical type.
+ */
+ 2: optional Edges edges;
+}
+
/**
* Bounding box of geometries in the representation of min/max value pair of
* coordinates from each axis. Values of Z and M are omitted for 2D geometries.
*/
struct BoundingBox {
- 1: optional double x_min;
- 2: optional double x_max;
- 3: optional double y_min;
- 4: optional double y_max;
- 5: optional double z_min;
- 6: optional double z_max;
- 7: optional double m_min;
- 8: optional double m_max;
+ 1: required double xmin;
+ 2: required double xmax;
+ 3: required double ymin;
+ 4: required double ymax;
+ 5: optional double zmin;
+ 6: optional double zmax;
+ 7: optional double mmin;
+ 8: optional double mmax;
}
-/** Statistics specific to GEOMETRY logical type */
-struct GeometryStatistics {
- /** Bounding box of geometries */
- 1: optional BoundingBox bbox;
+union Envelope {
+ 1: BoundingBox bbox // A bounding box of geometries if it can be built.
+ 2: Geometry covering // A covering polygon of geometries if bbox is
unavailable.
+}
Review Comment:
To avoid overloading various terms that are used elsewhere in the geospatial
world, you probably want:
```
struct Covering {
optional BoundingBox bbox // A bounding box of geometries if it can be
built.
optional Geometry covering // A covering polygon of geometries if bbox
is unavailable.
}
```
Unless there's some technical limitation that providing only one of these is
required, maybe use a `struct`?
(The term `Envelope` is defined somewhere in the simple features/SQL spec as
very specifically the POLYGON representation of the bounding box)
##########
src/main/thrift/parquet.thrift:
##########
@@ -237,36 +237,97 @@ struct SizeStatistics {
3: optional list<i64> definition_level_histogram;
}
+/**
+ * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge
+ * between points represent a straight cartesian line or the shortest line on
+ * the sphere.
+ */
+enum Edges {
+ PLANAR = 0;
+ SPHERICAL = 1;
+}
+
+/**
+ * A custom WKB-encoded geometry data to be used in geometry statistics.
+ * The geometry may be a polygon to encode an s2 or h3 covering to provide
+ * vendor-agnostic coverings, or an evelope of geometries when a bounding
+ * box cannot be built (e.g. a geometry has spherical edges, or if an edge
+ * of geographic coordinates crosses the antimeridian).
+ */
+struct Geometry {
+ /** Bytes of a WKB-encoded geometry */
+ 1: required binary geometry;
+ /**
+ * Edges of the geometry if it is a polygon. It may be different to the
+ * edges attribute from the GEOMETRY logical type.
+ */
+ 2: optional Edges edges;
+}
+
/**
* Bounding box of geometries in the representation of min/max value pair of
* coordinates from each axis. Values of Z and M are omitted for 2D geometries.
*/
struct BoundingBox {
- 1: optional double x_min;
- 2: optional double x_max;
- 3: optional double y_min;
- 4: optional double y_max;
- 5: optional double z_min;
- 6: optional double z_max;
- 7: optional double m_min;
- 8: optional double m_max;
+ 1: required double xmin;
+ 2: required double xmax;
+ 3: required double ymin;
+ 4: required double ymax;
+ 5: optional double zmin;
+ 6: optional double zmax;
+ 7: optional double mmin;
+ 8: optional double mmax;
}
-/** Statistics specific to GEOMETRY logical type */
-struct GeometryStatistics {
- /** Bounding box of geometries */
- 1: optional BoundingBox bbox;
+union Envelope {
+ 1: BoundingBox bbox // A bounding box of geometries if it can be built.
+ 2: Geometry covering // A covering polygon of geometries if bbox is
unavailable.
+}
Review Comment:
Also, if the `optional Geometry covering` is controversial we can drop it 🙂
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]