Copilot commented on code in PR #2921:
URL: https://github.com/apache/sedona/pull/2921#discussion_r3199421646
##########
spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala:
##########
@@ -1021,6 +1021,53 @@ class geoparquetIOTests extends TestBaseScala with
BeforeAndAfterAll {
}
}
+ it("GeoParquet supports writing covering metadata from a Box2D column") {
+ // User-provided Box2D column referenced via the geoparquet.covering
option.
+ val df = sparkSession
+ .range(0, 100)
+ .toDF("id")
+ .withColumn("id", expr("CAST(id AS DOUBLE)"))
+ .withColumn("geometry", expr("ST_Point(id, id + 1)"))
+ .withColumn("test_cov", expr("ST_Box2D(geometry)"))
+ val geoParquetSavePath = geoparquetoutputlocation +
"/gp_with_box2d_covering.parquet"
+ df.write
+ .format("geoparquet")
+ .option("geoparquet.covering.geometry", "test_cov")
+ .mode("overwrite")
+ .save(geoParquetSavePath)
Review Comment:
Both tests write to fixed, hard-coded output paths under
`geoparquetoutputlocation`. If the test runner executes specs concurrently (or
if prior runs leave artifacts), this can cause intermittent collisions despite
`mode(\"overwrite\")` (e.g., Windows file locking, concurrent overwrite, or
reader/writer overlap). Use a per-test unique directory (UUID / test name +
timestamp) and ensure it’s cleaned up after the assertion to keep the suite
robust under parallelism and retries.
##########
spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala:
##########
@@ -236,6 +237,14 @@ object GeoParquetMetaData {
schema(coveringColumnIndex).dataType match {
case coveringColumnType: StructType =>
coveringColumnTypeToCovering(coveringColumnName, coveringColumnType)
+ case _: Box2DUDT =>
+ // Box2DUDT exposes a struct<xmin, ymin, xmax, ymax: double> sqlType,
which is the exact
+ // shape required by GeoParquet 1.1 bbox covering columns. Treat the
underlying struct as
+ // the covering struct so users can write a Box2D column and have it
referenced as a
+ // covering column in GeoParquet metadata without any manual struct
construction.
+ coveringColumnTypeToCovering(
+ coveringColumnName,
+ new Box2DUDT().sqlType.asInstanceOf[StructType])
case _ =>
Review Comment:
This creates a new `Box2DUDT` instance and relies on
`asInstanceOf[StructType]`. Since the match already knows the column is a
`Box2DUDT`, prefer binding the instance (e.g., `case udt: Box2DUDT =>`) and
using `udt.sqlType` with a safe match to `StructType`, which avoids unnecessary
allocation and prevents a potential runtime `ClassCastException` if the UDT’s
`sqlType` ever changes. Consider generalizing this path to any
`UserDefinedType` whose `sqlType` is a `StructType` to avoid future one-off
special cases.
##########
spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala:
##########
@@ -1021,6 +1021,53 @@ class geoparquetIOTests extends TestBaseScala with
BeforeAndAfterAll {
}
}
+ it("GeoParquet supports writing covering metadata from a Box2D column") {
+ // User-provided Box2D column referenced via the geoparquet.covering
option.
+ val df = sparkSession
+ .range(0, 100)
+ .toDF("id")
+ .withColumn("id", expr("CAST(id AS DOUBLE)"))
+ .withColumn("geometry", expr("ST_Point(id, id + 1)"))
+ .withColumn("test_cov", expr("ST_Box2D(geometry)"))
+ val geoParquetSavePath = geoparquetoutputlocation +
"/gp_with_box2d_covering.parquet"
+ df.write
+ .format("geoparquet")
+ .option("geoparquet.covering.geometry", "test_cov")
+ .mode("overwrite")
+ .save(geoParquetSavePath)
+ validateGeoParquetMetadata(geoParquetSavePath) { geo =>
+ implicit val formats: org.json4s.Formats = org.json4s.DefaultFormats
+ val coveringJsValue = geo \ "columns" \ "geometry" \ "covering"
+ val covering = coveringJsValue.extract[Covering]
+ assert(covering.bbox.xmin == Seq("test_cov", "xmin"))
+ assert(covering.bbox.ymin == Seq("test_cov", "ymin"))
+ assert(covering.bbox.xmax == Seq("test_cov", "xmax"))
+ assert(covering.bbox.ymax == Seq("test_cov", "ymax"))
+ }
+ }
+
+ it("GeoParquet auto populates covering metadata for a Box2D <geom>_bbox
column") {
+ // Auto-detect path: when a column named <geom>_bbox is a Box2D, reuse
it as the
+ // covering column instead of synthesizing a separate float64 struct.
+ val df = sparkSession
+ .range(0, 100)
+ .toDF("id")
+ .withColumn("id", expr("CAST(id AS DOUBLE)"))
+ .withColumn("geometry", expr("ST_Point(id, id + 1)"))
+ .withColumn("geometry_bbox", expr("ST_Box2D(geometry)"))
+ val geoParquetSavePath = geoparquetoutputlocation +
"/gp_box2d_auto_covering.parquet"
+ df.write.format("geoparquet").mode("overwrite").save(geoParquetSavePath)
Review Comment:
Both tests write to fixed, hard-coded output paths under
`geoparquetoutputlocation`. If the test runner executes specs concurrently (or
if prior runs leave artifacts), this can cause intermittent collisions despite
`mode(\"overwrite\")` (e.g., Windows file locking, concurrent overwrite, or
reader/writer overlap). Use a per-test unique directory (UUID / test name +
timestamp) and ensure it’s cleaned up after the assertion to keep the suite
robust under parallelism and retries.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]