Copilot commented on code in PR #2665:
URL: https://github.com/apache/sedona/pull/2665#discussion_r2834986961
##########
spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala:
##########
@@ -882,6 +882,100 @@ class geoparquetIOTests extends TestBaseScala with BeforeAndAfterAll {
}
}
+ it("GeoParquet auto populates covering metadata for single geometry
column") {
+ val df = sparkSession
+ .range(0, 100)
+ .toDF("id")
+ .withColumn("id", expr("CAST(id AS DOUBLE)"))
+ .withColumn("geometry", expr("ST_Point(id, id + 1)"))
+ .withColumn(
+ "geometry_bbox",
+ expr("struct(id AS xmin, id + 1 AS ymin, id AS xmax, id + 1 AS
ymax)"))
+ val geoParquetSavePath =
Review Comment:
The new behavior of gracefully handling an existing but invalid
`<geometryColumnName>_bbox` struct (log a warning and skip it) isn't covered by
tests here. Adding a test that writes a DataFrame whose `geometry_bbox` column is
missing required fields or uses non-float/double types would help ensure the
write does not fail and that covering metadata is not populated in that case.
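A minimal sketch of such a test, modeled on the new test quoted above (the output-path variable and the final metadata assertion are placeholders for whatever helpers `geoparquetIOTests.scala` already provides):
```scala
it("GeoParquet skips covering metadata when an existing bbox column is invalid") {
  val df = sparkSession
    .range(0, 100)
    .toDF("id")
    .withColumn("id", expr("CAST(id AS DOUBLE)"))
    .withColumn("geometry", expr("ST_Point(id, id + 1)"))
    // Invalid covering struct: xmax/ymax are missing and the remaining fields
    // are strings rather than float/double.
    .withColumn(
      "geometry_bbox",
      expr("struct(CAST(id AS STRING) AS xmin, CAST(id + 1 AS STRING) AS ymin)"))
  // Placeholder output location; reuse the suite's temp-dir helper.
  val geoParquetSavePath = geoparquetoutputlocation + "/gp_invalid_bbox.parquet"
  // The write should only log a warning and succeed, not throw.
  df.write.format("geoparquet").mode("overwrite").save(geoParquetSavePath)
  // Then assert, via the suite's GeoParquet metadata reader, that no covering
  // entry was recorded for the "geometry" column.
}
```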
##########
spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetWriteSupport.scala:
##########
@@ -240,16 +268,108 @@ class GeoParquetWriteSupport extends WriteSupport[InternalRow] with Logging {
      schema: StructType,
      fieldWriters: Array[ValueWriter]): Unit = {
    var i = 0
-    while (i < row.numFields) {
-      if (!row.isNullAt(i)) {
-        consumeField(schema(i).name, i) {
-          fieldWriters(i).apply(row, i)
-        }
+    while (i < schema.length) {
+      generatedCoveringColumnOrdinals.get(i) match {
+        case Some(geometryOrdinal) =>
+          if (!row.isNullAt(geometryOrdinal)) {
+            consumeField(schema(i).name, i) {
+              fieldWriters(i).apply(row, i)
+            }
+          }
+        case None =>
+          if (i < row.numFields && !row.isNullAt(i)) {
+            consumeField(schema(i).name, i) {
+              fieldWriters(i).apply(row, i)
+            }
+          }
      }
      i += 1
    }
  }
+  private def maybeAutoGenerateCoveringColumns(): Unit = {
+    if (!isAutoCoveringEnabled) {
+      return
+    }
+
+    // If the user provided any explicit covering options, don't auto-generate for
+    // the remaining geometry columns. Explicit options signal intentional configuration.
+    if (geoParquetColumnCoveringMap.nonEmpty) {
+      return
+    }
+
+    val generatedCoveringFields = mutable.ArrayBuffer.empty[StructField]
+    val geometryColumns =
+      geometryColumnInfoMap.keys.toSeq.map(ordinal => ordinal -> schema(ordinal).name)
+
+    geometryColumns.foreach { case (geometryOrdinal, geometryColumnName) =>
+      if (!geoParquetColumnCoveringMap.contains(geometryColumnName)) {
+        val coveringColumnName = s"${geometryColumnName}_bbox"
+        if (schema.fieldNames.contains(coveringColumnName)) {
+          // Reuse an existing column if it is a valid covering struct; otherwise skip.
+          try {
+            val covering = createCoveringColumnMetadata(coveringColumnName, schema)
+            geoParquetColumnCoveringMap.put(geometryColumnName, covering)
+          } catch {
+            case _: IllegalArgumentException =>
+              logWarning(
+                s"Existing column '$coveringColumnName' is not a valid covering struct " +
+                  s"(expected struct<xmin, ymin, xmax, ymax> with float/double fields). " +
Review Comment:
The warning message here says the expected struct is exactly `struct<xmin,
ymin, xmax, ymax>`, but `createCoveringColumnMetadata` also supports optional
`zmin`/`zmax` fields. Consider adjusting the log message to reflect the actual
accepted schema (required xmin/ymin/xmax/ymax, optional zmin/zmax) so users
aren’t misled when troubleshooting.
```suggestion
                  s"(expected struct<xmin, ymin, xmax, ymax> with float/double fields; " +
                  s"optional zmin/zmax fields are also supported). " +
```
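For reference, an existing bbox column carrying the optional Z bounds might be constructed like this (a hedged sketch only: the field order, the `ST_PointZ` call, and the column names are illustrative, and `createCoveringColumnMetadata` remains the authority on what is actually accepted):
```scala
// Illustrative only: a covering struct with optional zmin/zmax in addition to
// the required xmin/ymin/xmax/ymax float/double fields.
val df3d = sparkSession
  .range(0, 10)
  .toDF("id")
  .withColumn("id", expr("CAST(id AS DOUBLE)"))
  .withColumn("geometry", expr("ST_PointZ(id, id + 1, id + 2)"))
  .withColumn(
    "geometry_bbox",
    expr(
      "struct(id AS xmin, id + 1 AS ymin, id + 2 AS zmin, " +
        "id AS xmax, id + 1 AS ymax, id + 2 AS zmax)"))
```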
##########
spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetWriteSupport.scala:
##########
@@ -240,16 +268,108 @@ class GeoParquetWriteSupport extends WriteSupport[InternalRow] with Logging {
      schema: StructType,
      fieldWriters: Array[ValueWriter]): Unit = {
    var i = 0
-    while (i < row.numFields) {
-      if (!row.isNullAt(i)) {
-        consumeField(schema(i).name, i) {
-          fieldWriters(i).apply(row, i)
-        }
+    while (i < schema.length) {
+      generatedCoveringColumnOrdinals.get(i) match {
+        case Some(geometryOrdinal) =>
+          if (!row.isNullAt(geometryOrdinal)) {
+            consumeField(schema(i).name, i) {
+              fieldWriters(i).apply(row, i)
+            }
+          }
+        case None =>
+          if (i < row.numFields && !row.isNullAt(i)) {
+            consumeField(schema(i).name, i) {
+              fieldWriters(i).apply(row, i)
+            }
+          }
      }
      i += 1
    }
  }
+  private def maybeAutoGenerateCoveringColumns(): Unit = {
+    if (!isAutoCoveringEnabled) {
+      return
+    }
+
+    // If the user provided any explicit covering options, don't auto-generate for
+    // the remaining geometry columns. Explicit options signal intentional configuration.
+    if (geoParquetColumnCoveringMap.nonEmpty) {
+      return
+    }
+
+    val generatedCoveringFields = mutable.ArrayBuffer.empty[StructField]
+    val geometryColumns =
+      geometryColumnInfoMap.keys.toSeq.map(ordinal => ordinal -> schema(ordinal).name)
Review Comment:
`geometryColumnInfoMap` is a `mutable.Map` (hash-based), so iterating
`geometryColumnInfoMap.keys.toSeq` can yield a non-deterministic order. That
makes the appended auto-generated `<geom>_bbox` columns (and their ordinals)
potentially vary across runs, which can hurt reproducibility and make
downstream expectations flaky. Consider iterating geometry ordinals in schema
order (e.g., `geometryColumnInfoMap.keys.toSeq.sorted`) when building
`geometryColumns`.
```suggestion
      geometryColumnInfoMap.keys.toSeq.sorted.map(ordinal => ordinal -> schema(ordinal).name)
```
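A small self-contained illustration of the difference (placeholder values stand in for the real `GeometryColumnInfo` entries and schema):
```scala
import scala.collection.mutable

// Stand-ins for geometryColumnInfoMap and the DataFrame schema field names.
val geometryColumnInfoMap = mutable.Map(3 -> "info", 1 -> "info")
val fieldNames = Array("id", "geom_a", "other", "geom_b")

// Hash-based key iteration may visit ordinal 3 before 1, so the appended
// <geom>_bbox columns (and their ordinals) could differ between runs.
val unsorted = geometryColumnInfoMap.keys.toSeq.map(ordinal => ordinal -> fieldNames(ordinal))

// Sorting the ordinals pins the generated <geom>_bbox columns to schema order.
val sorted = geometryColumnInfoMap.keys.toSeq.sorted.map(ordinal => ordinal -> fieldNames(ordinal))
// sorted == Seq((1, "geom_a"), (3, "geom_b"))
```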
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]