This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 59aa504c48 [SEDONA-673] Fix issue when loading geoparquet file without
bbox metadata. (#1681)
59aa504c48 is described below
commit 59aa504c48c5879ab138bbbc9c13cdaea00f9443
Author: Paweł Tokaj <[email protected]>
AuthorDate: Fri Nov 15 19:51:23 2024 +0100
[SEDONA-673] Fix issue when loading geoparquet file without bbox metadata.
(#1681)
* Fix issue when loading geoparquet file.
* Fix issue when loading geoparquet file.
---
.../datasources/parquet/GeoParquetSpatialFilter.scala | 4 ++++
.../test/resources/geoparquet/overture/bbox.geoparquet | Bin 0 -> 24496 bytes
.../scala/org/apache/sedona/sql/geoparquetIOTests.scala | 13 +++++++++++++
.../scala/org/apache/sedona/sql/geoparquetIOTests.scala | 13 +++++++++++++
.../scala/org/apache/sedona/sql/geoparquetIOTests.scala | 14 +++++++++++++-
5 files changed, 43 insertions(+), 1 deletion(-)
diff --git
a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/GeoParquetSpatialFilter.scala
b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/GeoParquetSpatialFilter.scala
index 5aa782e5bd..ca932c6b34 100644
---
a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/GeoParquetSpatialFilter.scala
+++
b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/GeoParquetSpatialFilter.scala
@@ -69,6 +69,10 @@ object GeoParquetSpatialFilter {
def evaluate(columns: Map[String, GeometryFieldMetaData]): Boolean = {
columns.get(columnName).forall { column =>
val bbox = column.bbox
+ if (bbox.isEmpty) {
+ return true
+ }
+
val columnEnvelope =
queryWindow.getFactory.toGeometry(new Envelope(bbox(0), bbox(2),
bbox(1), bbox(3)))
predicateType match {
diff --git
a/spark/common/src/test/resources/geoparquet/overture/bbox.geoparquet
b/spark/common/src/test/resources/geoparquet/overture/bbox.geoparquet
new file mode 100644
index 0000000000..b5393d3309
Binary files /dev/null and
b/spark/common/src/test/resources/geoparquet/overture/bbox.geoparquet differ
diff --git
a/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
b/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
index ccfd560c84..86549bf71d 100644
---
a/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
+++
b/spark/spark-3.3/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
@@ -54,6 +54,7 @@ class geoparquetIOTests extends TestBaseScala with
BeforeAndAfterAll {
val legacyparquetdatalocation: String =
resourceFolder + "parquet/legacy-parquet-nested-columns.snappy.parquet"
val geoparquetoutputlocation: String = resourceFolder +
"geoparquet/geoparquet_output/"
+ val overtureBBOX: String = resourceFolder +
"geoparquet/overture/bbox.geoparquet"
override def afterAll(): Unit = FileUtils.deleteDirectory(new
File(geoparquetoutputlocation))
@@ -732,6 +733,18 @@ class geoparquetIOTests extends TestBaseScala with
BeforeAndAfterAll {
}
}
+ describe("loading one file geoparquet and filtering") {
+ it("should not fail when bbox is not available in geoparquet metadata") {
+ val numberOfRecords = sparkSession.read
+ .format("geoparquet")
+ .load(overtureBBOX)
+ .where("ST_Intersects(geometry, ST_PolygonFromEnvelope(0, 0, 1, 1))")
+ .count()
+
+ assert(numberOfRecords == 9)
+ }
+ }
+
def validateGeoParquetMetadata(path: String)(body: org.json4s.JValue =>
Unit): Unit = {
val parquetFiles = new
File(path).listFiles().filter(_.getName.endsWith(".parquet"))
parquetFiles.foreach { filePath =>
diff --git
a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
index f5bd8b486e..274394f3bb 100644
---
a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
+++
b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
@@ -54,6 +54,7 @@ class geoparquetIOTests extends TestBaseScala with
BeforeAndAfterAll {
val legacyparquetdatalocation: String =
resourceFolder + "parquet/legacy-parquet-nested-columns.snappy.parquet"
val geoparquetoutputlocation: String = resourceFolder +
"geoparquet/geoparquet_output/"
+ val overtureBBOX: String = resourceFolder +
"geoparquet/overture/bbox.geoparquet"
override def afterAll(): Unit = FileUtils.deleteDirectory(new
File(geoparquetoutputlocation))
@@ -758,6 +759,18 @@ class geoparquetIOTests extends TestBaseScala with
BeforeAndAfterAll {
}
}
+ describe("loading one file geoparquet and filtering") {
+ it("should not fail when bbox is not available in geoparquet metadata") {
+ val numberOfRecords = sparkSession.read
+ .format("geoparquet")
+ .load(overtureBBOX)
+ .where("ST_Intersects(geometry, ST_PolygonFromEnvelope(0, 0, 1, 1))")
+ .count()
+
+ assert(numberOfRecords == 9)
+ }
+ }
+
def validateGeoParquetMetadata(path: String)(body: org.json4s.JValue =>
Unit): Unit = {
val parquetFiles = new
File(path).listFiles().filter(_.getName.endsWith(".parquet"))
parquetFiles.foreach { filePath =>
diff --git
a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
index a6e74730a0..beca265641 100644
---
a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
+++
b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
@@ -57,7 +57,7 @@ class geoparquetIOTests extends TestBaseScala with
BeforeAndAfterAll {
val legacyparquetdatalocation: String =
resourceFolder + "parquet/legacy-parquet-nested-columns.snappy.parquet"
val geoparquetoutputlocation: String = resourceFolder +
"geoparquet/geoparquet_output/"
-
+ val overtureBBOX: String = resourceFolder +
"geoparquet/overture/bbox.geoparquet"
override def afterAll(): Unit = FileUtils.deleteDirectory(new
File(geoparquetoutputlocation))
describe("GeoParquet IO tests") {
@@ -761,6 +761,18 @@ class geoparquetIOTests extends TestBaseScala with
BeforeAndAfterAll {
}
}
+ describe("loading one file geoparquet and filtering") {
+ it("should not fail when bbox is not available in geoparquet metadata") {
+ val numberOfRecords = sparkSession.read
+ .format("geoparquet")
+ .load(overtureBBOX)
+ .where("ST_Intersects(geometry, ST_PolygonFromEnvelope(0, 0, 1, 1))")
+ .count()
+
+ assert(numberOfRecords == 9)
+ }
+ }
+
def validateGeoParquetMetadata(path: String)(body: org.json4s.JValue =>
Unit): Unit = {
val parquetFiles = new
File(path).listFiles().filter(_.getName.endsWith(".parquet"))
parquetFiles.foreach { filePath =>