jiayuasu commented on code in PR #2673:
URL: https://github.com/apache/sedona/pull/2673#discussion_r2851396977
##########
spark/common/src/test/scala/org/apache/sedona/sql/rasterIOTest.scala:
##########
@@ -239,5 +248,283 @@ class rasterIOTest extends TestBaseScala with BeforeAndAfter with GivenWhenThen
}
}
- override def afterAll(): Unit = FileUtils.deleteDirectory(new File(tempDir))
+ describe("Raster read test") {
+ // Loads the GeoTIFF test data through the "raster" data source with
+ // retiling enabled (tiles of at most 64x64 pixels) and verifies tile
+ // dimensions and non-negative (x, y) tile coordinates. A second pass
+ // checks that projection push-down (selecting only y and rast) still
+ // returns correctly sized tiles.
+ it("should read geotiff using raster source with explicit tiling") {
+ val rasterDf = sparkSession.read
+ .format("raster")
+ .options(Map("retile" -> "true", "tileWidth" -> "64"))
+ .load(rasterdatalocation)
+ assert(rasterDf.count() > 100)
+ rasterDf.collect().foreach { row =>
+ val raster = row.getAs[Object](0).asInstanceOf[GridCoverage2D]
+ // Without padding, tiles at the raster's right/bottom edge may be
+ // smaller than tileWidth, so the bounds are <= 64 rather than == 64
+ // (contrast with the padWithNoData test below, which asserts == 64).
+ assert(raster.getGridGeometry.getGridRange2D.width <= 64)
+ assert(raster.getGridGeometry.getGridRange2D.height <= 64)
+ val x = row.getInt(1)
+ val y = row.getInt(2)
+ assert(x >= 0 && y >= 0)
+ // NOTE(review): presumably disposing eagerly to free the coverage's
+ // underlying raster resources after each assertion pass.
+ raster.dispose(true)
+ }
+
+ // Test projection push-down
+ rasterDf.selectExpr("y", "rast as r").collect().foreach { row =>
+ val raster = row.getAs[Object](1).asInstanceOf[GridCoverage2D]
+ assert(raster.getGridGeometry.getGridRange2D.width <= 64)
+ assert(raster.getGridGeometry.getGridRange2D.height <= 64)
+ val y = row.getInt(0)
+ assert(y >= 0)
+ raster.dispose(true)
+ }
+ }
+
+ // Same retiling read as above, but with padWithNoData enabled: edge tiles
+ // are padded out with no-data values, so every tile must be exactly
+ // 64x64 pixels (== rather than <=).
+ it("should tile geotiff using raster source with padding enabled") {
+ val rasterDf = sparkSession.read
+ .format("raster")
+ .options(Map("retile" -> "true", "tileWidth" -> "64", "padWithNoData"
-> "true"))
+ .load(rasterdatalocation)
+ assert(rasterDf.count() > 100)
+ rasterDf.collect().foreach { row =>
+ val raster = row.getAs[Object](0).asInstanceOf[GridCoverage2D]
+ // Padding guarantees uniform tile dimensions, including at the edges.
+ assert(raster.getGridGeometry.getGridRange2D.width == 64)
+ assert(raster.getGridGeometry.getGridRange2D.height == 64)
+ val x = row.getInt(1)
+ val y = row.getInt(2)
+ assert(x >= 0 && y >= 0)
+ raster.dispose(true)
+ }
+ }
+
+ it("should push down limit and sample to data source") {
+ FileUtils.cleanDirectory(new File(tempDir))
+
+ val sourceDir = new File(rasterdatalocation)
+ val files = sourceDir.listFiles().filter(_.isFile)
+ var numUniqueFiles = 0
+ var numTotalFiles = 0
+ files.foreach { file =>
+ if (file.getPath.endsWith(".tif") || file.getPath.endsWith(".tiff")) {
+ // Create 4 copies for each file
+ for (i <- 0 until 4) {
+ val destFile = new File(tempDir + "/" + file.getName + "_" + i)
+ FileUtils.copyFile(file, destFile)
+ numTotalFiles += 1
+ }
+ numUniqueFiles += 1
+ }
+ }
+
+ val df = sparkSession.read
+ .format("raster")
+ .options(Map("retile" -> "false"))
+ .load(tempDir)
+ .withColumn("width", expr("RS_Width(rast)"))
+
+ val dfWithLimit = df.limit(numUniqueFiles)
+ val plan = queryPlan(dfWithLimit)
+ // Global/local limits are all pushed down to data source
+ assert(plan.collect { case e: LimitExec => e }.isEmpty)
+ assert(dfWithLimit.count() == numUniqueFiles)
+
+ val dfWithSample = df.sample(0.3, seed = 42)
+ val planSample = queryPlan(dfWithSample)
+ // Sample is pushed down to data source
+ assert(planSample.collect { case e: SampleExec => e }.isEmpty)
+ val count = dfWithSample.count()
+ assert(count >= numTotalFiles * 0.1 && count <= numTotalFiles * 0.5)
+
+ val dfWithSampleAndLimit = df.sample(0.5, seed = 42).limit(numUniqueFiles)
+ val planBoth = queryPlan(dfWithSampleAndLimit)
+ assert(planBoth.collect { case e: LimitExec => e }.isEmpty)
+ assert(planBoth.collect { case e: SampleExec => e }.isEmpty)
+ assert(dfWithSampleAndLimit.count() == numUniqueFiles)
+
+ // Limit and sample cannot be fully pushed down when retile is enabled
+ val dfReTiledWithSampleAndLimit = sparkSession.read
+ .format("raster")
+ .options(Map("retile" -> "true"))
+ .load(tempDir)
+ .sample(0.5, seed = 42)
+ .limit(numUniqueFiles)
+ dfReTiledWithSampleAndLimit.explain(true)
Review Comment:
Fixed
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]