This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new 4b2b8a6889 [GH-2337] Fix the DateTimeParseException in GeoPackage reader (#2339)
4b2b8a6889 is described below

commit 4b2b8a68897a22aa929f5a65c8be94c49ee7c6dd
Author: Jia Yu <[email protected]>
AuthorDate: Sun Sep 7 00:43:52 2025 -0700

    [GH-2337] Fix the DateTimeParseException in GeoPackage reader (#2339)
---
 .../transform/DataTypesTransformations.scala       |  41 +++++++++++++---
 .../resources/geopackage/test_datetime_issue.gpkg  | Bin 0 -> 53248 bytes
 .../apache/sedona/sql/GeoPackageReaderTest.scala   |  52 +++++++++++++++++++--
 .../apache/sedona/sql/GeoPackageReaderTest.scala   |  44 +++++++++++++++++
 .../apache/sedona/sql/GeoPackageReaderTest.scala   |  44 +++++++++++++++++
 5 files changed, 170 insertions(+), 11 deletions(-)

diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala
index 9a23f0a088..c0e532b08a 100644
--- a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala
+++ b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala
@@ -18,22 +18,49 @@
  */
 package org.apache.sedona.sql.datasources.geopackage.transform
 
-import java.time.{Instant, LocalDate}
+import java.time.{Instant, LocalDate, LocalDateTime, ZoneOffset}
 import java.time.format.DateTimeFormatter
+import java.time.format.DateTimeParseException
 import java.time.temporal.ChronoUnit
 
 object DataTypesTransformations {
-  def getDays(dateString: String): Int = {
-    val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd")
-
-    val date = LocalDate.parse(dateString, formatter)
+  // Pre-created formatters to avoid repeated object creation
+  private val dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd")
+  private val datetimeFormatters = Array(
+    DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS"), // 3 digits
+    DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SS"), // 2 digits
+    DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.S"), // 1 digit
+    DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss") // no milliseconds
+  )
 
+  def getDays(dateString: String): Int = {
+    val date = LocalDate.parse(dateString, dateFormatter)
     val epochDate = LocalDate.of(1970, 1, 1)
-
     ChronoUnit.DAYS.between(epochDate, date).toInt
   }
 
   def epoch(timestampStr: String): Long = {
-    Instant.parse(timestampStr).toEpochMilli
+    try {
+      // Try parsing as-is first (works for timestamps with timezone info)
+      Instant.parse(timestampStr).toEpochMilli
+    } catch {
+      case _: DateTimeParseException =>
+        // If parsing fails, try treating it as UTC (common case for GeoPackage)
+        // Handle various datetime formats without timezone info
+        // Try different patterns to handle various millisecond formats
+        for (formatter <- datetimeFormatters) {
+          try {
+            val localDateTime = LocalDateTime.parse(timestampStr, formatter)
+            return localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli
+          } catch {
+            case _: DateTimeParseException =>
+            // Continue to next formatter
+          }
+        }
+
+        // If all formatters failed, throw a descriptive exception
+        throw new IllegalArgumentException(s"Unable to parse datetime: $timestampStr. " +
+          s"Expected formats: 'yyyy-MM-ddTHH:mm:ss[.SSS|.SS|.S]' or 'yyyy-MM-ddTHH:mm:ss[.SSS|.SS|.S]Z'")
+    }
   }
 }
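
For context, here is a minimal standalone sketch (not part of the commit; the object name and timestamp values are illustrative) of how the new fallback behaves: zone-aware inputs go straight through Instant.parse, while naive datetimes fall back to LocalDateTime with one of the pre-created patterns and are interpreted as UTC.

    import java.time.{Instant, LocalDateTime, ZoneOffset}
    import java.time.format.DateTimeFormatter

    object EpochFallbackDemo extends App {
      // Zone-aware input: Instant.parse succeeds directly.
      println(Instant.parse("2024-03-15T10:30:00Z").toEpochMilli)

      // A naive input such as "2024-03-15T10:30:00.5" makes Instant.parse
      // throw DateTimeParseException; the reader then tries the explicit
      // patterns and interprets the result as UTC, as the new epoch() does.
      val fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.S")
      val local = LocalDateTime.parse("2024-03-15T10:30:00.5", fmt)
      println(local.toInstant(ZoneOffset.UTC).toEpochMilli)
    }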
diff --git a/spark/common/src/test/resources/geopackage/test_datetime_issue.gpkg b/spark/common/src/test/resources/geopackage/test_datetime_issue.gpkg
new file mode 100644
index 0000000000..f53a11fe08
Binary files /dev/null and b/spark/common/src/test/resources/geopackage/test_datetime_issue.gpkg differ
diff --git a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
index ee9931cbf4..1e4b071361 100644
--- a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
+++ b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
@@ -18,11 +18,11 @@
  */
 package org.apache.sedona.sql
 
-import io.minio.{MakeBucketArgs, MinioClient, PutObjectArgs}
-import org.apache.spark.sql.{DataFrame, SparkSession}
+import io.minio.{MakeBucketArgs, MinioClient}
+import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions.expr
 import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
-import org.apache.spark.sql.types.{BinaryType, BooleanType, DateType, DoubleType, IntegerType, StringType, StructField, StructType, TimestampType}
+import org.apache.spark.sql.types._
 import org.scalatest.matchers.should.Matchers
 import org.scalatest.prop.TableDrivenPropertyChecks._
 import org.testcontainers.containers.MinIOContainer
@@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
         df.count() shouldEqual expectedCount
       }
     }
+
+    it("should handle datetime fields without timezone information") {
+      // This test verifies the fix for DateTimeParseException when reading
+      // GeoPackage files with datetime fields that don't include timezone info
+      val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg"
+
+      // Test reading the test_features table with problematic datetime formats
+      val df = sparkSession.read
+        .format("geopackage")
+        .option("tableName", "test_features")
+        .load(testFilePath)
+
+      // The test should not throw DateTimeParseException when reading datetime fields
+      noException should be thrownBy {
+        df.select("created_at", "updated_at").collect()
+      }
+
+      // Verify that datetime fields are properly parsed as TimestampType
+      df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType
+      df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType
+
+      // Verify that we can read the datetime values
+      val datetimeValues = df.select("created_at", "updated_at").collect()
+      datetimeValues should not be empty
+
+      // Verify that datetime values are valid timestamps
+      datetimeValues.foreach { row =>
+        val createdTimestamp = row.getAs[Timestamp]("created_at")
+        val updatedTimestamp = row.getAs[Timestamp]("updated_at")
+        createdTimestamp should not be null
+        updatedTimestamp should not be null
+        createdTimestamp.getTime should be > 0L
+        updatedTimestamp.getTime should be > 0L
+      }
+
+      // Test showMetadata option with the same file
+      noException should be thrownBy {
+        val metadataDf = sparkSession.read
+          .format("geopackage")
+          .option("showMetadata", "true")
+          .load(testFilePath)
+        metadataDf.select("last_change").collect()
+      }
+    }
   }
 
   describe("GeoPackage Raster Data Test") {
@@ -257,7 +301,7 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
         .load(inputPath)
         .count shouldEqual 34
 
-      val df = sparkSessionMinio.read
+      val df = sparkSession.read
         .format("geopackage")
         .option("tableName", "point1")
         .load(inputPath)
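
From the user's side, the read path exercised by the new test looks like the sketch below (hedged and illustrative: the SparkSession setup and the .gpkg path are placeholders, and Sedona's GeoPackage data source must be on the classpath). The "geopackage" format name and the "tableName" option match what the tests use.

    import org.apache.spark.sql.SparkSession

    // Illustrative only: the master, app name, and file path are placeholders.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("geopackage-datetime-read")
      .getOrCreate()

    val df = spark.read
      .format("geopackage")
      .option("tableName", "test_features")
      .load("/path/to/test_datetime_issue.gpkg")

    // With the fix, datetime columns without timezone suffixes parse
    // instead of throwing DateTimeParseException.
    df.select("created_at", "updated_at").show()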
diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
index 9de19c3c48..6d9f41bf4e 100644
--- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
+++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
@@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
         df.count() shouldEqual expectedCount
       }
     }
+
+    it("should handle datetime fields without timezone information") {
+      // This test verifies the fix for DateTimeParseException when reading
+      // GeoPackage files with datetime fields that don't include timezone info
+      val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg"
+
+      // Test reading the test_features table with problematic datetime formats
+      val df = sparkSession.read
+        .format("geopackage")
+        .option("tableName", "test_features")
+        .load(testFilePath)
+
+      // The test should not throw DateTimeParseException when reading datetime fields
+      noException should be thrownBy {
+        df.select("created_at", "updated_at").collect()
+      }
+
+      // Verify that datetime fields are properly parsed as TimestampType
+      df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType
+      df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType
+
+      // Verify that we can read the datetime values
+      val datetimeValues = df.select("created_at", "updated_at").collect()
+      datetimeValues should not be empty
+
+      // Verify that datetime values are valid timestamps
+      datetimeValues.foreach { row =>
+        val createdTimestamp = row.getAs[Timestamp]("created_at")
+        val updatedTimestamp = row.getAs[Timestamp]("updated_at")
+        createdTimestamp should not be null
+        updatedTimestamp should not be null
+        createdTimestamp.getTime should be > 0L
+        updatedTimestamp.getTime should be > 0L
+      }
+
+      // Test showMetadata option with the same file
+      noException should be thrownBy {
+        val metadataDf = sparkSession.read
+          .format("geopackage")
+          .option("showMetadata", "true")
+          .load(testFilePath)
+        metadataDf.select("last_change").collect()
+      }
+    }
   }
 
   describe("GeoPackage Raster Data Test") {
diff --git a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
index 9de19c3c48..6d9f41bf4e 100644
--- a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
+++ b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
@@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
         df.count() shouldEqual expectedCount
       }
     }
+
+    it("should handle datetime fields without timezone information") {
+      // This test verifies the fix for DateTimeParseException when reading
+      // GeoPackage files with datetime fields that don't include timezone info
+      val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg"
+
+      // Test reading the test_features table with problematic datetime formats
+      val df = sparkSession.read
+        .format("geopackage")
+        .option("tableName", "test_features")
+        .load(testFilePath)
+
+      // The test should not throw DateTimeParseException when reading datetime fields
+      noException should be thrownBy {
+        df.select("created_at", "updated_at").collect()
+      }
+
+      // Verify that datetime fields are properly parsed as TimestampType
+      df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType
+      df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType
+
+      // Verify that we can read the datetime values
+      val datetimeValues = df.select("created_at", "updated_at").collect()
+      datetimeValues should not be empty
+
+      // Verify that datetime values are valid timestamps
+      datetimeValues.foreach { row =>
+        val createdTimestamp = row.getAs[Timestamp]("created_at")
+        val updatedTimestamp = row.getAs[Timestamp]("updated_at")
+        createdTimestamp should not be null
+        updatedTimestamp should not be null
+        createdTimestamp.getTime should be > 0L
+        updatedTimestamp.getTime should be > 0L
+      }
+
+      // Test showMetadata option with the same file
+      noException should be thrownBy {
+        val metadataDf = sparkSession.read
+          .format("geopackage")
+          .option("showMetadata", "true")
+          .load(testFilePath)
+        metadataDf.select("last_change").collect()
+      }
+    }
   }
 
   describe("GeoPackage Raster Data Test") {
