Github user ueshin commented on a diff in the pull request:
https://github.com/apache/spark/pull/16781#discussion_r111891389
--- Diff:
sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala
---
@@ -141,4 +160,325 @@ class ParquetHiveCompatibilitySuite extends
ParquetCompatibilityTest with TestHi
Row(Seq(Row(1))),
"ARRAY<STRUCT<array_element: INT>>")
}
+
+ val testTimezones = Seq(
+ "UTC" -> "UTC",
+ "LA" -> "America/Los_Angeles",
+ "Berlin" -> "Europe/Berlin"
+ )
+ // Check creating parquet tables with timestamps, writing data into
them, and reading it back out
+ // under a variety of conditions:
+ // * tables with explicit tz and those without
+ // * altering table properties directly
+ // * variety of timezones, local & non-local
+ val sessionTimezones = testTimezones.map(_._2).map(Some(_)) ++ Seq(None)
+ sessionTimezones.foreach { sessionTzOpt =>
+ val sparkSession = spark.newSession()
+ sessionTzOpt.foreach { tz =>
sparkSession.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, tz) }
+ testCreateWriteRead(sparkSession, "no_tz", None, sessionTzOpt)
+ val localTz = TimeZone.getDefault.getID()
+ testCreateWriteRead(sparkSession, "local", Some(localTz), sessionTzOpt)
+ // check with a variety of timezones. The unit tests currently are
configured to always use
+ // America/Los_Angeles, but even if they didn't, we'd be sure to cover
a non-local timezone.
+ Seq(
+ "UTC" -> "UTC",
+ "LA" -> "America/Los_Angeles",
+ "Berlin" -> "Europe/Berlin"
+ ).foreach { case (tableName, zone) =>
+ if (zone != localTz) {
+ testCreateWriteRead(sparkSession, tableName, Some(zone),
sessionTzOpt)
+ }
+ }
+ }
+
+ private def testCreateWriteRead(
+ sparkSession: SparkSession,
+ baseTable: String,
+ explicitTz: Option[String],
+ sessionTzOpt: Option[String]): Unit = {
+ testCreateAlterTablesWithTimezone(sparkSession, baseTable, explicitTz,
sessionTzOpt)
+ testWriteTablesWithTimezone(sparkSession, baseTable, explicitTz,
sessionTzOpt)
+ testReadTablesWithTimezone(sparkSession, baseTable, explicitTz,
sessionTzOpt)
+ }
+
+ private def checkHasTz(table: String, tz: Option[String]): Unit = {
+ val tableMetadata =
spark.sessionState.catalog.getTableMetadata(TableIdentifier(table))
+
assert(tableMetadata.properties.get(ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY)
=== tz)
+ }
+
+ private def testCreateAlterTablesWithTimezone(
+ spark: SparkSession,
+ baseTable: String,
+ explicitTz: Option[String],
+ sessionTzOpt: Option[String]): Unit = {
+ test(s"SPARK-12297: Create and Alter Parquet tables and timezones;
explicitTz = $explicitTz; " +
+ s"sessionTzOpt = $sessionTzOpt") {
+ val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
+ withTable(baseTable, s"like_$baseTable", s"select_$baseTable") {
+ val localTz = TimeZone.getDefault()
+ val localTzId = localTz.getID()
+ // If we ever add a property to set the table timezone by default,
defaultTz would change
+ val defaultTz = None
+ // check that created tables have correct TBLPROPERTIES
+ val tblProperties = explicitTz.map {
+ tz => raw"""TBLPROPERTIES ($key="$tz")"""
+ }.getOrElse("")
+ spark.sql(
+ raw"""CREATE TABLE $baseTable (
+ | x int
+ | )
+ | STORED AS PARQUET
+ | $tblProperties
+ """.stripMargin)
+ val expectedTableTz = explicitTz.orElse(defaultTz)
+ checkHasTz(baseTable, expectedTableTz)
+ spark.sql(s"CREATE TABLE like_$baseTable LIKE $baseTable")
+ checkHasTz(s"like_$baseTable", expectedTableTz)
+ spark.sql(
+ raw"""CREATE TABLE select_$baseTable
+ | STORED AS PARQUET
+ | AS
+ | SELECT * from $baseTable
+ """.stripMargin)
+ checkHasTz(s"select_$baseTable", defaultTz)
+
+ // check alter table, setting, unsetting, resetting the property
+ spark.sql(
+ raw"""ALTER TABLE $baseTable SET TBLPROPERTIES
($key="America/Los_Angeles")""")
+ checkHasTz(baseTable, Some("America/Los_Angeles"))
+ spark.sql( raw"""ALTER TABLE $baseTable SET TBLPROPERTIES
($key="UTC")""")
+ checkHasTz(baseTable, Some("UTC"))
+ spark.sql( raw"""ALTER TABLE $baseTable UNSET TBLPROPERTIES
($key)""")
+ checkHasTz(baseTable, None)
+ explicitTz.foreach { tz =>
+ spark.sql( raw"""ALTER TABLE $baseTable SET TBLPROPERTIES
($key="$tz")""")
+ checkHasTz(baseTable, expectedTableTz)
+ }
+ }
+ }
+ }
+
+ val desiredTimestampStrings = Seq(
+ "2015-12-31 23:50:59.123",
+ "2015-12-31 22:49:59.123",
+ "2016-01-01 00:39:59.123",
+ "2016-01-01 01:29:59.123"
+ )
+ // We don't want to mess with timezones inside the tests themselves,
since we use a shared
+ // spark context, and then we might be prone to issues from lazy vals
for timezones. Instead,
+ // we manually adjust the timezone just to determine what the desired
millis (since epoch, in utc)
+ // is for various "wall-clock" times in different timezones, and then we
can compare against those
+ // in our tests.
+ val originalTz = TimeZone.getDefault
+ val timestampTimezoneToMillis = try {
+ (for {
+ timestampString <- desiredTimestampStrings
+ timezone <- Seq("America/Los_Angeles", "Europe/Berlin", "UTC").map {
+ TimeZone.getTimeZone(_)
+ }
+ } yield {
+ TimeZone.setDefault(timezone)
+ val timestamp = Timestamp.valueOf(timestampString)
+ (timestampString, timezone.getID()) -> timestamp.getTime()
+ }).toMap
+ } finally {
+ TimeZone.setDefault(originalTz)
+ }
+
+ private def createRawData(spark: SparkSession): Dataset[(String,
Timestamp)] = {
+ val originalTsStrings = Seq(
+ "2015-12-31 22:49:59.123",
+ "2015-12-31 23:50:59.123",
+ "2016-01-01 00:39:59.123",
+ "2016-01-01 01:29:59.123"
+ )
+ val rowRdd = spark.sparkContext.parallelize(originalTsStrings,
1).map(Row(_))
+ val schema = StructType(Seq(
+ StructField("display", StringType, true)
+ ))
+ val df = spark.createDataFrame(rowRdd, schema)
+ // this will get the millis corresponding to the display time given
the current *session*
+ // timezone.
+ import spark.implicits._
+ df.withColumn("ts", expr("cast(display as timestamp)")).map { row =>
+ (row.getAs[String](0), row.getAs[Timestamp](1))
+ }
--- End diff --
nit: the manual `.map { row => ... }` can be replaced with `df.withColumn(...).as[(String, Timestamp)]`
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]