[GitHub] spark pull request #16781: [SPARK-12297][SQL] Hive compatibility for Parquet...

ueshin Tue, 02 May 2017 23:37:03 -0700

Github user ueshin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/16781#discussion_r114478818
  
    --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala ---
    @@ -141,4 +152,373 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHi
           Row(Seq(Row(1))),
           "ARRAY<STRUCT<array_element: INT>>")
       }
    +
    +  val testTimezones = Seq(
    +    "UTC" -> "UTC",
    +    "LA" -> "America/Los_Angeles",
    +    "Berlin" -> "Europe/Berlin"
    +  )
    +  // Check creating parquet tables with timestamps, writing data into them, and reading it back out
    +  // under a variety of conditions:
    +  // * tables with explicit tz and those without
    +  // * altering table properties directly
    +  // * variety of timezones, local & non-local
    +  val sessionTimezones = testTimezones.map(_._2).map(Some(_)) ++ Seq(None)
    +  sessionTimezones.foreach { sessionTzOpt =>
    +    val sparkSession = spark.newSession()
    +    sessionTzOpt.foreach { tz => 
sparkSession.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, tz) }
    +    testCreateWriteRead(sparkSession, "no_tz", None, sessionTzOpt)
    +    val localTz = TimeZone.getDefault.getID()
    +    testCreateWriteRead(sparkSession, "local", Some(localTz), sessionTzOpt)
    +    // check with a variety of timezones.  The unit tests currently are configured to always use
    +    // America/Los_Angeles, but even if they didn't, we'd be sure to cover a non-local timezone.
    +    testTimezones.foreach { case (tableName, zone) =>
    +      if (zone != localTz) {
    +        testCreateWriteRead(sparkSession, tableName, Some(zone), 
sessionTzOpt)
    +      }
    +    }
    +  }
    +
    +  private def testCreateWriteRead(
    +      sparkSession: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]): Unit = {
    +    testCreateAlterTablesWithTimezone(sparkSession, baseTable, explicitTz, 
sessionTzOpt)
    +    testWriteTablesWithTimezone(sparkSession, baseTable, explicitTz, 
sessionTzOpt)
    +    testReadTablesWithTimezone(sparkSession, baseTable, explicitTz, 
sessionTzOpt)
    +  }
    +
    +  private def checkHasTz(spark: SparkSession, table: String, tz: 
Option[String]): Unit = {
    +    val tableMetadata = 
spark.sessionState.catalog.getTableMetadata(TableIdentifier(table))
    +    
assert(tableMetadata.properties.get(ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY)
 === tz)
    +  }
    +
    +  private def testCreateAlterTablesWithTimezone(
    +      spark: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]): Unit = {
    +    test(s"SPARK-12297: Create and Alter Parquet tables and timezones; 
explicitTz = $explicitTz; " +
    +      s"sessionTzOpt = $sessionTzOpt") {
    +      val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
    +      withTable(baseTable, s"like_$baseTable", s"select_$baseTable", 
s"partitioned_$baseTable") {
    +        // If we ever add a property to set the table timezone by default, 
defaultTz would change
    +        val defaultTz = None
    +        // check that created tables have correct TBLPROPERTIES
    +        val tblProperties = explicitTz.map {
    +          tz => raw"""TBLPROPERTIES ($key="$tz")"""
    +        }.getOrElse("")
    +        spark.sql(
    +          raw"""CREATE TABLE $baseTable (
    +                |  x int
    +                | )
    +                | STORED AS PARQUET
    +                | $tblProperties
    +            """.stripMargin)
    +        val expectedTableTz = explicitTz.orElse(defaultTz)
    +        checkHasTz(spark, baseTable, expectedTableTz)
    +        spark.sql(
    +          raw"""CREATE TABLE partitioned_$baseTable (
    +                |  x int
    +                | )
    +                | PARTITIONED BY (y int)
    +                | STORED AS PARQUET
    +                | $tblProperties
    +            """.stripMargin)
    +        checkHasTz(spark, s"partitioned_$baseTable", expectedTableTz)
    +        spark.sql(s"CREATE TABLE like_$baseTable LIKE $baseTable")
    +        checkHasTz(spark, s"like_$baseTable", expectedTableTz)
    +        spark.sql(
    +          raw"""CREATE TABLE select_$baseTable
    +                | STORED AS PARQUET
    +                | AS
    +                | SELECT * from $baseTable
    +            """.stripMargin)
    +        checkHasTz(spark, s"select_$baseTable", defaultTz)
    +
    +        // check alter table, setting, unsetting, resetting the property
    +        spark.sql(
    +          raw"""ALTER TABLE $baseTable SET TBLPROPERTIES 
($key="America/Los_Angeles")""")
    +        checkHasTz(spark, baseTable, Some("America/Los_Angeles"))
    +        spark.sql(raw"""ALTER TABLE $baseTable SET TBLPROPERTIES 
($key="UTC")""")
    +        checkHasTz(spark, baseTable, Some("UTC"))
    +        spark.sql(raw"""ALTER TABLE $baseTable UNSET TBLPROPERTIES 
($key)""")
    +        checkHasTz(spark, baseTable, None)
    +        explicitTz.foreach { tz =>
    +          spark.sql(raw"""ALTER TABLE $baseTable SET TBLPROPERTIES 
($key="$tz")""")
    +          checkHasTz(spark, baseTable, expectedTableTz)
    +        }
    +      }
    +    }
    +  }
    +
    +  val desiredTimestampStrings = Seq(
    +    "2015-12-31 22:49:59.123",
    +    "2015-12-31 23:50:59.123",
    +    "2016-01-01 00:39:59.123",
    +    "2016-01-01 01:29:59.123"
    +  )
    +  // We don't want to mess with timezones inside the tests themselves, since we use a shared
    +  // spark context, and then we might be prone to issues from lazy vals for timezones.  Instead,
    +  // we manually adjust the timezone just to determine what the desired millis (since epoch, in utc)
    +  // is for various "wall-clock" times in different timezones, and then we can compare against those
    +  // in our tests.
    +  val timestampTimezoneToMillis = {
    +    val originalTz = TimeZone.getDefault
    +    try {
    +      (for {
    +        timestampString <- desiredTimestampStrings
    +        timezone <- Seq("America/Los_Angeles", "Europe/Berlin", "UTC").map 
{
    +          TimeZone.getTimeZone(_)
    +        }
    +      } yield {
    +        TimeZone.setDefault(timezone)
    +        val timestamp = Timestamp.valueOf(timestampString)
    +        (timestampString, timezone.getID()) -> timestamp.getTime()
    +      }).toMap
    +    } finally {
    +      TimeZone.setDefault(originalTz)
    +    }
    +  }
    +
    +  private def createRawData(spark: SparkSession): Dataset[(String, 
Timestamp)] = {
    +    import spark.implicits._
    +    val df = desiredTimestampStrings.toDF("display")
    +    // this will get the millis corresponding to the display time given 
the current *session*
    +    // timezone.
    +    df.withColumn("ts", expr("cast(display as timestamp)")).as[(String, 
Timestamp)]
    +  }
    +
    +  private def testWriteTablesWithTimezone(
    +      spark: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]) : Unit = {
    +    val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
    +    test(s"SPARK-12297: Write to Parquet tables with Timestamps; 
explicitTz = $explicitTz; " +
    +        s"sessionTzOpt = $sessionTzOpt") {
    +
    +      withTable(s"saveAsTable_$baseTable", s"insert_$baseTable", 
s"partitioned_ts_$baseTable") {
    +        val sessionTzId = 
sessionTzOpt.getOrElse(TimeZone.getDefault().getID())
    +        // check that created tables have correct TBLPROPERTIES
    +        val tblProperties = explicitTz.map {
    +          tz => raw"""TBLPROPERTIES ($key="$tz")"""
    +        }.getOrElse("")
    +
    +
    +        val rawData = createRawData(spark)
    +        // Check writing data out.
    +        // We write data into our tables, and then check the raw parquet 
files to see whether
    +        // the correct conversion was applied.
    +        rawData.write.saveAsTable(s"saveAsTable_$baseTable")
    +        checkHasTz(spark, s"saveAsTable_$baseTable", None)
    +        spark.sql(
    +          raw"""CREATE TABLE insert_$baseTable (
    +                |  display string,
    +                |  ts timestamp
    +                | )
    +                | STORED AS PARQUET
    +                | $tblProperties
    +               """.stripMargin)
    +        checkHasTz(spark, s"insert_$baseTable", explicitTz)
    +        rawData.write.insertInto(s"insert_$baseTable")
    +        // no matter what, roundtripping via the table should leave the 
data unchanged
    +        val readFromTable = spark.table(s"insert_$baseTable").collect()
    +          .map { row => (row.getAs[String](0), 
row.getAs[Timestamp](1)).toString() }.sorted
    +        assert(readFromTable === 
rawData.collect().map(_.toString()).sorted)
    +
    +        // Now we load the raw parquet data on disk, and check if it was 
adjusted correctly.
    +        // Note that we only store the timezone in the table property, so 
when we read the
    +        // data this way, we're bypassing all of the conversion logic, and 
reading the raw
    +        // values in the parquet file.
    +        val onDiskLocation = spark.sessionState.catalog
    +          
.getTableMetadata(TableIdentifier(s"insert_$baseTable")).location.getPath
    +        // we test reading the data back with and without the vectorized 
reader, to make sure we
    +        // haven't broken reading parquet from non-hive tables, with both 
readers.
    +        Seq(false, true).foreach { vectorized =>
    +          spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, 
vectorized)
    +          val readFromDisk = spark.read.parquet(onDiskLocation).collect()
    +          val storageTzId = explicitTz.getOrElse(sessionTzId)
    +          readFromDisk.foreach { row =>
    +            val displayTime = row.getAs[String](0)
    +            val millis = row.getAs[Timestamp](1).getTime()
    +            val expectedMillis = timestampTimezoneToMillis((displayTime, 
storageTzId))
    +            assert(expectedMillis === millis, s"Display time 
'$displayTime' was stored " +
    +              s"incorrectly with sessionTz = ${sessionTzOpt}; Got $millis, 
expected " +
    +              s"$expectedMillis (delta = ${millis - expectedMillis})")
    +          }
    +        }
    +
    +        // check tables partitioned by timestamps.  We don't compare the 
"raw" data in this case,
    +        // since they are adjusted even when we bypass the hive table.
    +        
rawData.write.partitionBy("ts").saveAsTable(s"partitioned_ts_$baseTable")
    +        val partitionDiskLocation = spark.sessionState.catalog
    +          
.getTableMetadata(TableIdentifier(s"partitioned_ts_$baseTable")).location.getPath
    +        // no matter what mix of timezones we use, the dirs should specify 
the value with the
    +        // same time we use for display.
    +        val parts = new File(partitionDiskLocation).list().collect {
    +          case name if name.startsWith("ts=") => 
URLDecoder.decode(name.stripPrefix("ts="))
    +        }.toSet
    +        assert(parts === desiredTimestampStrings.toSet)
    +      }
    +    }
    +  }
    +
    +  private def testReadTablesWithTimezone(
    +      spark: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]): Unit = {
    +      val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
    --- End diff --
    
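    nit: indent (the `val key = ...` line above is indented six spaces; the method body should use four, matching `testWriteTablesWithTimezone`)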



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #16781: [SPARK-12297][SQL] Hive compatibility for Parquet...

Reply via email to