MaxGekk commented on a change in pull request #34471:
URL: https://github.com/apache/spark/pull/34471#discussion_r779825155
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRebaseDatetimeSuite.scala
##########
@@ -134,33 +134,35 @@ abstract class ParquetRebaseDatetimeSuite
tsOutputType: String = "TIMESTAMP_MICROS",
inWriteConf: String = SQLConf.PARQUET_REBASE_MODE_IN_WRITE.key,
inReadConf: String = SQLConf.PARQUET_REBASE_MODE_IN_READ.key): Unit = {
- withTempPaths(2) { paths =>
- paths.foreach(_.delete())
+ withAllParquetWriters {
+ withTempPaths(2) { paths =>
+ paths.foreach(_.delete())
val oldPath = getResourceParquetFilePath("test-data/" + fileName)
val path3_x = paths(0).getCanonicalPath
val path3_x_rebase = paths(1).getCanonicalPath
- val df = Seq.tabulate(N)(rowFunc).toDF("dict", "plain")
- .select($"dict".cast(catalystType), $"plain".cast(catalystType))
- withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> tsOutputType)
{
+ val df = Seq.tabulate(N)(rowFunc).toDF("dict", "plain")
+ .select($"dict".cast(catalystType), $"plain".cast(catalystType))
+ withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key ->
tsOutputType) {
Review comment:
Something is wrong with the indentation here and below:
<img width="642" alt="Screenshot 2022-01-06 at 22 56 02"
src="https://user-images.githubusercontent.com/1580697/148443305-f6890111-26c2-4538-9abe-53a28144c46b.png">
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRebaseDatetimeSuite.scala
##########
@@ -219,60 +221,62 @@ abstract class ParquetRebaseDatetimeSuite
test("SPARK-31159, SPARK-37705: rebasing timestamps in write") {
val N = 8
Seq(false, true).foreach { dictionaryEncoding =>
- Seq(
- (
- "TIMESTAMP_MILLIS",
- "1001-01-01 01:02:03.123",
- "1001-01-07 01:09:05.123",
- SQLConf.PARQUET_REBASE_MODE_IN_WRITE.key,
- SQLConf.PARQUET_REBASE_MODE_IN_READ.key),
- (
- "TIMESTAMP_MICROS",
- "1001-01-01 01:02:03.123456",
- "1001-01-07 01:09:05.123456",
- SQLConf.PARQUET_REBASE_MODE_IN_WRITE.key,
- SQLConf.PARQUET_REBASE_MODE_IN_READ.key),
- (
- "INT96",
- "1001-01-01 01:02:03.123456",
- "1001-01-07 01:09:05.123456",
- SQLConf.PARQUET_INT96_REBASE_MODE_IN_WRITE.key,
- SQLConf.PARQUET_INT96_REBASE_MODE_IN_READ.key
- )
- ).foreach { case (outType, tsStr, nonRebased, inWriteConf, inReadConf) =>
+ withAllParquetWriters {
+ Seq(
+ (
+ "TIMESTAMP_MILLIS",
+ "1001-01-01 01:02:03.123",
+ "1001-01-07 01:09:05.123",
+ SQLConf.PARQUET_REBASE_MODE_IN_WRITE.key,
+ SQLConf.PARQUET_REBASE_MODE_IN_READ.key),
+ (
+ "TIMESTAMP_MICROS",
+ "1001-01-01 01:02:03.123456",
+ "1001-01-07 01:09:05.123456",
+ SQLConf.PARQUET_REBASE_MODE_IN_WRITE.key,
+ SQLConf.PARQUET_REBASE_MODE_IN_READ.key),
+ (
+ "INT96",
+ "1001-01-01 01:02:03.123456",
+ "1001-01-07 01:09:05.123456",
+ SQLConf.PARQUET_INT96_REBASE_MODE_IN_WRITE.key,
+ SQLConf.PARQUET_INT96_REBASE_MODE_IN_READ.key
+ )
+ ).foreach { case (outType, tsStr, nonRebased, inWriteConf, inReadConf)
=>
// Ignore the default JVM time zone and use the session time zone
instead of it in rebasing.
DateTimeTestUtils.withDefaultTimeZone(DateTimeTestUtils.JST) {
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key ->
DateTimeTestUtils.LA.getId) {
- withClue(s"output type $outType") {
- withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key ->
outType) {
- withTempPath { dir =>
- val path = dir.getAbsolutePath
- withSQLConf(inWriteConf -> LEGACY.toString) {
- Seq.tabulate(N)(_ => tsStr).toDF("tsS")
- .select($"tsS".cast("timestamp").as("ts"))
- .repartition(1)
- .write
- .option("parquet.enable.dictionary", dictionaryEncoding)
- .parquet(path)
- }
+ withClue(s"output type $outType") {
+ withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> outType) {
+ withTempPath { dir =>
+ val path = dir.getAbsolutePath
+ withSQLConf(inWriteConf -> LEGACY.toString) {
+ Seq.tabulate(N)(_ => tsStr).toDF("tsS")
+ .select($"tsS".cast("timestamp").as("ts"))
+ .repartition(1)
+ .write
+ .option("parquet.enable.dictionary", dictionaryEncoding)
+ .parquet(path)
+ }
- withAllParquetReaders {
+ withAllParquetReaders {
// The file metadata indicates if it needs rebase or not,
so we can always get
// the correct result regardless of the "rebase mode"
config.
- runInMode(inReadConf, Seq(LEGACY, CORRECTED, EXCEPTION)) {
options =>
- checkAnswer(
+ runInMode(inReadConf, Seq(LEGACY, CORRECTED, EXCEPTION)) {
options =>
+ checkAnswer(
spark.read.options(options).parquet(path).select($"ts".cast("string")),
Seq.tabulate(N)(_ => Row(tsStr)))
- }
+ }
- // Force to not rebase to prove the written datetime
values are rebased
- // and we will get wrong result if we don't rebase while
reading.
- withSQLConf("spark.test.forceNoRebase" -> "true") {
- checkAnswer(
+ // Force to not rebase to prove the written datetime values
are rebased
+ // and we will get wrong result if we don't rebase while
reading.
+ withSQLConf("spark.test.forceNoRebase" -> "true") {
+ checkAnswer(
spark.read.parquet(path).select($"ts".cast("string")),
Seq.tabulate(N)(_ => Row(nonRebased)))
}
}
+ }
Review comment:
The indentation is wrong here too.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]