This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f378b506bf1 [SPARK-45470][SQL] Avoid paste string value of hive orc
compression kind
f378b506bf1 is described below
commit f378b506bf1fc116e5dc4786d786e50d4a56574a
Author: Jiaan Geng <[email protected]>
AuthorDate: Mon Oct 9 23:04:15 2023 -0700
[SPARK-45470][SQL] Avoid paste string value of hive orc compression kind
### What changes were proposed in this pull request?
Currently, Hive supports the ORC format with several compression codecs (please
refer to
[ql/src/java/org/apache/hadoop/hive/ql/io/orc/CompressionKind.java](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/io/orc/CompressionKind.java)).
Spark pasted many string literals for these compression codecs. This makes it
easy to introduce mistakes and reduces development efficiency.
### Why are the changes needed?
Avoid pasting string values of the Hive ORC compression kind.
### Does this PR introduce _any_ user-facing change?
'No'.
Just update inner implementation.
### How was this patch tested?
Existing test cases.
### Was this patch authored or co-authored using generative AI tooling?
'No'.
Closes #43296 from beliefer/SPARK-45470.
Authored-by: Jiaan Geng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../org/apache/spark/sql/hive/CompressionCodecSuite.scala | 14 ++++++++++----
.../spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala | 5 +++--
.../org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala | 4 +++-
3 files changed, 16 insertions(+), 7 deletions(-)
diff --git
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala
index 6669fbdfbde..a5d11f6e0e1 100644
---
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala
+++
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala
@@ -23,6 +23,7 @@ import java.util.Locale
import scala.jdk.CollectionConverters._
import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.ql.io.orc.CompressionKind
import org.apache.orc.OrcConf.COMPRESS
import org.apache.parquet.hadoop.ParquetOutputFormat
import org.scalatest.BeforeAndAfterAll
@@ -291,8 +292,10 @@ class CompressionCodecSuite extends TestHiveSingleton with
ParquetTest with Befo
tableCompressCodecs = List("UNCOMPRESSED", "SNAPPY", "GZIP"),
sessionCompressCodecs = List("SNAPPY", "GZIP", "SNAPPY"))
checkForTableWithCompressProp("orc",
- tableCompressCodecs = List("NONE", "SNAPPY", "ZLIB"),
- sessionCompressCodecs = List("SNAPPY", "ZLIB", "SNAPPY"))
+ tableCompressCodecs =
+ List(CompressionKind.NONE.name, CompressionKind.SNAPPY.name,
CompressionKind.ZLIB.name),
+ sessionCompressCodecs =
+ List(CompressionKind.SNAPPY.name, CompressionKind.ZLIB.name,
CompressionKind.SNAPPY.name))
}
test("table-level compression is not set but session-level compressions is
set ") {
@@ -301,7 +304,8 @@ class CompressionCodecSuite extends TestHiveSingleton with
ParquetTest with Befo
sessionCompressCodecs = List("UNCOMPRESSED", "SNAPPY", "GZIP"))
checkForTableWithCompressProp("orc",
tableCompressCodecs = List.empty,
- sessionCompressCodecs = List("NONE", "SNAPPY", "ZLIB"))
+ sessionCompressCodecs =
+ List(CompressionKind.NONE.name, CompressionKind.SNAPPY.name,
CompressionKind.ZLIB.name))
}
def checkTableWriteWithCompressionCodecs(format: String, compressCodecs:
List[String]): Unit = {
@@ -336,6 +340,8 @@ class CompressionCodecSuite extends TestHiveSingleton with
ParquetTest with Befo
test("test table containing mixed compression codec") {
checkTableWriteWithCompressionCodecs("parquet", List("UNCOMPRESSED",
"SNAPPY", "GZIP"))
- checkTableWriteWithCompressionCodecs("orc", List("NONE", "SNAPPY", "ZLIB"))
+ checkTableWriteWithCompressionCodecs(
+ "orc",
+ List(CompressionKind.NONE.name, CompressionKind.SNAPPY.name,
CompressionKind.ZLIB.name))
}
}
diff --git
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
index 3b82a6c458c..e9b6bd28823 100644
---
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
+++
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.orc
import java.io.File
import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.ql.io.orc.CompressionKind
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.catalog.CatalogUtils
@@ -98,7 +99,7 @@ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest {
val orcFilePath = maybeOrcFile.get.toPath.toString
val expectedCompressionKind =
OrcFileOperator.getFileReader(orcFilePath).get.getCompression
- assert("ZLIB" === expectedCompressionKind.name())
+ assert(CompressionKind.ZLIB.name() === expectedCompressionKind.name())
val copyDf = spark
.read
@@ -113,7 +114,7 @@ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest
{
.orc(file.getCanonicalPath)
val expectedCompressionKind =
OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression
- assert("SNAPPY" === expectedCompressionKind.name())
+ assert(CompressionKind.SNAPPY.name() === expectedCompressionKind.name())
}
}
}
diff --git
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala
index 9ee9ebc2282..43bcee5348a 100644
---
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala
+++
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala
@@ -21,6 +21,8 @@ import java.io.File
import scala.util.Random
+import org.apache.hadoop.hive.ql.io.orc.CompressionKind
+
import org.apache.spark.SparkConf
import org.apache.spark.benchmark.Benchmark
import org.apache.spark.sql.{DataFrame, SparkSession}
@@ -46,7 +48,7 @@ object OrcReadBenchmark extends SqlBasedBenchmark {
override def getSparkSession: SparkSession = {
val conf = new SparkConf()
- conf.set("orc.compression", "snappy")
+ conf.set("orc.compression", CompressionKind.SNAPPY.name())
val sparkSession = SparkSession.builder()
.master("local[1]")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]