This is an automated email from the ASF dual-hosted git repository.
beliefer pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push:
new b2103731bcf [SPARK-45484][SQL][3.5] Deprecated the incorrect parquet
compression codec lz4raw
b2103731bcf is described below
commit b2103731bcfe7e0bee3b1302c773e46f80badcc9
Author: Jiaan Geng <[email protected]>
AuthorDate: Tue Oct 17 09:50:39 2023 +0800
[SPARK-45484][SQL][3.5] Deprecated the incorrect parquet compression codec
lz4raw
### What changes were proposed in this pull request?
According to the discussion at
https://github.com/apache/spark/pull/43310#issuecomment-1757139681, this PR
deprecates the incorrect parquet compression codec `lz4raw` at Spark 3.5.1
and adds a warning log.
The warning log informs users that `lz4raw` will be removed at Apache
Spark 4.0.0.
### Why are the changes needed?
Deprecated the incorrect parquet compression codec `lz4raw`.
### Does this PR introduce _any_ user-facing change?
'Yes'.
Users will see the warning log below.
`Parquet compression codec 'lz4raw' is deprecated, please use 'lz4_raw'`
### How was this patch tested?
Existing test cases and new test cases.
### Was this patch authored or co-authored using generative AI tooling?
'No'.
Closes #43330 from beliefer/SPARK-45484_3.5.
Authored-by: Jiaan Geng <[email protected]>
Signed-off-by: Jiaan Geng <[email protected]>
---
.../org/apache/spark/sql/internal/SQLConf.scala | 14 ++++++++++--
.../datasources/parquet/ParquetOptions.scala | 8 ++++++-
.../datasources/FileSourceCodecSuite.scala | 2 +-
.../ParquetCompressionCodecPrecedenceSuite.scala | 25 ++++++++++++++++++----
4 files changed, 41 insertions(+), 8 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 73d3756ef6b..427d0480190 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -995,12 +995,22 @@ object SQLConf {
"`parquet.compression` is specified in the table-specific
options/properties, the " +
"precedence would be `compression`, `parquet.compression`, " +
"`spark.sql.parquet.compression.codec`. Acceptable values include: none,
uncompressed, " +
- "snappy, gzip, lzo, brotli, lz4, lz4raw, zstd.")
+ "snappy, gzip, lzo, brotli, lz4, lz4raw, lz4_raw, zstd.")
.version("1.1.1")
.stringConf
.transform(_.toLowerCase(Locale.ROOT))
.checkValues(
- Set("none", "uncompressed", "snappy", "gzip", "lzo", "brotli", "lz4",
"lz4raw", "zstd"))
+ Set(
+ "none",
+ "uncompressed",
+ "snappy",
+ "gzip",
+ "lzo",
+ "brotli",
+ "lz4",
+ "lz4raw",
+ "lz4_raw",
+ "zstd"))
.createWithDefault("snappy")
val PARQUET_FILTER_PUSHDOWN_ENABLED =
buildConf("spark.sql.parquet.filterPushdown")
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
index 023d2460959..95869b6fbb9 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
@@ -22,6 +22,7 @@ import java.util.Locale
import org.apache.parquet.hadoop.ParquetOutputFormat
import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.{DataSourceOptions, FileSourceOptions}
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf
@@ -32,7 +33,7 @@ import org.apache.spark.sql.internal.SQLConf
class ParquetOptions(
@transient private val parameters: CaseInsensitiveMap[String],
@transient private val sqlConf: SQLConf)
- extends FileSourceOptions(parameters) {
+ extends FileSourceOptions(parameters) with Logging {
import ParquetOptions._
@@ -59,6 +60,9 @@ class ParquetOptions(
throw new IllegalArgumentException(s"Codec [$codecName] " +
s"is not available. Available codecs are ${availableCodecs.mkString(",
")}.")
}
+ if (codecName == "lz4raw") {
+ log.warn("Parquet compression codec 'lz4raw' is deprecated, please use
'lz4_raw'")
+ }
shortParquetCompressionCodecNames(codecName).name()
}
@@ -96,7 +100,9 @@ object ParquetOptions extends DataSourceOptions {
"lzo" -> CompressionCodecName.LZO,
"brotli" -> CompressionCodecName.BROTLI,
"lz4" -> CompressionCodecName.LZ4,
+ // Deprecated, to be removed at Spark 4.0.0, please use 'lz4_raw' instead.
"lz4raw" -> CompressionCodecName.LZ4_RAW,
+ "lz4_raw" -> CompressionCodecName.LZ4_RAW,
"zstd" -> CompressionCodecName.ZSTD)
def getParquetCompressionCodecName(name: String): String = {
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
index 09a348cd294..9f3d6ff48d4 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
@@ -59,7 +59,7 @@ class ParquetCodecSuite extends FileSourceCodecSuite {
// Exclude "brotli" because the com.github.rdblue:brotli-codec dependency is
not available
// on Maven Central.
override protected def availableCodecs: Seq[String] = {
- Seq("none", "uncompressed", "snappy", "gzip", "zstd", "lz4", "lz4raw")
+ Seq("none", "uncompressed", "snappy", "gzip", "zstd", "lz4", "lz4raw",
"lz4_raw")
}
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala
index ac0aad16f1e..27e2816ce9d 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala
@@ -29,9 +29,23 @@ import org.apache.spark.sql.test.SharedSparkSession
class ParquetCompressionCodecPrecedenceSuite extends ParquetTest with
SharedSparkSession {
test("Test `spark.sql.parquet.compression.codec` config") {
- Seq("NONE", "UNCOMPRESSED", "SNAPPY", "GZIP", "LZO", "LZ4", "BROTLI",
"ZSTD").foreach { c =>
+ Seq(
+ "NONE",
+ "UNCOMPRESSED",
+ "SNAPPY",
+ "GZIP",
+ "LZO",
+ "LZ4",
+ "BROTLI",
+ "ZSTD",
+ "LZ4RAW",
+ "LZ4_RAW").foreach { c =>
withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> c) {
- val expected = if (c == "NONE") "UNCOMPRESSED" else c
+ val expected = c match {
+ case "NONE" => "UNCOMPRESSED"
+ case "LZ4RAW" => "LZ4_RAW"
+ case other => other
+ }
val option = new ParquetOptions(Map.empty[String, String],
spark.sessionState.conf)
assert(option.compressionCodecClassName == expected)
}
@@ -97,7 +111,10 @@ class ParquetCompressionCodecPrecedenceSuite extends
ParquetTest with SharedSpar
createTableWithCompression(tempTableName, isPartitioned,
compressionCodec, tmpDir)
val partitionPath = if (isPartitioned) "p=2" else ""
val path =
s"${tmpDir.getPath.stripSuffix("/")}/$tempTableName/$partitionPath"
- val realCompressionCodecs = getTableCompressionCodec(path)
+ val realCompressionCodecs = getTableCompressionCodec(path).map {
+ case "LZ4_RAW" if compressionCodec == "LZ4RAW" => "LZ4RAW"
+ case other => other
+ }
assert(realCompressionCodecs.forall(_ == compressionCodec))
}
}
@@ -105,7 +122,7 @@ class ParquetCompressionCodecPrecedenceSuite extends
ParquetTest with SharedSpar
test("Create parquet table with compression") {
Seq(true, false).foreach { isPartitioned =>
- val codecs = Seq("UNCOMPRESSED", "SNAPPY", "GZIP", "ZSTD", "LZ4")
+ val codecs = Seq("UNCOMPRESSED", "SNAPPY", "GZIP", "ZSTD", "LZ4",
"LZ4RAW", "LZ4_RAW")
codecs.foreach { compressionCodec =>
checkCompressionCodec(compressionCodec, isPartitioned)
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]