This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new baf4c2355 ORC-1577: Use `ZSTD` as the default compression
baf4c2355 is described below

commit baf4c23557afc06d625532ee98b5d889387ba890
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Mon Jan 8 17:30:10 2024 -0800

    ORC-1577: Use `ZSTD` as the default compression
    
    ### What changes were proposed in this pull request?
    
    This PR aims to use `ZSTD` as the default compression starting with Apache ORC 2.0.0.
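
    For workloads that still need the previous codec, it can be pinned per writer with the same `compress(...)` option this patch adds to `TestFileDump`. A minimal sketch, assuming an illustrative output path and schema that are not part of this patch:

    ```java
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.CompressionKind;
    import org.apache.orc.OrcFile;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.Writer;

    public class KeepZlibWriter {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Illustrative schema and output path, not taken from this patch.
        TypeDescription schema = TypeDescription.fromString("struct<x:int>");
        Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
            OrcFile.writerOptions(conf)
                .setSchema(schema)
                .compress(CompressionKind.ZLIB));  // override the new ZSTD default
        // ... add VectorizedRowBatch rows here in a real job ...
        writer.close();
      }
    }
    ```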
    
    ### Why are the changes needed?
    
    Apache ORC has supported ZStandard since 1.6.0.
    
    ZStandard is known to be better than Gzip in terms of both size and speed.
    
    - _The Rise of ZStandard: Apache Spark/Parquet/ORC/Avro_
        - [Slides](https://www.slideshare.net/databricks/the-rise-of-zstandard-apache-sparkparquetorcavro)
        - [YouTube](https://youtu.be/dTGxhHwjONY)
    
    ### How was this patch tested?
    
    Pass the CIs.
    
    Closes #1733 from dongjoon-hyun/ORC-1577.
    
    Authored-by: Dongjoon Hyun <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 c++/src/Writer.cc                                          | 2 +-
 java/core/src/java/org/apache/orc/OrcConf.java             | 2 +-
 java/core/src/test/org/apache/orc/TestVectorOrcFile.java   | 8 ++++----
 java/tools/src/test/org/apache/orc/tools/TestFileDump.java | 1 +
 site/_docs/core-java-config.md                             | 4 ++--
 site/_docs/hive-config.md                                  | 2 +-
 site/_docs/spark-config.md                                 | 2 +-
 7 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc
index e478fc7ac..a98084833 100644
--- a/c++/src/Writer.cc
+++ b/c++/src/Writer.cc
@@ -51,7 +51,7 @@ namespace orc {
       stripeSize = 64 * 1024 * 1024;                               // 64M
       compressionBlockSize = 64 * 1024;                            // 64K
       rowIndexStride = 10000;
-      compression = CompressionKind_ZLIB;
+      compression = CompressionKind_ZSTD;
       compressionStrategy = CompressionStrategy_SPEED;
       memoryPool = getDefaultPool();
       paddingTolerance = 0.0;
diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java
index 900ab56fc..7e5296f52 100644
--- a/java/core/src/java/org/apache/orc/OrcConf.java
+++ b/java/core/src/java/org/apache/orc/OrcConf.java
@@ -52,7 +52,7 @@ public enum OrcConf {
   BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding",
       true,
       "Define whether stripes should be padded to the HDFS block boundaries."),
-  COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
+  COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZSTD",
       "Define the default compression codec for ORC file"),
   WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
       "Define the version of the file to write. Possible values are 0.11 
and\n"+
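
Illustration only, not part of this patch: a minimal sketch of reading and overriding the `orc.compress` default above through `OrcConf` on a Hadoop `Configuration`, assuming the usual `getString`/`setString` helpers; pinning `ZLIB` is just an example of keeping the previous behavior.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.orc.OrcConf;

public class CompressDefaultSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // With the property unset, the codec now resolves to the new "ZSTD" default.
    System.out.println(OrcConf.COMPRESS.getString(conf));
    // Jobs that depend on the previous behavior can pin it explicitly.
    OrcConf.COMPRESS.setString(conf, "ZLIB");
    System.out.println(OrcConf.COMPRESS.getString(conf));
  }
}
```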
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index bb6bc9e79..2dacb8d60 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -538,7 +538,7 @@ public class TestVectorOrcFile {
 
     assertEquals(3, stats[1].getNumberOfValues());
     assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
-    assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15", stats[1].toString());
+    assertEquals("count: 3 hasNull: true bytesOnDisk: 30 sum: 15", stats[1].toString());
 
     assertEquals(3, stats[2].getNumberOfValues());
     assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
@@ -1255,7 +1255,7 @@ public class TestVectorOrcFile {
     assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 
0.0001);
     assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 
0.0001);
     assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
-    assertEquals("count: 2 hasNull: false bytesOnDisk: 15 min: -15.0 max: -5.0 sum: -20.0",
+    assertEquals("count: 2 hasNull: false bytesOnDisk: 19 min: -15.0 max: -5.0 sum: -20.0",
         stats[7].toString());
 
     assertEquals("count: 2 hasNull: false bytesOnDisk: " +
@@ -3961,7 +3961,7 @@ public class TestVectorOrcFile {
     // test reading with no keys
     Reader reader = OrcFile.createReader(merge1, OrcFile.readerOptions(conf));
     assertEquals(9 * 1024, reader.getNumberOfRows());
-    assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
+    assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
     assertEquals(1000, reader.getRowIndexStride());
     assertEquals(0xc00, reader.getCompressionSize());
     assertEquals(fileFormat, reader.getFileVersion());
@@ -4107,7 +4107,7 @@ public class TestVectorOrcFile {
 
     reader = OrcFile.createReader(merge2, OrcFile.readerOptions(conf));
     assertEquals(2 * 3 * 1024, reader.getNumberOfRows());
-    assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
+    assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
     assertEquals(0x800, reader.getCompressionSize());
     assertEquals(1000, reader.getRowIndexStride());
     assertEquals(fileFormat, reader.getFileVersion());
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index a8dc70a99..5db444de0 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -588,6 +588,7 @@ public class TestFileDump {
     Writer writer = OrcFile.createWriter(testFilePath,
         OrcFile.writerOptions(conf)
             .setSchema(schema)
+            .compress(CompressionKind.ZLIB)
             .rowIndexStride(1000)
             .stripeSize(10000)
             .bufferSize(10000));
diff --git a/site/_docs/core-java-config.md b/site/_docs/core-java-config.md
index 38e0ed16a..5b7996555 100644
--- a/site/_docs/core-java-config.md
+++ b/site/_docs/core-java-config.md
@@ -69,7 +69,7 @@ permalink: /docs/core-java-config.html
 </tr>
 <tr>
   <td><code>orc.compress</code></td>
-  <td>ZLIB</td>
+  <td>ZSTD</td>
   <td>
     Define the default compression codec for ORC file
   </td>
@@ -396,4 +396,4 @@ permalink: /docs/core-java-config.html
     The maximum number of child elements to buffer before the ORC row writer writes the batch to the file.
   </td>
 </tr>
-</table>
\ No newline at end of file
+</table>
diff --git a/site/_docs/hive-config.md b/site/_docs/hive-config.md
index 29dc29dfb..029faa9c6 100644
--- a/site/_docs/hive-config.md
+++ b/site/_docs/hive-config.md
@@ -12,7 +12,7 @@ with the same options.
 
 Key                      | Default     | Notes
 :----------------------- | :---------- | :------------------------
-orc.compress             | ZLIB        | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
+orc.compress             | ZSTD        | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
 orc.compress.size        | 262,144     | compression chunk size
 orc.stripe.size          | 67,108,864  | memory buffer in bytes for writing
 orc.row.index.stride     | 10,000      | number of rows between index entries
diff --git a/site/_docs/spark-config.md b/site/_docs/spark-config.md
index b8fbb6db0..4d3ba359e 100644
--- a/site/_docs/spark-config.md
+++ b/site/_docs/spark-config.md
@@ -12,7 +12,7 @@ with the same options.
 
 Key                      | Default     | Notes
 :----------------------- | :---------- | :------------------------
-orc.compress             | ZLIB        | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
+orc.compress             | ZSTD        | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
 orc.compress.size        | 262,144     | compression chunk size
 orc.stripe.size          | 67,108,864  | memory buffer in bytes for writing
 orc.row.index.stride     | 10,000      | number of rows between index entries
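
As a hedged aside to the Spark table above: the codec can also be overridden per write from Spark; the `compression` data source option, the local master, and the output path below are illustrative assumptions rather than part of this patch.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkOrcCodecSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("orc-codec-sketch")
        .master("local[*]")               // local master only for the sketch
        .getOrCreate();

    Dataset<Row> df = spark.range(1000).toDF("id");

    df.write()
        .format("orc")
        .option("compression", "zlib")    // per-write override of the default codec
        .save("/tmp/orc-zlib-out");       // illustrative output path

    spark.stop();
  }
}
```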
