This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 40d1d26dd ORC-1577: Use `ZSTD` as the default compression
40d1d26dd is described below
commit 40d1d26dd7f8618fc79b5dc64676b9ae611c3220
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Mon Jan 8 17:30:10 2024 -0800
ORC-1577: Use `ZSTD` as the default compression
### What changes were proposed in this pull request?
This PR aims to use `ZSTD` as the default compression starting with Apache ORC 2.0.0.
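For context only (not part of this commit), a minimal Java sketch of what the new default means for callers of the writer API: a writer created without an explicit codec now produces a ZSTD-compressed file, and callers can still request ZLIB per writer. The output paths and schema below are made-up examples.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class ZstdDefaultSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    TypeDescription schema = TypeDescription.fromString("struct<x:int,y:string>");
    Path path = new Path("/tmp/zstd-default.orc");  // hypothetical output path

    // With ORC 2.0.0, a writer built without .compress(...) defaults to ZSTD.
    Writer writer = OrcFile.createWriter(path,
        OrcFile.writerOptions(conf).setSchema(schema));
    writer.close();

    // Reading the footer back should now report ZSTD instead of ZLIB.
    try (Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf))) {
      System.out.println(reader.getCompressionKind());  // ZSTD
    }

    // Callers that depend on the previous default can still pin ZLIB explicitly,
    // as the TestFileDump change in this diff does.
    Writer zlibWriter = OrcFile.createWriter(new Path("/tmp/zlib-explicit.orc"),
        OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.ZLIB));
    zlibWriter.close();
  }
}
```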
### Why are the changes needed?
Apache ORC has supported ZStandard since 1.6.0.
ZStandard is known to outperform Gzip in both compression ratio and speed.
- _The Rise of ZStandard: Apache Spark/Parquet/ORC/Avro_
  - [Slides](https://www.slideshare.net/databricks/the-rise-of-zstandard-apache-sparkparquetorcavro)
  - [Youtube](https://youtu.be/dTGxhHwjONY)
### How was this patch tested?
Pass the CIs.
Closes #1733 from dongjoon-hyun/ORC-1577.
Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit baf4c23557afc06d625532ee98b5d889387ba890)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/src/Writer.cc | 2 +-
java/core/src/java/org/apache/orc/OrcConf.java | 2 +-
java/core/src/test/org/apache/orc/TestVectorOrcFile.java | 8 ++++----
java/tools/src/test/org/apache/orc/tools/TestFileDump.java | 1 +
site/_docs/core-java-config.md | 4 ++--
site/_docs/hive-config.md | 2 +-
site/_docs/spark-config.md | 2 +-
7 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc
index e478fc7ac..a98084833 100644
--- a/c++/src/Writer.cc
+++ b/c++/src/Writer.cc
@@ -51,7 +51,7 @@ namespace orc {
stripeSize = 64 * 1024 * 1024; // 64M
compressionBlockSize = 64 * 1024; // 64K
rowIndexStride = 10000;
- compression = CompressionKind_ZLIB;
+ compression = CompressionKind_ZSTD;
compressionStrategy = CompressionStrategy_SPEED;
memoryPool = getDefaultPool();
paddingTolerance = 0.0;
diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java
index 900ab56fc..7e5296f52 100644
--- a/java/core/src/java/org/apache/orc/OrcConf.java
+++ b/java/core/src/java/org/apache/orc/OrcConf.java
@@ -52,7 +52,7 @@ public enum OrcConf {
BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding",
true,
"Define whether stripes should be padded to the HDFS block boundaries."),
- COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
+ COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZSTD",
"Define the default compression codec for ORC file"),
WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
"Define the version of the file to write. Possible values are 0.11
and\n"+
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index bb6bc9e79..2dacb8d60 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -538,7 +538,7 @@ public class TestVectorOrcFile {
assertEquals(3, stats[1].getNumberOfValues());
assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
- assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15",
stats[1].toString());
+ assertEquals("count: 3 hasNull: true bytesOnDisk: 30 sum: 15",
stats[1].toString());
assertEquals(3, stats[2].getNumberOfValues());
assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
@@ -1255,7 +1255,7 @@ public class TestVectorOrcFile {
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
- assertEquals("count: 2 hasNull: false bytesOnDisk: 15 min: -15.0 max: -5.0 sum: -20.0",
+ assertEquals("count: 2 hasNull: false bytesOnDisk: 19 min: -15.0 max: -5.0 sum: -20.0",
stats[7].toString());
assertEquals("count: 2 hasNull: false bytesOnDisk: " +
@@ -3961,7 +3961,7 @@ public class TestVectorOrcFile {
// test reading with no keys
Reader reader = OrcFile.createReader(merge1, OrcFile.readerOptions(conf));
assertEquals(9 * 1024, reader.getNumberOfRows());
- assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
+ assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
assertEquals(1000, reader.getRowIndexStride());
assertEquals(0xc00, reader.getCompressionSize());
assertEquals(fileFormat, reader.getFileVersion());
@@ -4107,7 +4107,7 @@ public class TestVectorOrcFile {
reader = OrcFile.createReader(merge2, OrcFile.readerOptions(conf));
assertEquals(2 * 3 * 1024, reader.getNumberOfRows());
- assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
+ assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
assertEquals(0x800, reader.getCompressionSize());
assertEquals(1000, reader.getRowIndexStride());
assertEquals(fileFormat, reader.getFileVersion());
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index a8dc70a99..5db444de0 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -588,6 +588,7 @@ public class TestFileDump {
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.setSchema(schema)
+ .compress(CompressionKind.ZLIB)
.rowIndexStride(1000)
.stripeSize(10000)
.bufferSize(10000));
diff --git a/site/_docs/core-java-config.md b/site/_docs/core-java-config.md
index 38e0ed16a..5b7996555 100644
--- a/site/_docs/core-java-config.md
+++ b/site/_docs/core-java-config.md
@@ -69,7 +69,7 @@ permalink: /docs/core-java-config.html
</tr>
<tr>
<td><code>orc.compress</code></td>
- <td>ZLIB</td>
+ <td>ZSTD</td>
<td>
Define the default compression codec for ORC file
</td>
@@ -396,4 +396,4 @@ permalink: /docs/core-java-config.html
The maximum number of child elements to buffer before the ORC row writer
writes the batch to the file.
</td>
</tr>
-</table>
\ No newline at end of file
+</table>
diff --git a/site/_docs/hive-config.md b/site/_docs/hive-config.md
index 29dc29dfb..029faa9c6 100644
--- a/site/_docs/hive-config.md
+++ b/site/_docs/hive-config.md
@@ -12,7 +12,7 @@ with the same options.
Key | Default | Notes
:----------------------- | :---------- | :------------------------
-orc.compress | ZLIB | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
+orc.compress | ZSTD | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress.size | 262,144 | compression chunk size
orc.stripe.size | 67,108,864 | memory buffer in bytes for writing
orc.row.index.stride | 10,000 | number of rows between index entries
diff --git a/site/_docs/spark-config.md b/site/_docs/spark-config.md
index b8fbb6db0..4d3ba359e 100644
--- a/site/_docs/spark-config.md
+++ b/site/_docs/spark-config.md
@@ -12,7 +12,7 @@ with the same options.
Key | Default | Notes
:----------------------- | :---------- | :------------------------
-orc.compress | ZLIB | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
+orc.compress | ZSTD | high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}
orc.compress.size | 262,144 | compression chunk size
orc.stripe.size | 67,108,864 | memory buffer in bytes for writing
orc.row.index.stride | 10,000 | number of rows between index entries
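For context only (not part of this patch), a hedged sketch of how the documented `orc.compress` default surfaces through `OrcConf` in Java, and how a job can pin the previous codec through configuration; the class name below is made up.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.orc.OrcConf;

public class CompressConfSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // After this change, the built-in default for orc.compress is ZSTD.
    System.out.println(OrcConf.COMPRESS.getString(conf));  // ZSTD

    // Jobs that relied on the previous default can pin it back, equivalent to
    // setting orc.compress=ZLIB (or hive.exec.orc.default.compress=ZLIB).
    OrcConf.COMPRESS.setString(conf, "ZLIB");
    System.out.println(OrcConf.COMPRESS.getString(conf));  // ZLIB
  }
}
```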