This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new ca0cb0494 ORC-1866: Avoid zlib decompression infinite loop
ca0cb0494 is described below
commit ca0cb04948d9142cb0bb6c787e85e9934b02db1c
Author: sychen <[email protected]>
AuthorDate: Mon Mar 31 07:13:10 2025 +0900
ORC-1866: Avoid zlib decompression infinite loop
### What changes were proposed in this pull request?
This PR aims to make ZlibCodec decompression of damaged files fail
fast.
### Why are the changes needed?
This is a long-standing issue. The decompress method implemented by
ZlibCodec may enter an infinite loop when encountering some corrupt files.
jstack
```java
"main" #1 [4611] prio=5 os_prio=31 cpu=55921.47ms elapsed=57.53s
tid=0x0000000139014600 nid=4611 runnable [0x000000016d9fa000]
java.lang.Thread.State: RUNNABLE
at java.util.zip.Inflater.inflateBytesBytes(java.base21.0.5/Native
Method)
at java.util.zip.Inflater.inflate(java.base21.0.5/Inflater.java:376)
- locked <0x00000004367befc0> (a
java.util.zip.Inflater$InflaterZStreamRef)
at org.apache.orc.impl.ZlibCodec.decompress(ZlibCodec.java:168)
at
org.apache.orc.impl.InStream$CompressedStream.readHeader(InStream.java:521)
at
org.apache.orc.impl.InStream$CompressedStream.ensureUncompressed(InStream.java:548)
at
org.apache.orc.impl.InStream$CompressedStream.read(InStream.java:535)
at
org.apache.orc.impl.TreeReaderFactory$BytesColumnVectorUtil.commonReadByteArrays(TreeReaderFactory.java:2052)
at
org.apache.orc.impl.TreeReaderFactory$BytesColumnVectorUtil.readOrcByteArrays(TreeReaderFactory.java:2071)
at
org.apache.orc.impl.TreeReaderFactory$StringDirectTreeReader.nextVector(TreeReaderFactory.java:2169)
at
org.apache.orc.impl.TreeReaderFactory$StringTreeReader.nextVector(TreeReaderFactory.java:2001)
at
org.apache.orc.impl.reader.tree.StructBatchReader.readBatchColumn(StructBatchReader.java:65)
at
org.apache.orc.impl.reader.tree.StructBatchReader.nextBatchForLevel(StructBatchReader.java:100)
at
org.apache.orc.impl.reader.tree.StructBatchReader.nextBatch(StructBatchReader.java:77)
at
org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1432)
at org.apache.orc.tools.PrintData.printJsonData(PrintData.java:208)
at org.apache.orc.tools.PrintData.main(PrintData.java:288)
at org.apache.orc.tools.Driver.main(Driver.java:120)
```
### How was this patch tested?
1. local test
2. Add UT
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #2127 from cxzl25/zlib_infinite_loop.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 8eaf92d4b586fa8dda415a45ccb0801c6cc034d9)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../src/java/org/apache/orc/impl/ZlibCodec.java | 11 ++++++++
.../src/test/org/apache/orc/impl/TestZlib.java | 28 +++++++++++++++++++++
java/core/src/test/resources/orc_corrupt_zlib.orc | Bin 0 -> 301 bytes
3 files changed, 39 insertions(+)
diff --git a/java/core/src/java/org/apache/orc/impl/ZlibCodec.java
b/java/core/src/java/org/apache/orc/impl/ZlibCodec.java
index 398ac0d16..d4275a4c2 100644
--- a/java/core/src/java/org/apache/orc/impl/ZlibCodec.java
+++ b/java/core/src/java/org/apache/orc/impl/ZlibCodec.java
@@ -169,6 +169,17 @@ public class ZlibCodec implements CompressionCodec,
DirectDecompressionCodec {
out.arrayOffset() + out.position(),
out.remaining());
out.position(count + out.position());
+
+ if (!inflater.finished() && !inflater.needsDictionary() &&
!inflater.needsInput() &&
+ count == 0) {
+ if (out.remaining() == 0) {
+ throw new IOException("Decompress output buffer too small. in =
" + in +
+ ", out = " + out);
+ } else {
+ throw new IOException("Decompress error. in = " + in +
+ ", out = " + out);
+ }
+ }
} catch (DataFormatException dfe) {
throw new IOException("Bad compression data", dfe);
}
diff --git a/java/core/src/test/org/apache/orc/impl/TestZlib.java
b/java/core/src/test/org/apache/orc/impl/TestZlib.java
index 4ca62ca2a..6e940923e 100644
--- a/java/core/src/test/org/apache/orc/impl/TestZlib.java
+++ b/java/core/src/test/org/apache/orc/impl/TestZlib.java
@@ -18,13 +18,21 @@
package org.apache.orc.impl;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionCodec;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.ByteBuffer;
import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
public class TestZlib {
@@ -54,4 +62,24 @@ public class TestZlib {
// EXPECTED
}
}
+
+ @Test
+ public void testCorruptZlibFile() {
+ Configuration conf = new Configuration();
+ Path testFilePath = new Path(ClassLoader.
+ getSystemResource("orc_corrupt_zlib.orc").getPath());
+
+ IOException exception = assertThrows(
+ IOException.class,
+ () -> {
+ try (Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf))) {
+ RecordReader rows = reader.rows();
+ VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+ while (rows.nextBatch(batch)) {
+ }
+ }
+ }
+ );
+ assertTrue(exception.getMessage().contains("Decompress output buffer too
small"));
+ }
}
diff --git a/java/core/src/test/resources/orc_corrupt_zlib.orc
b/java/core/src/test/resources/orc_corrupt_zlib.orc
new file mode 100644
index 000000000..e083a07c8
Binary files /dev/null and b/java/core/src/test/resources/orc_corrupt_zlib.orc
differ