This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 8eaf92d4b ORC-1866: Avoid zlib decompression infinite loop
8eaf92d4b is described below

commit 8eaf92d4b586fa8dda415a45ccb0801c6cc034d9
Author: sychen <[email protected]>
AuthorDate: Mon Mar 31 07:13:10 2025 +0900

    ORC-1866: Avoid zlib decompression infinite loop
    
    ### What changes were proposed in this pull request?
    This PR aims to make ZlibCodec fail fast when decompressing damaged 
files.
    
    ### Why are the changes needed?
    
    This is a long-standing issue. The decompress method implemented by 
ZlibCodec may enter an infinite loop when encountering some corrupt files.
    
    jstack
    ```java
    "main" #1 [4611] prio=5 os_prio=31 cpu=55921.47ms elapsed=57.53s 
tid=0x0000000139014600 nid=4611 runnable  [0x000000016d9fa000]
       java.lang.Thread.State: RUNNABLE
            at java.util.zip.Inflater.inflateBytesBytes(java.base21.0.5/Native 
Method)
            at java.util.zip.Inflater.inflate(java.base21.0.5/Inflater.java:376)
            - locked <0x00000004367befc0> (a 
java.util.zip.Inflater$InflaterZStreamRef)
            at org.apache.orc.impl.ZlibCodec.decompress(ZlibCodec.java:168)
            at 
org.apache.orc.impl.InStream$CompressedStream.readHeader(InStream.java:521)
            at 
org.apache.orc.impl.InStream$CompressedStream.ensureUncompressed(InStream.java:548)
            at 
org.apache.orc.impl.InStream$CompressedStream.read(InStream.java:535)
            at 
org.apache.orc.impl.TreeReaderFactory$BytesColumnVectorUtil.commonReadByteArrays(TreeReaderFactory.java:2052)
            at 
org.apache.orc.impl.TreeReaderFactory$BytesColumnVectorUtil.readOrcByteArrays(TreeReaderFactory.java:2071)
            at 
org.apache.orc.impl.TreeReaderFactory$StringDirectTreeReader.nextVector(TreeReaderFactory.java:2169)
            at 
org.apache.orc.impl.TreeReaderFactory$StringTreeReader.nextVector(TreeReaderFactory.java:2001)
            at 
org.apache.orc.impl.reader.tree.StructBatchReader.readBatchColumn(StructBatchReader.java:65)
            at 
org.apache.orc.impl.reader.tree.StructBatchReader.nextBatchForLevel(StructBatchReader.java:100)
            at 
org.apache.orc.impl.reader.tree.StructBatchReader.nextBatch(StructBatchReader.java:77)
            at 
org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1432)
            at org.apache.orc.tools.PrintData.printJsonData(PrintData.java:208)
            at org.apache.orc.tools.PrintData.main(PrintData.java:288)
            at org.apache.orc.tools.Driver.main(Driver.java:120)
    ```
    
    ### How was this patch tested?
    1. local test
    2. Add UT
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #2127 from cxzl25/zlib_infinite_loop.
    
    Authored-by: sychen <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../src/java/org/apache/orc/impl/ZlibCodec.java    |  11 ++++++++
 .../src/test/org/apache/orc/impl/TestZlib.java     |  28 +++++++++++++++++++++
 java/core/src/test/resources/orc_corrupt_zlib.orc  | Bin 0 -> 301 bytes
 3 files changed, 39 insertions(+)

diff --git a/java/core/src/java/org/apache/orc/impl/ZlibCodec.java 
b/java/core/src/java/org/apache/orc/impl/ZlibCodec.java
index 398ac0d16..d4275a4c2 100644
--- a/java/core/src/java/org/apache/orc/impl/ZlibCodec.java
+++ b/java/core/src/java/org/apache/orc/impl/ZlibCodec.java
@@ -169,6 +169,17 @@ public class ZlibCodec implements CompressionCodec, 
DirectDecompressionCodec {
                                        out.arrayOffset() + out.position(),
                                        out.remaining());
           out.position(count + out.position());
+
+          if (!inflater.finished() && !inflater.needsDictionary() && 
!inflater.needsInput() &&
+              count == 0) {
+            if (out.remaining() == 0) {
+              throw new IOException("Decompress output buffer too small. in = 
" + in +
+                  ", out = " + out);
+            } else {
+              throw new IOException("Decompress error. in = " + in +
+                  ", out = " + out);
+            }
+          }
         } catch (DataFormatException dfe) {
           throw new IOException("Bad compression data", dfe);
         }
diff --git a/java/core/src/test/org/apache/orc/impl/TestZlib.java 
b/java/core/src/test/org/apache/orc/impl/TestZlib.java
index 4ca62ca2a..6e940923e 100644
--- a/java/core/src/test/org/apache/orc/impl/TestZlib.java
+++ b/java/core/src/test/org/apache/orc/impl/TestZlib.java
@@ -18,13 +18,21 @@
 
 package org.apache.orc.impl;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.orc.CompressionCodec;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
 import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
 import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
 public class TestZlib {
@@ -54,4 +62,24 @@ public class TestZlib {
       // EXPECTED
     }
   }
+
+  @Test
+  public void testCorruptZlibFile() {
+    Configuration conf = new Configuration();
+    Path testFilePath = new Path(ClassLoader.
+        getSystemResource("orc_corrupt_zlib.orc").getPath());
+
+    IOException exception = assertThrows(
+        IOException.class,
+        () -> {
+          try (Reader reader = OrcFile.createReader(testFilePath, 
OrcFile.readerOptions(conf))) {
+            RecordReader rows = reader.rows();
+            VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+            while (rows.nextBatch(batch)) {
+            }
+          }
+        }
+    );
+    assertTrue(exception.getMessage().contains("Decompress output buffer too 
small"));
+  }
 }
diff --git a/java/core/src/test/resources/orc_corrupt_zlib.orc 
b/java/core/src/test/resources/orc_corrupt_zlib.orc
new file mode 100644
index 000000000..e083a07c8
Binary files /dev/null and b/java/core/src/test/resources/orc_corrupt_zlib.orc 
differ

Reply via email to