This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new c4bd03a1c ORC-1373: Provide directional exceptions instead of NPE in 
case of `DynamicByteArray` length overflow
c4bd03a1c is described below

commit c4bd03a1c7805bfc2088ba31c8bccc986d2d511c
Author: sychen <[email protected]>
AuthorDate: Sun Feb 19 13:33:24 2023 -0800

    ORC-1373: Provide directional exceptions instead of NPE in case of 
`DynamicByteArray` length overflow
    
    ### What changes were proposed in this pull request?
    
    When `DynamicByteArray` overflows while calculating `chunkIndex`, it throws an NPE.
    We can instead throw a descriptive exception that tells users which ORC
    parameters to configure in order to avoid this problem.
    
    ### Why are the changes needed?
    
    When the written string is very large, the grow calculation may overflow,
    causing the data array not to be expanded, which then results in an NPE.
    
    org.apache.orc.impl.DynamicByteArray#add(byte[], int, int)
    ```java
    grow((length + valueLength) / chunkSize);
    ```
    
    #### Log
    
    ```java
    Caused by: java.lang.NullPointerException
            at java.lang.System.arraycopy(Native Method)
            at 
org.apache.orc.impl.DynamicByteArray.add(DynamicByteArray.java:115)
            at 
org.apache.orc.impl.StringRedBlackTree.addNewKey(StringRedBlackTree.java:48)
            at 
org.apache.orc.impl.StringRedBlackTree.add(StringRedBlackTree.java:60)
            at 
org.apache.orc.impl.writer.StringTreeWriter.writeBatch(StringTreeWriter.java:69)
            at 
org.apache.orc.impl.writer.StructTreeWriter.writeRootBatch(StructTreeWriter.java:56)
            at org.apache.orc.impl.WriterImpl.addRowBatch(WriterImpl.java:696)
    ```
    
    ### How was this patch tested?
    
    #### Local Test
    org.apache.orc.impl.TestDynamicArray#testBigByteArray
    ```java
     @Test
      public void testBigByteArray() {
        DynamicByteArray dba = new DynamicByteArray(128, 32 * 1024);
    
        byte[] val = new byte[1024];
        dba.add(val, 0, val.length);
    
        byte[] bigVal = new byte[Integer.MAX_VALUE - 16];
        dba.add(bigVal, 0, bigVal.length);
      }
    ```
    
    #### Output
    
    ```java
    java.lang.RuntimeException: chunkIndex overflow:-65535. You can set 
orc.column.encoding.direct=columnName, or orc.dictionary.key.threshold=0 to 
turn off dictionary encoding.
            at 
org.apache.orc.impl.DynamicByteArray.grow(DynamicByteArray.java:73)
            at 
org.apache.orc.impl.DynamicByteArray.add(DynamicByteArray.java:122)
            at 
org.apache.orc.impl.TestDynamicArray.testBigByteArray(TestDynamicArray.java:36)
    ```
    
    Closes #1412 from cxzl25/ORC-1373.
    
    Authored-by: sychen <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../java/org/apache/orc/impl/DynamicByteArray.java | 11 ++++++++++-
 .../test/org/apache/orc/impl/TestDynamicArray.java | 22 ++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java 
b/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java
index 851f0f144..d88cbcd45 100644
--- a/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java
+++ b/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java
@@ -18,6 +18,7 @@
 package org.apache.orc.impl;
 
 import org.apache.hadoop.io.Text;
+import org.apache.orc.OrcConf;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -52,14 +53,22 @@ public final class DynamicByteArray {
 
   /**
    * Ensure that the given index is valid.
+   * Throws an exception if chunkIndex is negative.
    */
   private void grow(int chunkIndex) {
+    if (chunkIndex < 0) {
+      throw new RuntimeException(String.format("chunkIndex overflow:%d. " +
+        "You can set %s=columnName, or %s=0 to turn off dictionary encoding.",
+        chunkIndex,
+        OrcConf.DIRECT_ENCODING_COLUMNS.getAttribute(),
+        OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute()));
+    }
     if (chunkIndex >= initializedChunks) {
       if (chunkIndex >= data.length) {
         int newSize = Math.max(chunkIndex + 1, 2 * data.length);
         data = Arrays.copyOf(data, newSize);
       }
-      for(int i=initializedChunks; i <= chunkIndex; ++i) {
+      for (int i = initializedChunks; i <= chunkIndex; ++i) {
         data[i] = new byte[chunkSize];
       }
       initializedChunks = chunkIndex + 1;
diff --git a/java/core/src/test/org/apache/orc/impl/TestDynamicArray.java 
b/java/core/src/test/org/apache/orc/impl/TestDynamicArray.java
index 1179b333c..5c4f199e6 100644
--- a/java/core/src/test/org/apache/orc/impl/TestDynamicArray.java
+++ b/java/core/src/test/org/apache/orc/impl/TestDynamicArray.java
@@ -22,6 +22,7 @@ import org.junit.jupiter.api.Test;
 import java.util.Random;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 
 public class TestDynamicArray {
 
@@ -91,4 +92,25 @@ public class TestDynamicArray {
     DynamicIntArray dia = new DynamicIntArray();
     assertEquals("{}", dia.toString());
   }
+
+  @Test
+  public void testByteArrayOverflow() {
+    DynamicByteArray dba = new DynamicByteArray();
+
+    byte[] val = new byte[1024];
+    dba.add(val, 0, val.length);
+
+    byte[] bigVal = new byte[2048];
+    RuntimeException exception = assertThrows(
+      RuntimeException.class,
+      // Need to construct a large array, limited by the heap limit of UT, may 
cause OOM.
+      // The add method does not check whether byte[] and length are 
consistent,
+      // so it is a bit hacky.
+      () -> dba.add(bigVal, 0, Integer.MAX_VALUE - 16));
+
+    assertEquals("chunkIndex overflow:-65535. " +
+      "You can set orc.column.encoding.direct=columnName, " +
+      "or orc.dictionary.key.threshold=0 to turn off dictionary encoding.",
+      exception.getMessage());
+  }
 }

Reply via email to