This is an automated email from the ASF dual-hosted git repository.

jbonofre pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-java.git


The following commit(s) were added to refs/heads/main by this push:
     new 77df3ecb2 GH-343: Fix BaseVariableWidthVector and 
BaseLargeVariableWidthVector offset buffer serialization (#989)
77df3ecb2 is described below

commit 77df3ecb2cf5517fb5d37a4b2806844e3b4700df
Author: Yicong Huang <[email protected]>
AuthorDate: Thu Mar 12 01:39:44 2026 -0700

    GH-343: Fix BaseVariableWidthVector and BaseLargeVariableWidthVector offset 
buffer serialization (#989)
    
    ## What's Changed
    
    Fix `BaseVariableWidthVector`/`BaseLargeVariableWidthVector` IPC
    serialization when `valueCount` is 0.
    
    ### Problem
    
    When `valueCount == 0`, `setReaderAndWriterIndex()` was setting
    `offsetBuffer.writerIndex(0)`, which means `readableBytes() == 0`. The
    IPC serializer uses `readableBytes()` to determine the buffer size, so 0
    bytes were written to the IPC stream. This crashes IPC readers in other
    libraries because the Arrow spec requires the offset buffer to contain
    at least one entry, `[0]`, even when the vector is empty.
    
    This is a follow-up to #967 which fixed the same issue in
    `ListVector`/`LargeListVector`.
    
    ### Fix
    
    Modify `setReaderAndWriterIndex()` to always set the offset buffer's
    `writerIndex` to `(valueCount + 1) * OFFSET_WIDTH`, and move that
    assignment outside the if/else branch. When the offset buffer's capacity
    is insufficient (e.g., an empty buffer from the constructor or one loaded
    via `loadFieldBuffers()`), a properly sized buffer is allocated on
    demand, the existing contents are copied into it, and the old buffer is
    released.
    
    
    ### Testing
    
    Added tests for empty `VarCharVector` and `LargeVarCharVector` verifying
    that, after `setValueCount(0)`, the offset buffer has at least
    `OFFSET_WIDTH` readable bytes and that `offset[0]` is 0.
    
    
    Closes #343
    
    ---------
    
    Co-authored-by: Yicong Huang <[email protected]>
---
 .../arrow/adapter/jdbc/ResultSetUtilityTest.java   | 22 ++++++++-----
 .../arrow/vector/BaseLargeVariableWidthVector.java | 16 +++++++--
 .../arrow/vector/BaseVariableWidthVector.java      | 16 +++++++--
 .../org/apache/arrow/vector/TestValueVector.java   | 38 ++++++++++++++++++++++
 4 files changed, 79 insertions(+), 13 deletions(-)

diff --git 
a/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java
 
b/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java
index c7dc9b279..e5039ccf5 100644
--- 
a/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java
+++ 
b/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java
@@ -43,15 +43,19 @@ public class ResultSetUtilityTest {
                 .setReuseVectorSchemaRoot(reuseVectorSchemaRoot)
                 .build();
 
-        ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, 
config);
-        assertTrue(iter.hasNext(), "Iterator on zero row ResultSet should 
haveNext() before use");
-        VectorSchemaRoot root = iter.next();
-        assertNotNull(root, "VectorSchemaRoot from first next() result should 
never be null");
-        assertEquals(
-            0, root.getRowCount(), "VectorSchemaRoot from empty ResultSet 
should have zero rows");
-        assertFalse(
-            iter.hasNext(),
-            "hasNext() should return false on empty ResultSets after initial 
next() call");
+        try (ArrowVectorIterator iter = 
JdbcToArrow.sqlToArrowVectorIterator(rs, config)) {
+          assertTrue(iter.hasNext(), "Iterator on zero row ResultSet should 
haveNext() before use");
+          VectorSchemaRoot root = iter.next();
+          assertNotNull(root, "VectorSchemaRoot from first next() result 
should never be null");
+          assertEquals(
+              0, root.getRowCount(), "VectorSchemaRoot from empty ResultSet 
should have zero rows");
+          assertFalse(
+              iter.hasNext(),
+              "hasNext() should return false on empty ResultSets after initial 
next() call");
+          if (!reuseVectorSchemaRoot) {
+            root.close();
+          }
+        }
       }
     }
   }
diff --git 
a/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
 
b/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
index 6c451f10a..3fac19578 100644
--- 
a/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
+++ 
b/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
@@ -373,14 +373,26 @@ public abstract class BaseLargeVariableWidthVector 
extends BaseValueVector
     valueBuffer.readerIndex(0);
     if (valueCount == 0) {
       validityBuffer.writerIndex(0);
-      offsetBuffer.writerIndex(0);
       valueBuffer.writerIndex(0);
     } else {
       final long lastDataOffset = getStartOffset(valueCount);
       
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
-      offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
       valueBuffer.writerIndex(lastDataOffset);
     }
+    // IPC serializer will determine readable bytes based on `readerIndex` and 
`writerIndex`.
+    // Both are set to 0 means 0 bytes are written to the IPC stream which 
will crash IPC readers
+    // in other libraries. According to Arrow spec, we should still output the 
offset buffer which
+    // is [0].
+    final long requiredOffsetBufferSize = (long) (valueCount + 1) * 
OFFSET_WIDTH;
+    if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
+      ArrowBuf newOffsetBuffer = 
allocateOffsetBuffer(requiredOffsetBufferSize);
+      if (offsetBuffer.capacity() > 0) {
+        newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
+      }
+      offsetBuffer.getReferenceManager().release();
+      offsetBuffer = newOffsetBuffer;
+    }
+    offsetBuffer.writerIndex(requiredOffsetBufferSize);
   }
 
   /** Same as {@link #allocateNewSafe()}. */
diff --git 
a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java 
b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
index 96e2afbd2..d5bd16725 100644
--- a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
+++ b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
@@ -389,14 +389,26 @@ public abstract class BaseVariableWidthVector extends 
BaseValueVector
     valueBuffer.readerIndex(0);
     if (valueCount == 0) {
       validityBuffer.writerIndex(0);
-      offsetBuffer.writerIndex(0);
       valueBuffer.writerIndex(0);
     } else {
       final int lastDataOffset = getStartOffset(valueCount);
       
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
-      offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
       valueBuffer.writerIndex(lastDataOffset);
     }
+    // IPC serializer will determine readable bytes based on `readerIndex` and 
`writerIndex`.
+    // Both are set to 0 means 0 bytes are written to the IPC stream which 
will crash IPC readers
+    // in other libraries. According to Arrow spec, we should still output the 
offset buffer which
+    // is [0].
+    final long requiredOffsetBufferSize = (long) (valueCount + 1) * 
OFFSET_WIDTH;
+    if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
+      ArrowBuf newOffsetBuffer = 
allocateOffsetBuffer(requiredOffsetBufferSize);
+      if (offsetBuffer.capacity() > 0) {
+        newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
+      }
+      offsetBuffer.getReferenceManager().release();
+      offsetBuffer = newOffsetBuffer;
+    }
+    offsetBuffer.writerIndex(requiredOffsetBufferSize);
   }
 
   /** Same as {@link #allocateNewSafe()}. */
diff --git a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java 
b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index df42d04e6..22c93b0cb 100644
--- a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -3940,4 +3940,42 @@ public class TestValueVector {
       }
     }
   }
+
+  @Test
+  public void testEmptyVarCharOffsetBuffer() {
+    // Validates that offset buffer has at least OFFSET_WIDTH bytes (for 
offset[0]=0)
+    // even when valueCount is 0, per Arrow specification.
+    try (VarCharVector vector = newVarCharVector("varchar", allocator)) {
+      vector.allocateNew();
+      vector.setValueCount(0);
+
+      List<ArrowBuf> buffers = vector.getFieldBuffers();
+      // buffers: [validity, offset, data]
+      assertTrue(
+          buffers.get(1).readableBytes() >= 
BaseVariableWidthVector.OFFSET_WIDTH,
+          "Offset buffer should have at least "
+              + BaseVariableWidthVector.OFFSET_WIDTH
+              + " bytes for offset[0]");
+      assertEquals(0, vector.getOffsetBuffer().getInt(0));
+    }
+  }
+
+  @Test
+  public void testEmptyLargeVarCharOffsetBuffer() {
+    // Validates that offset buffer has at least OFFSET_WIDTH bytes (for 
offset[0]=0)
+    // even when valueCount is 0, per Arrow specification.
+    try (LargeVarCharVector vector = new LargeVarCharVector("largevarchar", 
allocator)) {
+      vector.allocateNew();
+      vector.setValueCount(0);
+
+      List<ArrowBuf> buffers = vector.getFieldBuffers();
+      // buffers: [validity, offset, data]
+      assertTrue(
+          buffers.get(1).readableBytes() >= 
BaseLargeVariableWidthVector.OFFSET_WIDTH,
+          "Offset buffer should have at least "
+              + BaseLargeVariableWidthVector.OFFSET_WIDTH
+              + " bytes for offset[0]");
+      assertEquals(0, vector.getOffsetBuffer().getLong(0));
+    }
+  }
 }

Reply via email to