This is an automated email from the ASF dual-hosted git repository.

jbonofre pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-java.git


The following commit(s) were added to refs/heads/main by this push:
     new 0f8a0808f GH-343: Fix ListVector offset buffer not properly serialized 
for nested empty arrays (#967)
0f8a0808f is described below

commit 0f8a0808fd9cf0bd22d3c6b40a2016ee724ce185
Author: Yicong Huang <[email protected]>
AuthorDate: Fri Jan 23 01:18:13 2026 -0800

    GH-343: Fix ListVector offset buffer not properly serialized for nested 
empty arrays (#967)
    
    ## What's Changed
    
    Fix `ListVector`/`LargeListVector` IPC serialization when `valueCount`
    is 0.
    
    ### Problem
    
    When `valueCount == 0`, `setReaderAndWriterIndex()` was setting
    `offsetBuffer.writerIndex(0)`, which means `readableBytes() == 0`. The
    IPC serializer uses `readableBytes()` to determine the buffer size, so 0
    bytes were written to the IPC stream. This crashes IPC readers in other
    libraries because the Arrow spec requires the offset buffer to have at
    least one entry, `[0]`.
    
    @viirya:
    
    > The offset buffers are allocated properly. But during IPC
    serialization, they are ignored.
    > ```
    >   public long readableBytes() {
    >       return writerIndex - readerIndex;
    >   }
    > ```
    > So when ListVector.setReaderAndWriterIndex() sets writerIndex(0) and
    readerIndex(0), readableBytes() returns 0 - 0 = 0.
    >
    > Then when MessageSerializer.writeBatchBuffers() calls
    WriteChannel.write(buffer), it writes 0 bytes.
    >
    > So the flow is:
    >
    > valueCount=0 → ListVector.setReaderAndWriterIndex() sets
    offsetBuffer.writerIndex(0)
    > VectorUnloader.getFieldBuffers() returns the buffer with writerIndex=0
    > MessageSerializer.writeBatchBuffers() writes the buffer
    > WriteChannel.write(buffer) checks buffer.readableBytes() which is 0
    > 0 bytes are written to the IPC stream
    > PyArrow reads the batch with the missing buffer → crash when other
    libraries try to read it
    
    ### Fix
    
    Simplify `setReaderAndWriterIndex()` to always use `(valueCount + 1) *
    OFFSET_WIDTH` for offset buffer's `writerIndex`. When `valueCount == 0`,
    this correctly sets `writerIndex` to `OFFSET_WIDTH`, ensuring
    `offset[0]` is included in serialization.
    
    ### Testing
    
    Added tests for nested empty lists verifying that the offset buffer has
    the correct `readableBytes()`.
    
    Closes #343.
    
    ---------
    
    Co-authored-by: Yicong Huang <[email protected]>
---
 .../apache/arrow/vector/complex/LargeListVector.java |  7 +++++--
 .../org/apache/arrow/vector/complex/ListVector.java  |  7 +++++--
 .../org/apache/arrow/vector/TestLargeListVector.java | 20 ++++++++++++++++++++
 .../java/org/apache/arrow/vector/TestListVector.java | 20 ++++++++++++++++++++
 4 files changed, 50 insertions(+), 4 deletions(-)

diff --git 
a/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java 
b/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
index 997b5a8b7..92dd3eaef 100644
--- a/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
+++ b/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
@@ -309,11 +309,14 @@ public class LargeListVector extends BaseValueVector
     offsetBuffer.readerIndex(0);
     if (valueCount == 0) {
       validityBuffer.writerIndex(0);
-      offsetBuffer.writerIndex(0);
     } else {
       
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
-      offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH);
     }
+    // IPC serializer will determine readable bytes based on `readerIndex` and 
`writerIndex`.
+    // Both are set to 0 means 0 bytes are written to the IPC stream which 
will crash IPC readers
+    // in other libraries. According to Arrow spec, we should still output the 
offset buffer which
+    // is [0].
+    offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
   }
 
   /**
diff --git 
a/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java 
b/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 93a313ef4..6c3993df6 100644
--- a/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -267,11 +267,14 @@ public class ListVector extends BaseRepeatedValueVector
     offsetBuffer.readerIndex(0);
     if (valueCount == 0) {
       validityBuffer.writerIndex(0);
-      offsetBuffer.writerIndex(0);
     } else {
       
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
-      offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH);
     }
+    // IPC serializer will determine readable bytes based on `readerIndex` and 
`writerIndex`.
+    // Both are set to 0 means 0 bytes are written to the IPC stream which 
will crash IPC readers
+    // in other libraries. According to Arrow spec, we should still output the 
offset buffer which
+    // is [0].
+    offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
   }
 
   /**
diff --git 
a/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java 
b/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
index ccc0d3e17..bf9bba9c7 100644
--- a/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
+++ b/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
@@ -1100,6 +1100,26 @@ public class TestLargeListVector {
     }
   }
 
+  @Test
+  public void testEmptyLargeListOffsetBuffer() {
+    // Test that LargeListVector has correct readableBytes after allocation.
+    // According to Arrow spec, offset buffer must have N+1 entries.
+    // Even when N=0, it should contain [0].
+    try (LargeListVector list = LargeListVector.empty("list", allocator)) {
+      list.addOrGetVector(FieldType.nullable(MinorType.INT.getType()));
+      list.allocateNew();
+      list.setValueCount(0);
+
+      List<ArrowBuf> buffers = list.getFieldBuffers();
+      assertTrue(
+          buffers.get(1).readableBytes() >= LargeListVector.OFFSET_WIDTH,
+          "Offset buffer should have at least "
+              + LargeListVector.OFFSET_WIDTH
+              + " bytes for offset[0]");
+      assertEquals(0L, list.getOffsetBuffer().getLong(0));
+    }
+  }
+
   private void writeIntValues(UnionLargeListWriter writer, int[] values) {
     writer.startList();
     for (int v : values) {
diff --git a/vector/src/test/java/org/apache/arrow/vector/TestListVector.java 
b/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
index 1fe4c59f6..0c90b32ab 100644
--- a/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
+++ b/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -1379,6 +1379,26 @@ public class TestListVector {
     }
   }
 
+  @Test
+  public void testEmptyListOffsetBuffer() {
+    // Test that ListVector has correct readableBytes after allocation.
+    // According to Arrow spec, offset buffer must have N+1 entries.
+    // Even when N=0, it should contain [0].
+    try (ListVector list = ListVector.empty("list", allocator)) {
+      list.addOrGetVector(FieldType.nullable(MinorType.INT.getType()));
+      list.allocateNew();
+      list.setValueCount(0);
+
+      List<ArrowBuf> buffers = list.getFieldBuffers();
+      assertTrue(
+          buffers.get(1).readableBytes() >= 
BaseRepeatedValueVector.OFFSET_WIDTH,
+          "Offset buffer should have at least "
+              + BaseRepeatedValueVector.OFFSET_WIDTH
+              + " bytes for offset[0]");
+      assertEquals(0, list.getOffsetBuffer().getInt(0));
+    }
+  }
+
   private void writeIntValues(UnionListWriter writer, int[] values) {
     writer.startList();
     for (int v : values) {

Reply via email to