This is an automated email from the ASF dual-hosted git repository.
jbonofre pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-java.git
The following commit(s) were added to refs/heads/main by this push:
new 0f8a0808f GH-343: Fix ListVector offset buffer not properly serialized
for nested empty arrays (#967)
0f8a0808f is described below
commit 0f8a0808fd9cf0bd22d3c6b40a2016ee724ce185
Author: Yicong Huang <[email protected]>
AuthorDate: Fri Jan 23 01:18:13 2026 -0800
GH-343: Fix ListVector offset buffer not properly serialized for nested
empty arrays (#967)
## What's Changed
Fix `ListVector`/`LargeListVector` IPC serialization when `valueCount`
is 0.
### Problem
When `valueCount == 0`, `setReaderAndWriterIndex()` was setting
`offsetBuffer.writerIndex(0)`, which means `readableBytes() == 0`. The IPC
serializer uses `readableBytes()` to determine the buffer size, so 0 bytes
were written to the IPC stream. This crashes IPC readers in other
libraries because the Arrow spec requires the offset buffer to have at
least one entry, `[0]`.
@viirya:
> The offset buffers are allocated properly. But during IPC
serialization, they are ignored.
> ```
> public long readableBytes() {
> return writerIndex - readerIndex;
> }
> ```
> So when ListVector.setReaderAndWriterIndex() sets writerIndex(0) and
readerIndex(0), readableBytes() returns 0 - 0 = 0.
>
> Then when MessageSerializer.writeBatchBuffers() calls
WriteChannel.write(buffer), it writes 0 bytes.
>
> So the flow is:
>
> valueCount=0 → ListVector.setReaderAndWriterIndex() sets
offsetBuffer.writerIndex(0)
> VectorUnloader.getFieldBuffers() returns the buffer with writerIndex=0
> MessageSerializer.writeBatchBuffers() writes the buffer
> WriteChannel.write(buffer) checks buffer.readableBytes() which is 0
> 0 bytes are written to the IPC stream
> PyArrow reads the batch with the missing buffer → crash when other
libraries try to read it
### Fix
Simplify `setReaderAndWriterIndex()` to always use `(valueCount + 1) *
OFFSET_WIDTH` for the offset buffer's `writerIndex`. When `valueCount == 0`,
this correctly sets `writerIndex` to `OFFSET_WIDTH`, ensuring that
`offset[0]` is included in serialization.
### Testing
Added tests for nested empty lists verifying that the offset buffer has
the correct `readableBytes()`.
Closes #343.
---------
Co-authored-by: Yicong Huang <[email protected]>
---
.../apache/arrow/vector/complex/LargeListVector.java | 7 +++++--
.../org/apache/arrow/vector/complex/ListVector.java | 7 +++++--
.../org/apache/arrow/vector/TestLargeListVector.java | 20 ++++++++++++++++++++
.../java/org/apache/arrow/vector/TestListVector.java | 20 ++++++++++++++++++++
4 files changed, 50 insertions(+), 4 deletions(-)
diff --git
a/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
b/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
index 997b5a8b7..92dd3eaef 100644
--- a/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
+++ b/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java
@@ -309,11 +309,14 @@ public class LargeListVector extends BaseValueVector
offsetBuffer.readerIndex(0);
if (valueCount == 0) {
validityBuffer.writerIndex(0);
- offsetBuffer.writerIndex(0);
} else {
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
- offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH);
}
+ // IPC serializer will determine readable bytes based on `readerIndex` and
`writerIndex`.
+ // Both are set to 0 means 0 bytes are written to the IPC stream which
will crash IPC readers
+ // in other libraries. According to Arrow spec, we should still output the
offset buffer which
+ // is [0].
+ offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
}
/**
diff --git
a/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
b/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 93a313ef4..6c3993df6 100644
--- a/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -267,11 +267,14 @@ public class ListVector extends BaseRepeatedValueVector
offsetBuffer.readerIndex(0);
if (valueCount == 0) {
validityBuffer.writerIndex(0);
- offsetBuffer.writerIndex(0);
} else {
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
- offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH);
}
+ // IPC serializer will determine readable bytes based on `readerIndex` and
`writerIndex`.
+ // Both are set to 0 means 0 bytes are written to the IPC stream which
will crash IPC readers
+ // in other libraries. According to Arrow spec, we should still output the
offset buffer which
+ // is [0].
+ offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
}
/**
diff --git
a/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
b/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
index ccc0d3e17..bf9bba9c7 100644
--- a/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
+++ b/vector/src/test/java/org/apache/arrow/vector/TestLargeListVector.java
@@ -1100,6 +1100,26 @@ public class TestLargeListVector {
}
}
+ @Test
+ public void testEmptyLargeListOffsetBuffer() {
+ // Test that LargeListVector has correct readableBytes after allocation.
+ // According to Arrow spec, offset buffer must have N+1 entries.
+ // Even when N=0, it should contain [0].
+ try (LargeListVector list = LargeListVector.empty("list", allocator)) {
+ list.addOrGetVector(FieldType.nullable(MinorType.INT.getType()));
+ list.allocateNew();
+ list.setValueCount(0);
+
+ List<ArrowBuf> buffers = list.getFieldBuffers();
+ assertTrue(
+ buffers.get(1).readableBytes() >= LargeListVector.OFFSET_WIDTH,
+ "Offset buffer should have at least "
+ + LargeListVector.OFFSET_WIDTH
+ + " bytes for offset[0]");
+ assertEquals(0L, list.getOffsetBuffer().getLong(0));
+ }
+ }
+
private void writeIntValues(UnionLargeListWriter writer, int[] values) {
writer.startList();
for (int v : values) {
diff --git a/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
b/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
index 1fe4c59f6..0c90b32ab 100644
--- a/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
+++ b/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -1379,6 +1379,26 @@ public class TestListVector {
}
}
+ @Test
+ public void testEmptyListOffsetBuffer() {
+ // Test that ListVector has correct readableBytes after allocation.
+ // According to Arrow spec, offset buffer must have N+1 entries.
+ // Even when N=0, it should contain [0].
+ try (ListVector list = ListVector.empty("list", allocator)) {
+ list.addOrGetVector(FieldType.nullable(MinorType.INT.getType()));
+ list.allocateNew();
+ list.setValueCount(0);
+
+ List<ArrowBuf> buffers = list.getFieldBuffers();
+ assertTrue(
+ buffers.get(1).readableBytes() >=
BaseRepeatedValueVector.OFFSET_WIDTH,
+ "Offset buffer should have at least "
+ + BaseRepeatedValueVector.OFFSET_WIDTH
+ + " bytes for offset[0]");
+ assertEquals(0, list.getOffsetBuffer().getInt(0));
+ }
+ }
+
private void writeIntValues(UnionListWriter writer, int[] values) {
writer.startList();
for (int v : values) {