This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new b00b5aa3bb fix(ipc): correct skip_field handling for V4 Union (#9829)
b00b5aa3bb is described below
commit b00b5aa3bbe3aab8ea6595fdd99794a11c1ca730
Author: pchintar <[email protected]>
AuthorDate: Mon Apr 27 16:08:19 2026 -0400
fix(ipc): correct skip_field handling for V4 Union (#9829)
# Which issue does this PR close?
- Closes #9828.
# Rationale for this change
Currently, `skip_field` does not correctly handle the buffer layout of
`Union` types for V4 IPC.
In V4:
* `Union` includes a null buffer + type_ids (+ offsets for dense)
In V5:
* `Union` has no null buffer, only type_ids (+ offsets for dense)
`create_array` correctly handles this difference using a version check.
However, `skip_field` unconditionally skips a single buffer (treated as the
null buffer) and never skips `type_ids`, so in V4 it skips one buffer too
few, leading to buffer misalignment when skipping a `Union` column.
This can cause incorrect decoding or runtime errors for projected
columns.
# What changes are included in this PR?
* Updated `skip_field` in `arrow-ipc/src/reader.rs` to:
  * skip the null buffer only for metadata versions before V5 (i.e. V4)
  * explicitly skip the `type_ids` buffer
  * skip the offsets buffer for dense unions only (sparse unions have none)
* Aligns `skip_field` behavior with `create_array` and actual IPC layout
# Are these changes tested?
Yes.
* Added test: `test_projection_skip_union_v4`
* The test:
* writes IPC data using V4 metadata
* includes a `Union` column followed by an `Int32` column
* reads only the second column (skipping the `Union`)
* verifies the output matches expected values
* The test fails before the fix and passes after
* All existing `arrow-ipc` tests pass (`cargo test -p arrow-ipc --lib`)
# Are there any user-facing changes?
No.
---
arrow-ipc/src/reader.rs | 61 +++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 59 insertions(+), 2 deletions(-)
diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs
index aa66696271..1d5e06c687 100644
--- a/arrow-ipc/src/reader.rs
+++ b/arrow-ipc/src/reader.rs
@@ -696,10 +696,13 @@ impl<'a> RecordBatchDecoder<'a> {
self.skip_buffer(); // Indices
}
Union(fields, mode) => {
- self.skip_buffer(); // Nulls
+ if self.version < MetadataVersion::V5 {
+ self.skip_buffer(); // Null buffer
+ }
+ self.skip_buffer(); // Type ids
match mode {
- UnionMode::Dense => self.skip_buffer(),
+ UnionMode::Dense => self.skip_buffer(), // Offsets
UnionMode::Sparse => {}
};
@@ -3543,6 +3546,60 @@ mod tests {
assert_eq!(read_batch.column(0).as_ref(), &values);
}
+ // Tests reading a column when a preceding V4 Union column is skipped.
+ // V4 Union columns include a null buffer and type ids (and offsets for
dense unions).
+ #[test]
+ fn test_projection_skip_union_v4() {
+ use crate::MetadataVersion;
+ use crate::reader::FileReader;
+ use crate::writer::{FileWriter, IpcWriteOptions};
+ use arrow_array::{
+ ArrayRef, Int32Array, RecordBatch, builder::UnionBuilder,
types::Int32Type,
+ };
+ use arrow_schema::{DataType, Field, Schema};
+ use std::sync::Arc;
+
+ // Build a dense Union column with simple Int32 values
+ let mut builder = UnionBuilder::new_dense();
+ builder.append::<Int32Type>("a", 1).unwrap();
+ builder.append::<Int32Type>("a", 2).unwrap();
+ builder.append::<Int32Type>("a", 3).unwrap();
+ let union = builder.build().unwrap();
+
+ // Second column with known values to verify correctness after
projection
+ let values = Int32Array::from(vec![10, 20, 30]);
+
+ // Schema: first column is Union (to be skipped), second is Int32 (to
be read)
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("union", union.data_type().clone(), false),
+ Field::new("values", DataType::Int32, false),
+ ]));
+
+ // Create a batch containing both columns
+ let batch = RecordBatch::try_new(
+ schema,
+ vec![Arc::new(union) as ArrayRef, Arc::new(values.clone())],
+ )
+ .unwrap();
+
+ // Write IPC using V4 metadata to trigger Union null buffer behavior
+ let mut buf = Vec::new();
+ {
+ let options = IpcWriteOptions::try_new(8, false,
MetadataVersion::V4).unwrap();
+ let mut writer =
+ FileWriter::try_new_with_options(&mut buf, &batch.schema(),
options).unwrap();
+ writer.write(&batch).unwrap();
+ writer.finish().unwrap();
+ }
+ // Read only the second column (skip the Union column)
+ let mut reader = FileReader::try_new(std::io::Cursor::new(buf),
Some(vec![1])).unwrap();
+ let read_batch = reader.next().unwrap().unwrap();
+
+ // Verify that the projected column is read correctly after skipping
Union
+ assert_eq!(read_batch.num_columns(), 1);
+ assert_eq!(read_batch.column(0).as_ref(), &values);
+ }
+
// Tests reading a column when preceding fixed-width and boolean columns
are skipped.
// Covers all types that use the same two-buffer layout (null + values).
// Verifies that skipping these types does not affect subsequent column
decoding.