This is an automated email from the ASF dual-hosted git repository.
jeffreyvo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 765c5b715d feat: Support roundtrip ListView in parquet arrow writer
(#9352)
765c5b715d is described below
commit 765c5b715ddf5be605169105e3e65429e107713d
Author: Yan Tingwang <[email protected]>
AuthorDate: Tue Feb 10 09:08:18 2026 +0800
feat: Support roundtrip ListView in parquet arrow writer (#9352)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Closes #9344.
# Rationale for this change
This PR implements support for roundtrip reading and writing of the
ListView and LargeListView types in the Parquet Arrow integration.
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
# What changes are included in this PR?
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
- Handle ListView/LargeListView in Arrow ↔ Parquet schema conversion
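- Reader/writer support: a new `ListViewArrayReader` that wraps
  `ListArrayReader` and converts its output to a list view array, plus
  ListView/LargeListView level building in the Arrow writer
- Added unit tests for the new reader and roundtrip tests for the writer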
# Are these changes tested?
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
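Yes. This PR adds unit tests for `ListViewArrayReader` and roundtrip
tests for ListView/LargeListView in the Arrow writer.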
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
Users can now write and read ListView/LargeListView arrays to/from
Parquet files.
---------
Co-authored-by: Jeffrey Vo <[email protected]>
---
parquet/src/arrow/array_reader/builder.rs | 77 ++++---
parquet/src/arrow/array_reader/list_view_array.rs | 245 ++++++++++++++++++++++
parquet/src/arrow/array_reader/mod.rs | 2 +
parquet/src/arrow/arrow_writer/levels.rs | 139 +++++++++++-
parquet/src/arrow/arrow_writer/mod.rs | 141 ++++++++++++-
parquet/src/arrow/schema/complex.rs | 4 +
parquet/src/arrow/schema/mod.rs | 9 +-
7 files changed, 582 insertions(+), 35 deletions(-)
diff --git a/parquet/src/arrow/array_reader/builder.rs
b/parquet/src/arrow/array_reader/builder.rs
index 1b3c30ad36..818e06e8b8 100644
--- a/parquet/src/arrow/array_reader/builder.rs
+++ b/parquet/src/arrow/array_reader/builder.rs
@@ -29,9 +29,9 @@ use
crate::arrow::array_reader::row_group_cache::RowGroupCache;
use crate::arrow::array_reader::row_group_index::RowGroupIndexReader;
use crate::arrow::array_reader::row_number::RowNumberReader;
use crate::arrow::array_reader::{
- ArrayReader, FixedSizeListArrayReader, ListArrayReader, MapArrayReader,
NullArrayReader,
- PrimitiveArrayReader, RowGroups, StructArrayReader,
make_byte_array_dictionary_reader,
- make_byte_array_reader,
+ ArrayReader, FixedSizeListArrayReader, ListArrayReader,
ListViewArrayReader, MapArrayReader,
+ NullArrayReader, PrimitiveArrayReader, RowGroups, StructArrayReader,
+ make_byte_array_dictionary_reader, make_byte_array_reader,
};
use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics;
use crate::arrow::schema::{ParquetField, ParquetFieldType, VirtualColumnType};
@@ -178,8 +178,10 @@ impl<'a> ArrayReaderBuilder<'a> {
ParquetFieldType::Group { .. } => match &field.arrow_type {
DataType::Map(_, _) => self.build_map_reader(field, mask),
DataType::Struct(_) => self.build_struct_reader(field, mask),
- DataType::List(_) => self.build_list_reader(field, mask,
false),
- DataType::LargeList(_) => self.build_list_reader(field, mask,
true),
+ DataType::List(_)
+ | DataType::LargeList(_)
+ | DataType::ListView(_)
+ | DataType::LargeListView(_) => self.build_list_reader(field,
mask),
DataType::FixedSizeList(_, _) =>
self.build_fixed_size_list_reader(field, mask),
d => unimplemented!("reading group type {} not implemented",
d),
},
@@ -266,7 +268,6 @@ impl<'a> ArrayReaderBuilder<'a> {
&self,
field: &ParquetField,
mask: &ProjectionMask,
- is_large: bool,
) -> Result<Option<Box<dyn ArrayReader>>> {
let children = field.children().unwrap();
assert_eq!(children.len(), 1);
@@ -275,32 +276,56 @@ impl<'a> ArrayReaderBuilder<'a> {
Some(item_reader) => {
// Need to retrieve underlying data type to handle projection
let item_type = item_reader.get_data_type().clone();
- let data_type = match &field.arrow_type {
+ let reader: Box<dyn ArrayReader> = match &field.arrow_type {
DataType::List(f) => {
-
DataType::List(Arc::new(f.as_ref().clone().with_data_type(item_type)))
+ let data_type =
+
DataType::List(Arc::new(f.as_ref().clone().with_data_type(item_type)));
+ Box::new(ListArrayReader::<i32>::new(
+ item_reader,
+ data_type,
+ field.def_level,
+ field.rep_level,
+ field.nullable,
+ ))
}
DataType::LargeList(f) => {
-
DataType::LargeList(Arc::new(f.as_ref().clone().with_data_type(item_type)))
+ let data_type = DataType::LargeList(Arc::new(
+ f.as_ref().clone().with_data_type(item_type),
+ ));
+ Box::new(ListArrayReader::<i64>::new(
+ item_reader,
+ data_type,
+ field.def_level,
+ field.rep_level,
+ field.nullable,
+ ))
+ }
+ DataType::ListView(f) => {
+ let data_type = DataType::ListView(Arc::new(
+ f.as_ref().clone().with_data_type(item_type),
+ ));
+ Box::new(ListViewArrayReader::<i32>::new(
+ item_reader,
+ data_type,
+ field.def_level,
+ field.rep_level,
+ field.nullable,
+ ))
+ }
+ DataType::LargeListView(f) => {
+ let data_type = DataType::LargeListView(Arc::new(
+ f.as_ref().clone().with_data_type(item_type),
+ ));
+ Box::new(ListViewArrayReader::<i64>::new(
+ item_reader,
+ data_type,
+ field.def_level,
+ field.rep_level,
+ field.nullable,
+ ))
}
_ => unreachable!(),
};
-
- let reader = match is_large {
- false => Box::new(ListArrayReader::<i32>::new(
- item_reader,
- data_type,
- field.def_level,
- field.rep_level,
- field.nullable,
- )) as _,
- true => Box::new(ListArrayReader::<i64>::new(
- item_reader,
- data_type,
- field.def_level,
- field.rep_level,
- field.nullable,
- )) as _,
- };
Some(reader)
}
None => None,
diff --git a/parquet/src/arrow/array_reader/list_view_array.rs
b/parquet/src/arrow/array_reader/list_view_array.rs
new file mode 100644
index 0000000000..294135d41f
--- /dev/null
+++ b/parquet/src/arrow/array_reader/list_view_array.rs
@@ -0,0 +1,245 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::arrow::array_reader::{ArrayReader, ListArrayReader};
+use crate::errors::Result;
+use arrow_array::cast::AsArray;
+use arrow_array::{Array, ArrayRef, GenericListViewArray, OffsetSizeTrait,
new_empty_array};
+use arrow_schema::DataType as ArrowType;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Implementation of list view array reader.
+/// This wraps a ListArrayReader and converts the result to ListViewArray.
+pub struct ListViewArrayReader<OffsetSize: OffsetSizeTrait> {
+ inner: ListArrayReader<OffsetSize>,
+ data_type: ArrowType,
+}
+
+impl<OffsetSize: OffsetSizeTrait> ListViewArrayReader<OffsetSize> {
+ /// Construct list view array reader.
+ pub fn new(
+ item_reader: Box<dyn ArrayReader>,
+ data_type: ArrowType,
+ def_level: i16,
+ rep_level: i16,
+ nullable: bool,
+ ) -> Self {
+ // Create the underlying ListArrayReader with the corresponding List
type
+ let list_data_type = match &data_type {
+ ArrowType::ListView(f) => ArrowType::List(f.clone()),
+ ArrowType::LargeListView(f) => ArrowType::LargeList(f.clone()),
+ _ => unreachable!(),
+ };
+
+ let inner =
+ ListArrayReader::new(item_reader, list_data_type, def_level,
rep_level, nullable);
+
+ Self { inner, data_type }
+ }
+}
+
+impl<OffsetSize: OffsetSizeTrait> ArrayReader for
ListViewArrayReader<OffsetSize> {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ /// Returns data type.
+ /// This must be a ListView.
+ fn get_data_type(&self) -> &ArrowType {
+ &self.data_type
+ }
+
+ fn read_records(&mut self, batch_size: usize) -> Result<usize> {
+ self.inner.read_records(batch_size)
+ }
+
+ fn consume_batch(&mut self) -> Result<ArrayRef> {
+ let array = self.inner.consume_batch()?;
+ if array.is_empty() {
+ return Ok(new_empty_array(&self.data_type));
+ }
+
+ // Convert ListArray to ListViewArray
+ let list_array = array.as_list::<OffsetSize>();
+
+ let list_view_array =
+
Arc::new(GenericListViewArray::<OffsetSize>::from(list_array.clone()));
+
+ // Ensure the data type is correct
+ assert_eq!(
+ list_view_array.data_type(),
+ &self.data_type,
+ "Converted array type does not match expected type"
+ );
+
+ Ok(list_view_array)
+ }
+
+ fn skip_records(&mut self, num_records: usize) -> Result<usize> {
+ self.inner.skip_records(num_records)
+ }
+
+ fn get_def_levels(&self) -> Option<&[i16]> {
+ self.inner.get_def_levels()
+ }
+
+ fn get_rep_levels(&self) -> Option<&[i16]> {
+ self.inner.get_rep_levels()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::arrow::array_reader::test_util::InMemoryArrayReader;
+ use arrow::datatypes::Int32Type as ArrowInt32;
+ use arrow_array::PrimitiveArray;
+
+ fn test_nullable_list_view<OffsetSize: OffsetSizeTrait>() {
+ // [[1, null, 2], null, [], [3, 4], [], [], null, [], [null, 1]]
+ let expected =
+
GenericListViewArray::<OffsetSize>::from_iter_primitive::<ArrowInt32, _,
_>(vec![
+ Some(vec![Some(1), None, Some(2)]),
+ None,
+ Some(vec![]),
+ Some(vec![Some(3), Some(4)]),
+ Some(vec![]),
+ Some(vec![]),
+ None,
+ Some(vec![]),
+ Some(vec![None, Some(1)]),
+ ]);
+
+ let array = Arc::new(PrimitiveArray::<ArrowInt32>::from(vec![
+ Some(1),
+ None,
+ Some(2),
+ None,
+ None,
+ Some(3),
+ Some(4),
+ None,
+ None,
+ None,
+ None,
+ None,
+ Some(1),
+ ]));
+
+ let item_array_reader = InMemoryArrayReader::new(
+ ArrowType::Int32,
+ array,
+ Some(vec![3, 2, 3, 0, 1, 3, 3, 1, 1, 0, 1, 2, 3]),
+ Some(vec![0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]),
+ );
+
+ let field =
Arc::new(arrow_schema::Field::new_list_field(ArrowType::Int32, true));
+ let data_type = if OffsetSize::IS_LARGE {
+ ArrowType::LargeListView(field)
+ } else {
+ ArrowType::ListView(field)
+ };
+
+ let mut list_view_array_reader =
ListViewArrayReader::<OffsetSize>::new(
+ Box::new(item_array_reader),
+ data_type,
+ 2,
+ 1,
+ true,
+ );
+
+ let actual = list_view_array_reader.next_batch(1024).unwrap();
+ let actual = actual
+ .as_any()
+ .downcast_ref::<GenericListViewArray<OffsetSize>>()
+ .unwrap();
+
+ assert_eq!(&expected, actual)
+ }
+
+ fn test_required_list_view<OffsetSize: OffsetSizeTrait>() {
+ // [[1, null, 2], [], [3, 4], [], [], [null, 1]]
+ let expected =
+
GenericListViewArray::<OffsetSize>::from_iter_primitive::<ArrowInt32, _,
_>(vec![
+ Some(vec![Some(1), None, Some(2)]),
+ Some(vec![]),
+ Some(vec![Some(3), Some(4)]),
+ Some(vec![]),
+ Some(vec![]),
+ Some(vec![None, Some(1)]),
+ ]);
+
+ let array = Arc::new(PrimitiveArray::<ArrowInt32>::from(vec![
+ Some(1),
+ None,
+ Some(2),
+ None,
+ Some(3),
+ Some(4),
+ None,
+ None,
+ None,
+ Some(1),
+ ]));
+
+ let item_array_reader = InMemoryArrayReader::new(
+ ArrowType::Int32,
+ array,
+ Some(vec![2, 1, 2, 0, 2, 2, 0, 0, 1, 2]),
+ Some(vec![0, 1, 1, 0, 0, 1, 0, 0, 0, 1]),
+ );
+
+ let field =
Arc::new(arrow_schema::Field::new_list_field(ArrowType::Int32, true));
+ let data_type = if OffsetSize::IS_LARGE {
+ ArrowType::LargeListView(field)
+ } else {
+ ArrowType::ListView(field)
+ };
+
+ let mut list_view_array_reader =
ListViewArrayReader::<OffsetSize>::new(
+ Box::new(item_array_reader),
+ data_type,
+ 1,
+ 1,
+ false,
+ );
+
+ let actual = list_view_array_reader.next_batch(1024).unwrap();
+ let actual = actual
+ .as_any()
+ .downcast_ref::<GenericListViewArray<OffsetSize>>()
+ .unwrap();
+
+ assert_eq!(&expected, actual)
+ }
+
+ fn test_list_view_array<OffsetSize: OffsetSizeTrait>() {
+ test_nullable_list_view::<OffsetSize>();
+ test_required_list_view::<OffsetSize>();
+ }
+
+ #[test]
+ fn test_list_view_array_reader() {
+ test_list_view_array::<i32>();
+ }
+
+ #[test]
+ fn test_large_list_view_array_reader() {
+ test_list_view_array::<i64>()
+ }
+}
diff --git a/parquet/src/arrow/array_reader/mod.rs
b/parquet/src/arrow/array_reader/mod.rs
index 019a871e19..726eae1f51 100644
--- a/parquet/src/arrow/array_reader/mod.rs
+++ b/parquet/src/arrow/array_reader/mod.rs
@@ -39,6 +39,7 @@ mod empty_array;
mod fixed_len_byte_array;
mod fixed_size_list_array;
mod list_array;
+mod list_view_array;
mod map_array;
mod null_array;
mod primitive_array;
@@ -61,6 +62,7 @@ pub use byte_view_array::make_byte_view_array_reader;
pub use fixed_len_byte_array::make_fixed_len_byte_array_reader;
pub use fixed_size_list_array::FixedSizeListArrayReader;
pub use list_array::ListArrayReader;
+pub use list_view_array::ListViewArrayReader;
pub use map_array::MapArrayReader;
pub use null_array::NullArrayReader;
pub use primitive_array::PrimitiveArrayReader;
diff --git a/parquet/src/arrow/arrow_writer/levels.rs
b/parquet/src/arrow/arrow_writer/levels.rs
index 59bf6c6024..0ff2137d90 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -44,7 +44,7 @@ use crate::errors::{ParquetError, Result};
use arrow_array::cast::AsArray;
use arrow_array::{Array, ArrayRef, OffsetSizeTrait};
use arrow_buffer::bit_iterator::BitIndexIterator;
-use arrow_buffer::{NullBuffer, OffsetBuffer};
+use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow_schema::{DataType, Field};
use std::ops::Range;
use std::sync::Arc;
@@ -131,6 +131,22 @@ enum LevelInfoBuilder {
usize, // List Size
Option<NullBuffer>, // Nulls
),
+ /// A list view array
+ ListView(
+ Box<LevelInfoBuilder>, // Child Values
+ LevelContext, // Context
+ ScalarBuffer<i32>, // Offsets
+ ScalarBuffer<i32>, // Sizes
+ Option<NullBuffer>, // Nulls
+ ),
+ /// A large list view array
+ LargeListView(
+ Box<LevelInfoBuilder>, // Child Values
+ LevelContext, // Context
+ ScalarBuffer<i64>, // Offsets
+ ScalarBuffer<i64>, // Sizes
+ Option<NullBuffer>, // Nulls
+ ),
/// A struct array
Struct(Vec<LevelInfoBuilder>, LevelContext, Option<NullBuffer>),
}
@@ -181,7 +197,9 @@ impl LevelInfoBuilder {
DataType::List(child)
| DataType::LargeList(child)
| DataType::Map(child, _)
- | DataType::FixedSizeList(child, _) => {
+ | DataType::FixedSizeList(child, _)
+ | DataType::ListView(child)
+ | DataType::LargeListView(child) => {
let def_level = match is_nullable {
true => parent_ctx.def_level + 2,
false => parent_ctx.def_level + 1,
@@ -219,6 +237,22 @@ impl LevelInfoBuilder {
let nulls = list.nulls().cloned();
Self::FixedSizeList(Box::new(child), ctx, *size as _,
nulls)
}
+ DataType::ListView(_) => {
+ let list = array.as_list_view();
+ let child = Self::try_new(child.as_ref(), ctx,
list.values())?;
+ let offsets = list.offsets().clone();
+ let sizes = list.sizes().clone();
+ let nulls = list.nulls().cloned();
+ Self::ListView(Box::new(child), ctx, offsets, sizes,
nulls)
+ }
+ DataType::LargeListView(_) => {
+ let list = array.as_list_view();
+ let child = Self::try_new(child.as_ref(), ctx,
list.values())?;
+ let offsets = list.offsets().clone();
+ let sizes = list.sizes().clone();
+ let nulls = list.nulls().cloned();
+ Self::LargeListView(Box::new(child), ctx, offsets,
sizes, nulls)
+ }
_ => unreachable!(),
})
}
@@ -233,7 +267,9 @@ impl LevelInfoBuilder {
LevelInfoBuilder::Primitive(v) => vec![v],
LevelInfoBuilder::List(v, _, _, _)
| LevelInfoBuilder::LargeList(v, _, _, _)
- | LevelInfoBuilder::FixedSizeList(v, _, _, _) => v.finish(),
+ | LevelInfoBuilder::FixedSizeList(v, _, _, _)
+ | LevelInfoBuilder::ListView(v, _, _, _, _)
+ | LevelInfoBuilder::LargeListView(v, _, _, _, _) => v.finish(),
LevelInfoBuilder::Struct(v, _, _) => v.into_iter().flat_map(|l|
l.finish()).collect(),
}
}
@@ -251,6 +287,12 @@ impl LevelInfoBuilder {
LevelInfoBuilder::FixedSizeList(child, ctx, size, nulls) => {
Self::write_fixed_size_list(child, ctx, *size, nulls.as_ref(),
range)
}
+ LevelInfoBuilder::ListView(child, ctx, offsets, sizes, nulls) => {
+ Self::write_list_view(child, ctx, offsets, sizes,
nulls.as_ref(), range)
+ }
+ LevelInfoBuilder::LargeListView(child, ctx, offsets, sizes, nulls)
=> {
+ Self::write_list_view(child, ctx, offsets, sizes,
nulls.as_ref(), range)
+ }
LevelInfoBuilder::Struct(children, ctx, nulls) => {
Self::write_struct(children, ctx, nulls.as_ref(), range)
}
@@ -342,6 +384,93 @@ impl LevelInfoBuilder {
}
}
+ /// Write `range` elements from ListViewArray `array`
+ fn write_list_view<O: OffsetSizeTrait>(
+ child: &mut LevelInfoBuilder,
+ ctx: &LevelContext,
+ offsets: &[O],
+ sizes: &[O],
+ nulls: Option<&NullBuffer>,
+ range: Range<usize>,
+ ) {
+ let offsets = &offsets[range.start..range.end];
+ let sizes = &sizes[range.start..range.end];
+
+ let write_non_null_slice =
+ |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| {
+ child.write(start_idx..end_idx);
+ child.visit_leaves(|leaf| {
+ let rep_levels = leaf.rep_levels.as_mut().unwrap();
+ let mut rev = rep_levels.iter_mut().rev();
+ let mut remaining = end_idx - start_idx;
+
+ loop {
+ let next = rev.next().unwrap();
+ if *next > ctx.rep_level {
+ // Nested element - ignore
+ continue;
+ }
+
+ remaining -= 1;
+ if remaining == 0 {
+ *next = ctx.rep_level - 1;
+ break;
+ }
+ }
+ })
+ };
+
+ let write_empty_slice = |child: &mut LevelInfoBuilder| {
+ child.visit_leaves(|leaf| {
+ let rep_levels = leaf.rep_levels.as_mut().unwrap();
+ rep_levels.push(ctx.rep_level - 1);
+ let def_levels = leaf.def_levels.as_mut().unwrap();
+ def_levels.push(ctx.def_level - 1);
+ })
+ };
+
+ let write_null_slice = |child: &mut LevelInfoBuilder| {
+ child.visit_leaves(|leaf| {
+ let rep_levels = leaf.rep_levels.as_mut().unwrap();
+ rep_levels.push(ctx.rep_level - 1);
+ let def_levels = leaf.def_levels.as_mut().unwrap();
+ def_levels.push(ctx.def_level - 2);
+ })
+ };
+
+ match nulls {
+ Some(nulls) => {
+ let null_offset = range.start;
+ // TODO: Faster bitmask iteration (#1757)
+ for (idx, (offset, size)) in
offsets.iter().zip(sizes.iter()).enumerate() {
+ let is_valid = nulls.is_valid(idx + null_offset);
+ let start_idx = offset.as_usize();
+ let size = size.as_usize();
+ let end_idx = start_idx + size;
+ if !is_valid {
+ write_null_slice(child)
+ } else if size == 0 {
+ write_empty_slice(child)
+ } else {
+ write_non_null_slice(child, start_idx, end_idx)
+ }
+ }
+ }
+ None => {
+ for (offset, size) in offsets.iter().zip(sizes.iter()) {
+ let start_idx = offset.as_usize();
+ let size = size.as_usize();
+ let end_idx = start_idx + size;
+ if size == 0 {
+ write_empty_slice(child)
+ } else {
+ write_non_null_slice(child, start_idx, end_idx)
+ }
+ }
+ }
+ }
+ }
+
/// Write `range` elements from StructArray `array`
fn write_struct(
children: &mut [LevelInfoBuilder],
@@ -535,7 +664,9 @@ impl LevelInfoBuilder {
LevelInfoBuilder::Primitive(info) => visit(info),
LevelInfoBuilder::List(c, _, _, _)
| LevelInfoBuilder::LargeList(c, _, _, _)
- | LevelInfoBuilder::FixedSizeList(c, _, _, _) =>
c.visit_leaves(visit),
+ | LevelInfoBuilder::FixedSizeList(c, _, _, _)
+ | LevelInfoBuilder::ListView(c, _, _, _, _)
+ | LevelInfoBuilder::LargeListView(c, _, _, _, _) =>
c.visit_leaves(visit),
LevelInfoBuilder::Struct(children, _, _) => {
for c in children {
c.visit_leaves(visit)
diff --git a/parquet/src/arrow/arrow_writer/mod.rs
b/parquet/src/arrow/arrow_writer/mod.rs
index 99d0455b31..2c8852f423 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1106,7 +1106,9 @@ impl ArrowColumnWriterFactory {
| ArrowDataType::Utf8View =>
out.push(bytes(leaves.next().unwrap())?),
ArrowDataType::List(f)
| ArrowDataType::LargeList(f)
- | ArrowDataType::FixedSizeList(f, _) => {
+ | ArrowDataType::FixedSizeList(f, _)
+ | ArrowDataType::ListView(f)
+ | ArrowDataType::LargeListView(f) => {
self.get_arrow_column_writer(f.data_type(), props, leaves,
out)?
}
ArrowDataType::Struct(fields) => {
@@ -1734,6 +1736,143 @@ mod tests {
roundtrip(batch, None);
}
+ #[test]
+ fn arrow_writer_list_view() {
+ let list_field = Arc::new(Field::new_list_field(DataType::Int32,
false));
+ let schema = Schema::new(vec![Field::new(
+ "a",
+ DataType::ListView(list_field.clone()),
+ true,
+ )]);
+
+ // [[1], [2, 3], null, [4, 5, 6], [7, 8, 9, 10]]
+ let a = ListViewArray::new(
+ list_field,
+ vec![0, 1, 0, 3, 6].into(),
+ vec![1, 2, 0, 3, 4].into(),
+ Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10])),
+ Some(vec![true, true, false, true, true].into()),
+ );
+
+ let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(a)]).unwrap();
+
+ assert_eq!(batch.column(0).null_count(), 1);
+
+ roundtrip(batch, None);
+ }
+
+ #[test]
+ fn arrow_writer_list_view_non_null() {
+ let list_field = Arc::new(Field::new_list_field(DataType::Int32,
false));
+ let schema = Schema::new(vec![Field::new(
+ "a",
+ DataType::ListView(list_field.clone()),
+ false,
+ )]);
+
+ // [[1], [2, 3], [], [4, 5, 6], [7, 8, 9, 10]]
+ let a = ListViewArray::new(
+ list_field,
+ vec![0, 1, 0, 3, 6].into(),
+ vec![1, 2, 0, 3, 4].into(),
+ Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10])),
+ None,
+ );
+
+ let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(a)]).unwrap();
+
+ assert_eq!(batch.column(0).null_count(), 0);
+
+ roundtrip(batch, None);
+ }
+
+ #[test]
+ fn arrow_writer_list_view_out_of_order() {
+ let list_field = Arc::new(Field::new_list_field(DataType::Int32,
false));
+ let schema = Schema::new(vec![Field::new(
+ "a",
+ DataType::ListView(list_field.clone()),
+ false,
+ )]);
+
+ // [[1], [2, 3], [], [7, 8, 9, 10], [4, 5, 6]] - out of order offsets
+ let a = ListViewArray::new(
+ list_field,
+ vec![0, 1, 0, 6, 3].into(),
+ vec![1, 2, 0, 4, 3].into(),
+ Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10])),
+ None,
+ );
+
+ let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(a)]).unwrap();
+
+ roundtrip(batch, None);
+ }
+
+ #[test]
+ fn arrow_writer_large_list_view() {
+ let list_field = Arc::new(Field::new_list_field(DataType::Int32,
false));
+ let schema = Schema::new(vec![Field::new(
+ "a",
+ DataType::LargeListView(list_field.clone()),
+ true,
+ )]);
+
+ // [[1], [2, 3], null, [4, 5, 6], [7, 8, 9, 10]]
+ let a = LargeListViewArray::new(
+ list_field,
+ vec![0i64, 1, 0, 3, 6].into(),
+ vec![1i64, 2, 0, 3, 4].into(),
+ Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10])),
+ Some(vec![true, true, false, true, true].into()),
+ );
+
+ let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(a)]).unwrap();
+
+ assert_eq!(batch.column(0).null_count(), 1);
+
+ roundtrip(batch, None);
+ }
+
+ #[test]
+ fn arrow_writer_list_view_with_struct() {
+ // Test ListView containing Struct: ListView<Struct<Int32, Utf8>>
+ let struct_fields = Fields::from(vec![
+ Field::new("id", DataType::Int32, false),
+ Field::new("name", DataType::Utf8, false),
+ ]);
+ let struct_type = DataType::Struct(struct_fields.clone());
+ let list_field = Arc::new(Field::new("item", struct_type.clone(),
false));
+
+ let schema = Schema::new(vec![Field::new(
+ "a",
+ DataType::ListView(list_field.clone()),
+ true,
+ )]);
+
+ // Create struct values
+ let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+ let name_array = StringArray::from(vec!["a", "b", "c", "d", "e"]);
+ let struct_array = StructArray::new(
+ struct_fields,
+ vec![Arc::new(id_array), Arc::new(name_array)],
+ None,
+ );
+
+ // Create ListView: [{1, "a"}, {2, "b"}], null, [{3, "c"}, {4, "d"},
{5, "e"}]
+ let list_view = ListViewArray::new(
+ list_field,
+ vec![0, 2, 2].into(), // offsets
+ vec![2, 0, 3].into(), // sizes
+ Arc::new(struct_array),
+ Some(vec![true, false, true].into()),
+ );
+
+ let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(list_view)]).unwrap();
+
+ roundtrip(batch, None);
+ }
+
#[test]
fn arrow_writer_binary() {
let string_field = Field::new("a", DataType::Utf8, false);
diff --git a/parquet/src/arrow/schema/complex.rs
b/parquet/src/arrow/schema/complex.rs
index fdb3943e85..5d6d4c1845 100644
--- a/parquet/src/arrow/schema/complex.rs
+++ b/parquet/src/arrow/schema/complex.rs
@@ -436,6 +436,8 @@ impl Visitor {
Some(DataType::List(f)) => Some(f.as_ref()),
Some(DataType::LargeList(f)) => Some(f.as_ref()),
Some(DataType::FixedSizeList(f, _)) => Some(f.as_ref()),
+ Some(DataType::ListView(f)) => Some(f.as_ref()),
+ Some(DataType::LargeListView(f)) => Some(f.as_ref()),
Some(d) => {
return Err(arrow_err!(
"incompatible arrow schema, expected list got {}",
@@ -519,6 +521,8 @@ impl Visitor {
Some(DataType::FixedSizeList(_, len)) => {
DataType::FixedSizeList(item_field, len)
}
+ Some(DataType::ListView(_)) =>
DataType::ListView(item_field),
+ Some(DataType::LargeListView(_)) =>
DataType::LargeListView(item_field),
_ => DataType::List(item_field),
};
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index b33f9c14dd..b2b93687ba 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -769,7 +769,11 @@ fn arrow_to_parquet_type(field: &Field, coerce_types:
bool) -> Result<Type> {
.with_repetition(repetition)
.with_id(id)
.build(),
- DataType::List(f) | DataType::FixedSizeList(f, _) |
DataType::LargeList(f) => {
+ DataType::List(f)
+ | DataType::FixedSizeList(f, _)
+ | DataType::LargeList(f)
+ | DataType::ListView(f)
+ | DataType::LargeListView(f) => {
let field_ref = if coerce_types && f.name() !=
PARQUET_LIST_ELEMENT_NAME {
// Ensure proper naming per the Parquet specification
let ff =
f.as_ref().clone().with_name(PARQUET_LIST_ELEMENT_NAME);
@@ -790,9 +794,6 @@ fn arrow_to_parquet_type(field: &Field, coerce_types: bool)
-> Result<Type> {
.with_id(id)
.build()
}
- DataType::ListView(_) | DataType::LargeListView(_) => {
- unimplemented!("ListView/LargeListView not implemented")
- }
DataType::Struct(fields) => {
if fields.is_empty() {
return Err(arrow_err!("Parquet does not support writing empty
structs",));