This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6bf3b3a Fix reading/writing nested null arrays (#1480) (#1036) (#1399) (#1481)
6bf3b3a is described below
commit 6bf3b3af5585c50026a314c26fdaf5b9b29d01e0
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Fri Mar 25 16:43:52 2022 +0000
Fix reading/writing nested null arrays (#1480) (#1036) (#1399) (#1481)
---
parquet/src/arrow/array_reader.rs | 7 +++--
parquet/src/arrow/arrow_writer.rs | 56 +++++++++++++++++++++++++++++++++++++++
parquet/src/arrow/levels.rs | 10 +++----
3 files changed, 64 insertions(+), 9 deletions(-)
diff --git a/parquet/src/arrow/array_reader.rs b/parquet/src/arrow/array_reader.rs
index 9c016e7..756dffc 100644
--- a/parquet/src/arrow/array_reader.rs
+++ b/parquet/src/arrow/array_reader.rs
@@ -215,11 +215,10 @@ where
/// Reads at most `batch_size` records into array.
fn next_batch(&mut self, batch_size: usize) -> Result<ArrayRef> {
- let records_read =
- read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?;
+ read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?;
// convert to arrays
- let array = arrow::array::NullArray::new(records_read);
+ let array = arrow::array::NullArray::new(self.record_reader.num_values());
// save definition and repetition buffers
self.def_levels_buffer = self.record_reader.consume_def_levels()?;
@@ -877,7 +876,7 @@ fn remove_indices(
Ok(Arc::new(StructArray::from((new_columns, valid.finish()))))
}
}
- ArrowType::Null => Ok(Arc::new(NullArray::new(arr.len()))),
+ ArrowType::Null => Ok(Arc::new(NullArray::new(arr.len() - indices.len()))),
_ => Err(ParquetError::General(format!(
"ListArray of type List({:?}) is not supported by array_reader",
item_type
diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs
index 6a9310d..3a53c2c 100644
--- a/parquet/src/arrow/arrow_writer.rs
+++ b/parquet/src/arrow/arrow_writer.rs
@@ -1514,6 +1514,43 @@ mod tests {
}
#[test]
+ fn null_list_single_column() {
+ let null_field = Field::new("item", DataType::Null, true);
+ let list_field =
+ Field::new("emptylist", DataType::List(Box::new(null_field)), true);
+
+ let schema = Schema::new(vec![list_field]);
+
+ // Build [[], null, [null, null]]
+ let a_values = NullArray::new(2);
+ let a_value_offsets = arrow::buffer::Buffer::from(&[0, 0, 0, 2].to_byte_slice());
+ let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Null,
+ true,
+ ))))
+ .len(3)
+ .add_buffer(a_value_offsets)
+ .null_bit_buffer(Buffer::from(vec![0b00000101]))
+ .add_child_data(a_values.data().clone())
+ .build()
+ .unwrap();
+
+ let a = ListArray::from(a_list_data);
+
+ assert!(a.is_valid(0));
+ assert!(!a.is_valid(1));
+ assert!(a.is_valid(2));
+
+ assert_eq!(a.value(0).len(), 0);
+ assert_eq!(a.value(2).len(), 2);
+ assert_eq!(a.value(2).null_count(), 2);
+
+ let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap();
+ roundtrip(batch, None);
+ }
+
+ #[test]
fn list_single_column() {
let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
let a_value_offsets =
@@ -1565,6 +1602,25 @@ mod tests {
}
#[test]
+ fn list_nested_nulls() {
+ use arrow::datatypes::Int32Type;
+ let data = vec![
+ Some(vec![Some(1)]),
+ Some(vec![Some(2), Some(3)]),
+ None,
+ Some(vec![Some(4), Some(5), None]),
+ Some(vec![None]),
+ Some(vec![Some(6), Some(7)]),
+ ];
+
+ let list = ListArray::from_iter_primitive::<Int32Type, _, _>(data.clone());
+ one_column_roundtrip(Arc::new(list), true, Some(SMALL_SIZE / 2));
+
+ let list = LargeListArray::from_iter_primitive::<Int32Type, _, _>(data);
+ one_column_roundtrip(Arc::new(list), true, Some(SMALL_SIZE / 2));
+ }
+
+ #[test]
fn struct_single_column() {
let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
let struct_field_a = Field::new("f", DataType::Int32, false);
diff --git a/parquet/src/arrow/levels.rs b/parquet/src/arrow/levels.rs
index 8c92405..a1979e5 100644
--- a/parquet/src/arrow/levels.rs
+++ b/parquet/src/arrow/levels.rs
@@ -200,9 +200,8 @@ impl LevelInfo {
);
match child_array.data_type() {
- // TODO: The behaviour of a <list<null>> is untested
- DataType::Null => vec![list_level],
- DataType::Boolean
+ DataType::Null
+ | DataType::Boolean
| DataType::Int8
| DataType::Int16
| DataType::Int32
@@ -677,8 +676,9 @@ impl LevelInfo {
len: usize,
) -> (Vec<i64>, Vec<bool>) {
match array.data_type() {
- DataType::Null
- | DataType::Boolean
+ // A NullArray is entirely nulls, despite not containing a null buffer
+ DataType::Null => ((0..=(len as i64)).collect(), vec![false; len]),
+ DataType::Boolean
| DataType::Int8
| DataType::Int16
| DataType::Int32