This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 6bf3b3a  Fix reading/writing nested null arrays (#1480) (#1036) (#1399) (#1481)
6bf3b3a is described below

commit 6bf3b3af5585c50026a314c26fdaf5b9b29d01e0
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Fri Mar 25 16:43:52 2022 +0000

    Fix reading/writing nested null arrays (#1480) (#1036) (#1399) (#1481)
---
 parquet/src/arrow/array_reader.rs |  7 +++--
 parquet/src/arrow/arrow_writer.rs | 56 +++++++++++++++++++++++++++++++++++++++
 parquet/src/arrow/levels.rs       | 10 +++----
 3 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/parquet/src/arrow/array_reader.rs b/parquet/src/arrow/array_reader.rs
index 9c016e7..756dffc 100644
--- a/parquet/src/arrow/array_reader.rs
+++ b/parquet/src/arrow/array_reader.rs
@@ -215,11 +215,10 @@ where
 
     /// Reads at most `batch_size` records into array.
     fn next_batch(&mut self, batch_size: usize) -> Result<ArrayRef> {
-        let records_read =
-            read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?;
+        read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?;
 
         // convert to arrays
-        let array = arrow::array::NullArray::new(records_read);
+        let array = arrow::array::NullArray::new(self.record_reader.num_values());
 
         // save definition and repetition buffers
         self.def_levels_buffer = self.record_reader.consume_def_levels()?;
@@ -877,7 +876,7 @@ fn remove_indices(
                 Ok(Arc::new(StructArray::from((new_columns, valid.finish()))))
             }
         }
-        ArrowType::Null => Ok(Arc::new(NullArray::new(arr.len()))),
+        ArrowType::Null => Ok(Arc::new(NullArray::new(arr.len() - indices.len()))),
         _ => Err(ParquetError::General(format!(
             "ListArray of type List({:?}) is not supported by array_reader",
             item_type
diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs
index 6a9310d..3a53c2c 100644
--- a/parquet/src/arrow/arrow_writer.rs
+++ b/parquet/src/arrow/arrow_writer.rs
@@ -1514,6 +1514,43 @@ mod tests {
     }
 
     #[test]
+    fn null_list_single_column() {
+        let null_field = Field::new("item", DataType::Null, true);
+        let list_field =
+            Field::new("emptylist", DataType::List(Box::new(null_field)), true);
+
+        let schema = Schema::new(vec![list_field]);
+
+        // Build [[], null, [null, null]]
+        let a_values = NullArray::new(2);
+        let a_value_offsets = arrow::buffer::Buffer::from(&[0, 0, 0, 2].to_byte_slice());
+        let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new(
+            "item",
+            DataType::Null,
+            true,
+        ))))
+        .len(3)
+        .add_buffer(a_value_offsets)
+        .null_bit_buffer(Buffer::from(vec![0b00000101]))
+        .add_child_data(a_values.data().clone())
+        .build()
+        .unwrap();
+
+        let a = ListArray::from(a_list_data);
+
+        assert!(a.is_valid(0));
+        assert!(!a.is_valid(1));
+        assert!(a.is_valid(2));
+
+        assert_eq!(a.value(0).len(), 0);
+        assert_eq!(a.value(2).len(), 2);
+        assert_eq!(a.value(2).null_count(), 2);
+
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap();
+        roundtrip(batch, None);
+    }
+
+    #[test]
     fn list_single_column() {
         let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
         let a_value_offsets =
@@ -1565,6 +1602,25 @@ mod tests {
     }
 
     #[test]
+    fn list_nested_nulls() {
+        use arrow::datatypes::Int32Type;
+        let data = vec![
+            Some(vec![Some(1)]),
+            Some(vec![Some(2), Some(3)]),
+            None,
+            Some(vec![Some(4), Some(5), None]),
+            Some(vec![None]),
+            Some(vec![Some(6), Some(7)]),
+        ];
+
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(data.clone());
+        one_column_roundtrip(Arc::new(list), true, Some(SMALL_SIZE / 2));
+
+        let list = LargeListArray::from_iter_primitive::<Int32Type, _, _>(data);
+        one_column_roundtrip(Arc::new(list), true, Some(SMALL_SIZE / 2));
+    }
+
+    #[test]
     fn struct_single_column() {
         let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
         let struct_field_a = Field::new("f", DataType::Int32, false);
diff --git a/parquet/src/arrow/levels.rs b/parquet/src/arrow/levels.rs
index 8c92405..a1979e5 100644
--- a/parquet/src/arrow/levels.rs
+++ b/parquet/src/arrow/levels.rs
@@ -200,9 +200,8 @@ impl LevelInfo {
                 );
 
                 match child_array.data_type() {
-                    // TODO: The behaviour of a <list<null>> is untested
-                    DataType::Null => vec![list_level],
-                    DataType::Boolean
+                    DataType::Null
+                    | DataType::Boolean
                     | DataType::Int8
                     | DataType::Int16
                     | DataType::Int32
@@ -677,8 +676,9 @@ impl LevelInfo {
         len: usize,
     ) -> (Vec<i64>, Vec<bool>) {
         match array.data_type() {
-            DataType::Null
-            | DataType::Boolean
+            // A NullArray is entirely nulls, despite not containing a null buffer
+            DataType::Null => ((0..=(len as i64)).collect(), vec![false; len]),
+            DataType::Boolean
             | DataType::Int8
             | DataType::Int16
             | DataType::Int32

Reply via email to