This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new e898de5  JSON reader - empty nested list should not create child value (#826)
e898de5 is described below

commit e898de57e4587c64387939f8a557bc5fa2dffeb8
Author: Wakahisa <[email protected]>
AuthorDate: Wed Oct 13 15:46:07 2021 +0200

    JSON reader - empty nested list should not create child value (#826)
    
    * JSON reader - empty nested list should not create child value
    
    * PR review
---
 arrow/src/json/reader.rs | 41 ++++++++++++++++++--------------------
 arrow/src/json/writer.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs
index 9592b59..c2a2de9 100644
--- a/arrow/src/json/reader.rs
+++ b/arrow/src/json/reader.rs
@@ -1048,31 +1048,27 @@ impl Decoder {
             }
             DataType::Struct(fields) => {
                 // extract list values, with non-lists converted to Value::Null
-                let array_item_count = rows
-                    .iter()
-                    .map(|row| match row {
-                        Value::Array(values) => values.len(),
-                        _ => 1,
-                    })
-                    .sum();
+                let array_item_count = cur_offset.to_usize().unwrap();
                 let num_bytes = bit_util::ceil(array_item_count, 8);
                 let mut null_buffer = MutableBuffer::from_len_zeroed(num_bytes);
                 let mut struct_index = 0;
                 let rows: Vec<Value> = rows
                     .iter()
-                    .flat_map(|row| {
-                        if let Value::Array(values) = row {
-                            values.iter().for_each(|_| {
-                                bit_util::set_bit(
-                                    null_buffer.as_slice_mut(),
-                                    struct_index,
-                                );
+                    .flat_map(|row| match row {
+                        Value::Array(values) if !values.is_empty() => {
+                            values.iter().for_each(|value| {
+                                if !value.is_null() {
+                                    bit_util::set_bit(
+                                        null_buffer.as_slice_mut(),
+                                        struct_index,
+                                    );
+                                }
                                 struct_index += 1;
                             });
                             values.clone()
-                        } else {
-                            struct_index += 1;
-                            vec![Value::Null]
+                        }
+                        _ => {
+                            vec![]
                         }
                     })
                     .collect();
@@ -2209,6 +2205,7 @@ mod tests {
         {"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]}
         {"a": null}
         {"a": []}
+        {"a": [null]}
         "#;
         let mut reader = builder.build(Cursor::new(json_content)).unwrap();
 
@@ -2243,23 +2240,23 @@ mod tests {
             .null_bit_buffer(Buffer::from(vec![0b00111111]))
             .build();
         let a_list = ArrayDataBuilder::new(a_field.data_type().clone())
-            .len(5)
-            .add_buffer(Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6]))
+            .len(6)
+            .add_buffer(Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6, 7]))
             .add_child_data(a)
-            .null_bit_buffer(Buffer::from(vec![0b00010111]))
+            .null_bit_buffer(Buffer::from(vec![0b00110111]))
             .build();
         let expected = make_array(a_list);
 
         // compare `a` with result from json reader
         let batch = reader.next().unwrap().unwrap();
         let read = batch.column(0);
-        assert_eq!(read.len(), 5);
+        assert_eq!(read.len(), 6);
         // compare the arrays the long way around, to better detect differences
         let read: &ListArray = read.as_any().downcast_ref::<ListArray>().unwrap();
         let expected = expected.as_any().downcast_ref::<ListArray>().unwrap();
         assert_eq!(
             read.data().buffers()[0],
-            Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6])
+            Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6, 7])
         );
         // compare list null buffers
         assert_eq!(read.data().null_buffer(), expected.data().null_buffer());
diff --git a/arrow/src/json/writer.rs b/arrow/src/json/writer.rs
index eb1f79f..52ef945 100644
--- a/arrow/src/json/writer.rs
+++ b/arrow/src/json/writer.rs
@@ -1259,4 +1259,56 @@ mod tests {
             r#"[{"an":"object"},{"another":"object"}]"#
         );
     }
+
+    #[test]
+    fn json_list_roundtrip() {
+        let json_content = r#"
+        {"list": [{"ints": 1}]}
+        {"list": [{}]}
+        {"list": []}
+        {"list": null}
+        {"list": [{"ints": null}]}
+        {"list": [null]}
+        "#;
+        let ints_struct =
+            DataType::Struct(vec![Field::new("ints", DataType::Int32, true)]);
+        let list_type = DataType::List(Box::new(Field::new("item", ints_struct, true)));
+        let list_field = Field::new("list", list_type, true);
+        let schema = Arc::new(Schema::new(vec![list_field]));
+        let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64);
+        let mut reader = builder.build(std::io::Cursor::new(json_content)).unwrap();
+
+        let batch = reader.next().unwrap().unwrap();
+
+        let list_row = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<ListArray>()
+            .unwrap();
+        let values = list_row.values();
+        assert_eq!(values.len(), 4);
+        assert_eq!(values.null_count(), 1);
+
+        // write the batch to JSON, and compare output with input
+        let mut buf = Vec::new();
+        {
+            let mut writer = LineDelimitedWriter::new(&mut buf);
+            writer.write_batches(&[batch]).unwrap();
+        }
+
+        // NOTE: The last value should technically be {"list": [null]} but it appears
+        // that implementations differ on the treatment of a null struct.
+        // It would be more accurate to return a null struct, so this can be done
+        // as a follow up.
+        assert_eq!(
+            String::from_utf8(buf).unwrap(),
+            r#"{"list":[{"ints":1}]}
+{"list":[{}]}
+{"list":[]}
+{}
+{"list":[{}]}
+{"list":[{}]}
+"#
+        );
+    }
 }

Reply via email to