This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new e898de5 JSON reader - empty nested list should not create child value (#826)
e898de5 is described below
commit e898de57e4587c64387939f8a557bc5fa2dffeb8
Author: Wakahisa <[email protected]>
AuthorDate: Wed Oct 13 15:46:07 2021 +0200
JSON reader - empty nested list should not create child value (#826)
* JSON reader - empty nested list should not create child value
* PR review
---
arrow/src/json/reader.rs | 41 ++++++++++++++++++--------------------
arrow/src/json/writer.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 71 insertions(+), 22 deletions(-)
diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs
index 9592b59..c2a2de9 100644
--- a/arrow/src/json/reader.rs
+++ b/arrow/src/json/reader.rs
@@ -1048,31 +1048,27 @@ impl Decoder {
}
DataType::Struct(fields) => {
// extract list values, with non-lists converted to Value::Null
- let array_item_count = rows
- .iter()
- .map(|row| match row {
- Value::Array(values) => values.len(),
- _ => 1,
- })
- .sum();
+ let array_item_count = cur_offset.to_usize().unwrap();
let num_bytes = bit_util::ceil(array_item_count, 8);
let mut null_buffer =
MutableBuffer::from_len_zeroed(num_bytes);
let mut struct_index = 0;
let rows: Vec<Value> = rows
.iter()
- .flat_map(|row| {
- if let Value::Array(values) = row {
- values.iter().for_each(|_| {
- bit_util::set_bit(
- null_buffer.as_slice_mut(),
- struct_index,
- );
+ .flat_map(|row| match row {
+ Value::Array(values) if !values.is_empty() => {
+ values.iter().for_each(|value| {
+ if !value.is_null() {
+ bit_util::set_bit(
+ null_buffer.as_slice_mut(),
+ struct_index,
+ );
+ }
struct_index += 1;
});
values.clone()
- } else {
- struct_index += 1;
- vec![Value::Null]
+ }
+ _ => {
+ vec![]
}
})
.collect();
@@ -2209,6 +2205,7 @@ mod tests {
{"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]}
{"a": null}
{"a": []}
+ {"a": [null]}
"#;
let mut reader = builder.build(Cursor::new(json_content)).unwrap();
@@ -2243,23 +2240,23 @@ mod tests {
.null_bit_buffer(Buffer::from(vec![0b00111111]))
.build();
let a_list = ArrayDataBuilder::new(a_field.data_type().clone())
- .len(5)
- .add_buffer(Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6]))
+ .len(6)
+ .add_buffer(Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6, 7]))
.add_child_data(a)
- .null_bit_buffer(Buffer::from(vec![0b00010111]))
+ .null_bit_buffer(Buffer::from(vec![0b00110111]))
.build();
let expected = make_array(a_list);
// compare `a` with result from json reader
let batch = reader.next().unwrap().unwrap();
let read = batch.column(0);
- assert_eq!(read.len(), 5);
+ assert_eq!(read.len(), 6);
// compare the arrays the long way around, to better detect differences
let read: &ListArray =
read.as_any().downcast_ref::<ListArray>().unwrap();
let expected = expected.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(
read.data().buffers()[0],
- Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6])
+ Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6, 7])
);
// compare list null buffers
assert_eq!(read.data().null_buffer(), expected.data().null_buffer());
diff --git a/arrow/src/json/writer.rs b/arrow/src/json/writer.rs
index eb1f79f..52ef945 100644
--- a/arrow/src/json/writer.rs
+++ b/arrow/src/json/writer.rs
@@ -1259,4 +1259,56 @@ mod tests {
r#"[{"an":"object"},{"another":"object"}]"#
);
}
+
+ #[test]
+ fn json_list_roundtrip() {
+ let json_content = r#"
+ {"list": [{"ints": 1}]}
+ {"list": [{}]}
+ {"list": []}
+ {"list": null}
+ {"list": [{"ints": null}]}
+ {"list": [null]}
+ "#;
+ let ints_struct =
+ DataType::Struct(vec![Field::new("ints", DataType::Int32, true)]);
+ let list_type = DataType::List(Box::new(Field::new("item", ints_struct, true)));
+ let list_field = Field::new("list", list_type, true);
+ let schema = Arc::new(Schema::new(vec![list_field]));
+ let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64);
+ let mut reader = builder.build(std::io::Cursor::new(json_content)).unwrap();
+
+ let batch = reader.next().unwrap().unwrap();
+
+ let list_row = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<ListArray>()
+ .unwrap();
+ let values = list_row.values();
+ assert_eq!(values.len(), 4);
+ assert_eq!(values.null_count(), 1);
+
+ // write the batch to JSON, and compare output with input
+ let mut buf = Vec::new();
+ {
+ let mut writer = LineDelimitedWriter::new(&mut buf);
+ writer.write_batches(&[batch]).unwrap();
+ }
+
+ // NOTE: The last value should technically be {"list": [null]} but it appears
+ // that implementations differ on the treatment of a null struct.
+ // It would be more accurate to return a null struct, so this can be done
+ // as a follow up.
+ assert_eq!(
+ String::from_utf8(buf).unwrap(),
+ r#"{"list":[{"ints":1}]}
+{"list":[{}]}
+{"list":[]}
+{}
+{"list":[{}]}
+{"list":[{}]}
+"#
+ );
+ }
}