Dandandan commented on a change in pull request #8938:
URL: https://github.com/apache/arrow/pull/8938#discussion_r544587363
##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -888,265 +855,362 @@ impl Decoder {
))
}
- fn build_list_array<T: ArrowPrimitiveType>(
+ /// Build a nested GenericListArray from a list of unnested `Value`s
+ fn build_nested_list_array<OffsetSize: OffsetSizeTrait>(
&self,
rows: &[Value],
- col_name: &str,
- ) -> Result<ArrayRef>
- where
- T::Native: num::NumCast,
- {
- let values_builder: PrimitiveBuilder<T> =
PrimitiveBuilder::new(rows.len());
- let mut builder = ListBuilder::new(values_builder);
- for row in rows {
- if let Some(value) = row.get(&col_name) {
- // value can be an array or a scalar
- let vals: Vec<Option<f64>> = if let Value::Number(value) =
value {
- vec![value.as_f64()]
- } else if let Value::Array(n) = value {
- n.iter().map(|v: &Value| v.as_f64()).collect()
- } else if let Value::Null = value {
- vec![None]
- } else {
- return Err(ArrowError::JsonError(
- "3Only scalars are currently supported in JSON arrays"
- .to_string(),
- ));
- };
- for val in vals {
- match val {
- Some(v) => match num::cast::cast(v) {
- Some(v) => builder.values().append_value(v)?,
- None => builder.values().append_null()?,
- },
- None => builder.values().append_null()?,
- };
- }
+ list_field: &Field,
+ ) -> Result<ArrayRef> {
+ // build list offsets
+ let mut cur_offset = OffsetSize::zero();
+ let list_len = rows.len();
+ let num_list_bytes = bit_util::ceil(list_len, 8);
+ let mut offsets = Vec::with_capacity(list_len + 1);
+ let mut list_nulls =
+ MutableBuffer::new(num_list_bytes).with_bitset(num_list_bytes,
false);
+ offsets.push(cur_offset);
+ rows.iter().enumerate().for_each(|(i, v)| {
+ if let Value::Array(a) = v {
+ cur_offset = cur_offset +
OffsetSize::from_usize(a.len()).unwrap();
+ bit_util::set_bit(list_nulls.data_mut(), i);
+ } else if let Value::Null = v {
+ // value is null, not incremented
+ } else {
+ cur_offset = cur_offset + OffsetSize::one();
}
- builder.append(true)?
- }
- Ok(Arc::new(builder.finish()))
+ offsets.push(cur_offset);
+ });
+ let valid_len = cur_offset.to_usize().unwrap();
+ let array_data = match list_field.data_type() {
+ DataType::Null => NullArray::new(valid_len).data(),
+ DataType::Boolean => {
+ let num_bytes = bit_util::ceil(valid_len, 8);
+ let mut bool_values =
+ MutableBuffer::new(num_bytes).with_bitset(num_bytes,
false);
+ let mut bool_nulls =
+ MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
+ let mut curr_index = 0;
+ rows.iter().for_each(|v| {
+ if let Value::Array(vs) = v {
+ vs.iter().for_each(|value| {
+ if let Value::Bool(child) = value {
+ // if valid boolean, append value
+ if *child {
+ bit_util::set_bit(bool_values.data_mut(),
curr_index);
+ }
+ } else {
+ // null slot
+ bit_util::unset_bit(bool_nulls.data_mut(),
curr_index);
+ }
+ curr_index += 1;
+ });
+ }
+ });
+ ArrayData::builder(list_field.data_type().clone())
+ .len(valid_len)
+ .add_buffer(bool_values.freeze())
+ .null_bit_buffer(bool_nulls.freeze())
+ .build()
+ }
+ DataType::Int8 =>
self.read_primitive_list_values::<Int8Type>(rows),
+ DataType::Int16 =>
self.read_primitive_list_values::<Int16Type>(rows),
+ DataType::Int32 =>
self.read_primitive_list_values::<Int32Type>(rows),
+ DataType::Int64 =>
self.read_primitive_list_values::<Int64Type>(rows),
+ DataType::UInt8 =>
self.read_primitive_list_values::<UInt8Type>(rows),
+ DataType::UInt16 =>
self.read_primitive_list_values::<UInt16Type>(rows),
+ DataType::UInt32 =>
self.read_primitive_list_values::<UInt32Type>(rows),
+ DataType::UInt64 =>
self.read_primitive_list_values::<UInt64Type>(rows),
+ DataType::Float16 => {
+ return Err(ArrowError::JsonError("Float16 not
supported".to_string()))
+ }
+ DataType::Float32 =>
self.read_primitive_list_values::<Float32Type>(rows),
+ DataType::Float64 =>
self.read_primitive_list_values::<Float64Type>(rows),
+ DataType::Timestamp(_, _)
+ | DataType::Date32(_)
+ | DataType::Date64(_)
+ | DataType::Time32(_)
+ | DataType::Time64(_) => {
+ return Err(ArrowError::JsonError(
+ "Temporal types are not yet supported, see
ARROW-4803".to_string(),
+ ))
+ }
+ DataType::Utf8 => {
+
StringArray::from_iter(flatten_json_string_values(rows).into_iter())
+ .data()
+ }
+ DataType::LargeUtf8 => {
+
LargeStringArray::from_iter(flatten_json_string_values(rows).into_iter())
+ .data()
+ }
+ DataType::List(field) => {
+ let child = self
+
.build_nested_list_array::<i32>(&flatten_json_values(rows), field)?;
+ child.data()
+ }
+ DataType::LargeList(field) => {
+ let child = self
+
.build_nested_list_array::<i64>(&flatten_json_values(rows), field)?;
+ child.data()
+ }
+ DataType::Struct(fields) => {
+ // extract list values, with non-lists converted to Value::Null
+ let len = rows.len();
+ let num_bytes = bit_util::ceil(len, 8);
+ let mut null_buffer =
+ MutableBuffer::new(num_bytes).with_bitset(num_bytes,
false);
+ let mut struct_index = 0;
+ let rows: Vec<Value> = rows
+ .iter()
+ .map(|row| {
+ if let Value::Array(values) = row {
+ values.iter().for_each(|_| {
+ bit_util::set_bit(null_buffer.data_mut(),
struct_index);
+ struct_index += 1;
+ });
+ values.clone()
+ } else {
+ struct_index += 1;
+ vec![Value::Null]
+ }
+ })
+ .flatten()
+ .collect();
+ let arrays =
+ self.build_struct_array(rows.as_slice(),
fields.as_slice(), &[])?;
+ let data_type = DataType::Struct(fields.clone());
+ let buf = null_buffer.freeze();
+ ArrayDataBuilder::new(data_type)
+ .len(rows.len())
+ .null_bit_buffer(buf)
+ .child_data(arrays.into_iter().map(|a| a.data()).collect())
+ .build()
+ }
+ DataType::Dictionary(_, _) => {
+ todo!()
+ }
+ t => {
Review comment:
Maybe call this binding `datatype`?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org