nevi-me commented on a change in pull request #8938:
URL: https://github.com/apache/arrow/pull/8938#discussion_r544606621



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -888,265 +855,362 @@ impl Decoder {
         ))
     }
 
-    fn build_list_array<T: ArrowPrimitiveType>(
+    /// Build a nested GenericListArray from a list of unnested `Value`s
+    fn build_nested_list_array<OffsetSize: OffsetSizeTrait>(
         &self,
         rows: &[Value],
-        col_name: &str,
-    ) -> Result<ArrayRef>
-    where
-        T::Native: num::NumCast,
-    {
-        let values_builder: PrimitiveBuilder<T> = 
PrimitiveBuilder::new(rows.len());
-        let mut builder = ListBuilder::new(values_builder);
-        for row in rows {
-            if let Some(value) = row.get(&col_name) {
-                // value can be an array or a scalar
-                let vals: Vec<Option<f64>> = if let Value::Number(value) = 
value {
-                    vec![value.as_f64()]
-                } else if let Value::Array(n) = value {
-                    n.iter().map(|v: &Value| v.as_f64()).collect()
-                } else if let Value::Null = value {
-                    vec![None]
-                } else {
-                    return Err(ArrowError::JsonError(
-                        "3Only scalars are currently supported in JSON arrays"
-                            .to_string(),
-                    ));
-                };
-                for val in vals {
-                    match val {
-                        Some(v) => match num::cast::cast(v) {
-                            Some(v) => builder.values().append_value(v)?,
-                            None => builder.values().append_null()?,
-                        },
-                        None => builder.values().append_null()?,
-                    };
-                }
+        list_field: &Field,
+    ) -> Result<ArrayRef> {
+        // build list offsets
+        let mut cur_offset = OffsetSize::zero();
+        let list_len = rows.len();
+        let num_list_bytes = bit_util::ceil(list_len, 8);
+        let mut offsets = Vec::with_capacity(list_len + 1);
+        let mut list_nulls =
+            MutableBuffer::new(num_list_bytes).with_bitset(num_list_bytes, 
false);
+        offsets.push(cur_offset);
+        rows.iter().enumerate().for_each(|(i, v)| {
+            if let Value::Array(a) = v {
+                cur_offset = cur_offset + 
OffsetSize::from_usize(a.len()).unwrap();
+                bit_util::set_bit(list_nulls.data_mut(), i);
+            } else if let Value::Null = v {
+                // value is null, not incremented
+            } else {
+                cur_offset = cur_offset + OffsetSize::one();
             }
-            builder.append(true)?
-        }
-        Ok(Arc::new(builder.finish()))
+            offsets.push(cur_offset);
+        });
+        let valid_len = cur_offset.to_usize().unwrap();
+        let array_data = match list_field.data_type() {
+            DataType::Null => NullArray::new(valid_len).data(),
+            DataType::Boolean => {
+                let num_bytes = bit_util::ceil(valid_len, 8);
+                let mut bool_values =
+                    MutableBuffer::new(num_bytes).with_bitset(num_bytes, 
false);
+                let mut bool_nulls =
+                    MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
+                let mut curr_index = 0;
+                rows.iter().for_each(|v| {
+                    if let Value::Array(vs) = v {
+                        vs.iter().for_each(|value| {
+                            if let Value::Bool(child) = value {
+                                // if valid boolean, append value
+                                if *child {
+                                    bit_util::set_bit(bool_values.data_mut(), 
curr_index);
+                                }
+                            } else {
+                                // null slot
+                                bit_util::unset_bit(bool_nulls.data_mut(), 
curr_index);
+                            }
+                            curr_index += 1;
+                        });
+                    }
+                });
+                ArrayData::builder(list_field.data_type().clone())
+                    .len(valid_len)
+                    .add_buffer(bool_values.freeze())
+                    .null_bit_buffer(bool_nulls.freeze())
+                    .build()
+            }
+            DataType::Int8 => 
self.read_primitive_list_values::<Int8Type>(rows),
+            DataType::Int16 => 
self.read_primitive_list_values::<Int16Type>(rows),
+            DataType::Int32 => 
self.read_primitive_list_values::<Int32Type>(rows),
+            DataType::Int64 => 
self.read_primitive_list_values::<Int64Type>(rows),
+            DataType::UInt8 => 
self.read_primitive_list_values::<UInt8Type>(rows),
+            DataType::UInt16 => 
self.read_primitive_list_values::<UInt16Type>(rows),
+            DataType::UInt32 => 
self.read_primitive_list_values::<UInt32Type>(rows),
+            DataType::UInt64 => 
self.read_primitive_list_values::<UInt64Type>(rows),
+            DataType::Float16 => {
+                return Err(ArrowError::JsonError("Float16 not 
supported".to_string()))
+            }
+            DataType::Float32 => 
self.read_primitive_list_values::<Float32Type>(rows),
+            DataType::Float64 => 
self.read_primitive_list_values::<Float64Type>(rows),
+            DataType::Timestamp(_, _)
+            | DataType::Date32(_)
+            | DataType::Date64(_)
+            | DataType::Time32(_)
+            | DataType::Time64(_) => {
+                return Err(ArrowError::JsonError(
+                    "Temporal types are not yet supported, see 
ARROW-4803".to_string(),
+                ))
+            }
+            DataType::Utf8 => {
+                
StringArray::from_iter(flatten_json_string_values(rows).into_iter())
+                    .data()
+            }
+            DataType::LargeUtf8 => {
+                
LargeStringArray::from_iter(flatten_json_string_values(rows).into_iter())
+                    .data()
+            }
+            DataType::List(field) => {
+                let child = self
+                    
.build_nested_list_array::<i32>(&flatten_json_values(rows), field)?;
+                child.data()
+            }
+            DataType::LargeList(field) => {
+                let child = self
+                    
.build_nested_list_array::<i64>(&flatten_json_values(rows), field)?;
+                child.data()
+            }
+            DataType::Struct(fields) => {
+                // extract list values, with non-lists converted to Value::Null
+                let len = rows.len();
+                let num_bytes = bit_util::ceil(len, 8);
+                let mut null_buffer =
+                    MutableBuffer::new(num_bytes).with_bitset(num_bytes, 
false);
+                let mut struct_index = 0;
+                let rows: Vec<Value> = rows
+                    .iter()
+                    .map(|row| {
+                        if let Value::Array(values) = row {
+                            values.iter().for_each(|_| {
+                                bit_util::set_bit(null_buffer.data_mut(), 
struct_index);
+                                struct_index += 1;

Review comment:
       I ended up being forced to collect into a `Vec` because I was getting 
panics from "Iterator must be sized". We rely on `ExactSizeIterator`, which 
doesn't work when flattening lists because the exact number of flattened 
values can't be determined in advance.
   I'm not pleased with having to collect and then iterate, but it was the 
only thing that seemed to work.
   
   I don't mind if you make some changes and push them onto my branch.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to