alamb commented on a change in pull request #1142:
URL: https://github.com/apache/arrow-datafusion/pull/1142#discussion_r732202212
##########
File path: datafusion/src/scalar.rs
##########
@@ -833,6 +838,74 @@ impl ScalarValue {
Ok(array)
}
+ fn iter_to_array_list(
+ scalars: impl IntoIterator<Item = ScalarValue>,
+ data_type: &DataType,
+ ) -> Result<GenericListArray<i32>> {
+ let mut offsets = Int32Array::builder(0);
+ if let Err(err) = offsets.append_value(0) {
+ return Err(DataFusionError::ArrowError(err));
+ }
+
+ let mut elements: Vec<ArrayRef> = Vec::new();
+ let mut valid = BooleanBufferBuilder::new(0);
+ let mut flat_len = 0i32;
+ for scalar in scalars {
+ if let ScalarValue::List(values, _) = scalar {
+ match values {
+ Some(values) => {
+ let element_array =
+
ScalarValue::iter_to_array(values.as_ref().clone())?;
+
+ // Add new offset index
+ flat_len += element_array.len() as i32;
+ if let Err(err) = offsets.append_value(flat_len) {
+ return Err(DataFusionError::ArrowError(err));
+ }
+
+ elements.push(element_array);
+
+ // Element is valid
+ valid.append(true);
+ }
+ None => {
+ // Repeat previous offset index
+ if let Err(err) = offsets.append_value(flat_len) {
+ return Err(DataFusionError::ArrowError(err));
+ }
+
+ // Element is null
+ valid.append(false);
+ }
+ }
+ } else {
+ return Err(DataFusionError::Internal(format!(
+ "Expected ScalarValue::List element. Received {:?}",
+ scalar
+ )));
+ }
+ }
+
+ // Concatenate element arrays to create single flat array
+ let element_arrays: Vec<&dyn Array> =
+ elements.iter().map(|a| a.as_ref()).collect();
+ let flat_array = match arrow::compute::concat(&element_arrays) {
+ Ok(flat_array) => flat_array,
+ Err(err) => return Err(DataFusionError::ArrowError(err)),
+ };
+
+ // Build ListArray using ArrayData so we can specify a flat inner
array, and offset indices
Review comment:
Looks good 👍
##########
File path: datafusion/src/scalar.rs
##########
@@ -945,7 +1018,15 @@ impl ScalarValue {
&DataType::LargeUtf8 => {
build_list!(LargeStringBuilder, LargeUtf8, values, size)
}
- dt => panic!("Unexpected DataType for list {:?}", dt),
+ _ => ScalarValue::iter_to_array_list(
Review comment:
I wonder if we could remove all the other `DataType` matches above and
consolidate the code to use `iter_to_array_list` rather than `build_list!`
Though perhaps that code cleanup would lose some optimizations (aka would
concat a lot of small arrays) 🤔
##########
File path: datafusion/src/scalar.rs
##########
@@ -833,6 +838,74 @@ impl ScalarValue {
Ok(array)
}
+ fn iter_to_array_list(
+ scalars: impl IntoIterator<Item = ScalarValue>,
+ data_type: &DataType,
+ ) -> Result<GenericListArray<i32>> {
+ let mut offsets = Int32Array::builder(0);
+ if let Err(err) = offsets.append_value(0) {
+ return Err(DataFusionError::ArrowError(err));
+ }
+
+ let mut elements: Vec<ArrayRef> = Vec::new();
+ let mut valid = BooleanBufferBuilder::new(0);
+ let mut flat_len = 0i32;
+ for scalar in scalars {
+ if let ScalarValue::List(values, _) = scalar {
+ match values {
+ Some(values) => {
+ let element_array =
+
ScalarValue::iter_to_array(values.as_ref().clone())?;
+
+ // Add new offset index
+ flat_len += element_array.len() as i32;
+ if let Err(err) = offsets.append_value(flat_len) {
+ return Err(DataFusionError::ArrowError(err));
+ }
+
+ elements.push(element_array);
+
+ // Element is valid
+ valid.append(true);
+ }
+ None => {
+ // Repeat previous offset index
+ if let Err(err) = offsets.append_value(flat_len) {
+ return Err(DataFusionError::ArrowError(err));
+ }
+
+ // Element is null
+ valid.append(false);
+ }
+ }
+ } else {
+ return Err(DataFusionError::Internal(format!(
+ "Expected ScalarValue::List element. Received {:?}",
+ scalar
+ )));
+ }
+ }
+
+ // Concatenate element arrays to create single flat array
+ let element_arrays: Vec<&dyn Array> =
+ elements.iter().map(|a| a.as_ref()).collect();
+ let flat_array = match arrow::compute::concat(&element_arrays) {
+ Ok(flat_array) => flat_array,
+ Err(err) => return Err(DataFusionError::ArrowError(err)),
+ };
+
+ // Build ListArray using ArrayData so we can specify a flat inner
array, and offset indices
+ let offsets_array = offsets.finish();
+ let array_data = ArrayDataBuilder::new(data_type.clone())
+ .len(offsets_array.len() - 1)
+ .null_bit_buffer(valid.finish())
+ .add_buffer(offsets_array.data().buffers()[0].clone())
+ .add_child_data(flat_array.data().clone());
+
+ let list_array = ListArray::from(array_data.build());
Review comment:
When you rebase this PR, you will likely have to change this to
`array_data.build()?` as `build()` is fallible as of arrow-6.0.0
##########
File path: datafusion/src/scalar.rs
##########
@@ -833,6 +838,74 @@ impl ScalarValue {
Ok(array)
}
+ fn iter_to_array_list(
+ scalars: impl IntoIterator<Item = ScalarValue>,
+ data_type: &DataType,
+ ) -> Result<GenericListArray<i32>> {
+ let mut offsets = Int32Array::builder(0);
+ if let Err(err) = offsets.append_value(0) {
+ return Err(DataFusionError::ArrowError(err));
+ }
+
+ let mut elements: Vec<ArrayRef> = Vec::new();
+ let mut valid = BooleanBufferBuilder::new(0);
+ let mut flat_len = 0i32;
+ for scalar in scalars {
+ if let ScalarValue::List(values, _) = scalar {
+ match values {
+ Some(values) => {
+ let element_array =
+
ScalarValue::iter_to_array(values.as_ref().clone())?;
Review comment:
```suggestion
let element_array =
ScalarValue::iter_to_array(*value)?;
```
I think you can avoid the clone by unboxing the values (I definitely get
confused that a `*` deref of a box is how one unboxes values)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]