This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 3f0963d020 Improve documentation on how to build `ScalarValue::Struct` 
and add `ScalarStructBuilder` (#9229)
3f0963d020 is described below

commit 3f0963d020f308aa3168502830ccbe6bf7ba69b7
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Feb 15 07:25:47 2024 -0500

    Improve documentation on how to build `ScalarValue::Struct` and add 
`ScalarStructBuilder` (#9229)
    
    * Improve documentation on how to build `ScalarValue::Struct` and add 
`ScalarStructBuilder`
    
    * Update datafusion/common/src/scalar/struct_builder.rs
    
    * Improved docs
    
    * update test
    
    ---------
    
    Co-authored-by: comphead <[email protected]>
---
 datafusion/common/src/{scalar.rs => scalar/mod.rs} | 181 ++++++++++++++-------
 datafusion/common/src/scalar/struct_builder.rs     | 152 +++++++++++++++++
 .../proto/tests/cases/roundtrip_logical_plan.rs    |  20 +--
 3 files changed, 282 insertions(+), 71 deletions(-)

diff --git a/datafusion/common/src/scalar.rs 
b/datafusion/common/src/scalar/mod.rs
similarity index 98%
rename from datafusion/common/src/scalar.rs
rename to datafusion/common/src/scalar/mod.rs
index 2395f8acc4..29107ab10e 100644
--- a/datafusion/common/src/scalar.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -15,7 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! This module provides ScalarValue, an enum that can be used for storage of 
single elements
+//! [`ScalarValue`]: stores single  values
+
+mod struct_builder;
 
 use std::borrow::Borrow;
 use std::cmp::Ordering;
@@ -43,24 +45,45 @@ use arrow::{
     compute::kernels::cast::{cast_with_options, CastOptions},
     datatypes::{
         i256, ArrowDictionaryKeyType, ArrowNativeType, ArrowTimestampType, 
DataType,
-        Field, Fields, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type,
+        Field, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type,
         IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit,
-        IntervalYearMonthType, SchemaBuilder, TimeUnit, 
TimestampMicrosecondType,
+        IntervalYearMonthType, TimeUnit, TimestampMicrosecondType,
         TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
         UInt16Type, UInt32Type, UInt64Type, UInt8Type, 
DECIMAL128_MAX_PRECISION,
     },
 };
 use arrow_array::cast::as_list_array;
 use arrow_array::{ArrowNativeTypeOp, Scalar};
-use arrow_buffer::NullBuffer;
 
-/// A dynamically typed, nullable single value, (the single-valued counter-part
-/// to arrow's [`Array`])
+pub use struct_builder::ScalarStructBuilder;
+
+/// A dynamically typed, nullable single value.
+///
+/// While an arrow  [`Array`]) stores one or more values of the same type, in a
+/// single column, a `ScalarValue` stores a single value of a single type, the
+/// equivalent of 1 row and one column.
+///
+/// ```text
+///  ┌────────┐
+///  │ value1 │
+///  │ value2 │                  ┌────────┐
+///  │ value3 │                  │ value2 │
+///  │  ...   │                  └────────┘
+///  │ valueN │
+///  └────────┘
+///
+///    Array                     ScalarValue
+///
+/// stores multiple,             stores a single,
+/// possibly null, values of     possible null, value
+/// the same type
+/// ```
 ///
 /// # Performance
 ///
-/// In general, please use arrow [`Array`]s rather than [`ScalarValue`] 
whenever
-/// possible, as it is far more efficient for multiple values.
+/// In general, performance will be better using arrow [`Array`]s rather than
+/// [`ScalarValue`], as it is far more efficient to process multiple values at
+/// once (vecctorized processing).
 ///
 /// # Example
 /// ```
@@ -99,6 +122,66 @@ use arrow_buffer::NullBuffer;
 /// # }
 /// ```
 ///
+/// # Nested Types
+///
+/// `List` / `LargeList` / `FixedSizeList` / `Struct` are represented as a
+/// single element array of the corresponding type.
+///
+/// ## Example: Creating [`ScalarValue::Struct`] using [`ScalarStructBuilder`]
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow::datatypes::{DataType, Field};
+/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder};
+/// // Build a struct like: {a: 1, b: "foo"}
+/// let field_a = Field::new("a", DataType::Int32, false);
+/// let field_b = Field::new("b", DataType::Utf8, false);
+///
+/// let s1 = ScalarStructBuilder::new()
+///    .with_scalar(field_a, ScalarValue::from(1i32))
+///    .with_scalar(field_b, ScalarValue::from("foo"))
+///    .build();
+/// ```
+///
+/// ## Example: Creating a null [`ScalarValue::Struct`] using 
[`ScalarStructBuilder`]
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow::datatypes::{DataType, Field};
+/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder};
+/// // Build a struct representing a NULL value
+/// let fields = vec![
+///     Field::new("a", DataType::Int32, false),
+///     Field::new("b", DataType::Utf8, false),
+/// ];
+///
+/// let s1 = ScalarStructBuilder::new_null(fields);
+/// ```
+///
+/// ## Example: Creating [`ScalarValue::Struct`] directly
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow::datatypes::{DataType, Field, Fields};
+/// # use arrow_array::{ArrayRef, Int32Array, StructArray, StringArray};
+/// # use datafusion_common::ScalarValue;
+/// // Build a struct like: {a: 1, b: "foo"}
+/// // Field description
+/// let fields = Fields::from(vec![
+///   Field::new("a", DataType::Int32, false),
+///   Field::new("b", DataType::Utf8, false),
+/// ]);
+/// // one row arrays for each field
+/// let arrays: Vec<ArrayRef> = vec![
+///   Arc::new(Int32Array::from(vec![1])),
+///   Arc::new(StringArray::from(vec!["foo"])),
+/// ];
+/// // no nulls for this array
+/// let nulls = None;
+/// let arr = StructArray::new(fields, arrays, nulls);
+///
+/// // Create a ScalarValue::Struct directly
+/// let s1 = ScalarValue::Struct(Arc::new(arr));
+/// ```
+///
+///
 /// # Further Reading
 /// See [datatypes](https://arrow.apache.org/docs/python/api/datatypes.html) 
for
 /// details on datatypes and the 
[format](https://github.com/apache/arrow/blob/master/format/Schema.fbs#L354-L375)
@@ -153,7 +236,8 @@ pub enum ScalarValue {
     List(Arc<ListArray>),
     /// The array must be a LargeListArray with length 1.
     LargeList(Arc<LargeListArray>),
-    /// Represents a single element of a [`StructArray`] as an [`ArrayRef`]
+    /// Represents a single element [`StructArray`] as an [`ArrayRef`]. See
+    /// [`ScalarValue`] for examples of how to create instances of this type.
     Struct(Arc<StructArray>),
     /// Date stored as a signed 32bit int days since UNIX epoch 1970-01-01
     Date32(Option<i32>),
@@ -2679,20 +2763,13 @@ impl From<Option<&str>> for ScalarValue {
 /// Wrapper to create ScalarValue::Struct for convenience
 impl From<Vec<(&str, ScalarValue)>> for ScalarValue {
     fn from(value: Vec<(&str, ScalarValue)>) -> Self {
-        let (fields, scalars): (SchemaBuilder, Vec<_>) = value
-            .into_iter()
-            .map(|(name, scalar)| (Field::new(name, scalar.data_type(), 
false), scalar))
-            .unzip();
-
-        let arrays = scalars
+        value
             .into_iter()
-            .map(|scalar| scalar.to_array().unwrap())
-            .collect::<Vec<ArrayRef>>();
-
-        let fields = fields.finish().fields;
-        let struct_array = StructArray::try_new(fields, arrays, None).unwrap();
-
-        Self::Struct(Arc::new(struct_array))
+            .fold(ScalarStructBuilder::new(), |builder, (name, value)| {
+                builder.with_name_and_scalar(name, value)
+            })
+            .build()
+            .unwrap()
     }
 }
 
@@ -2710,27 +2787,6 @@ impl From<String> for ScalarValue {
     }
 }
 
-// TODO: Remove this after changing to Scalar<T>
-// Wrapper for ScalarValue::Struct that checks the length of the arrays, 
without nulls
-impl From<(Fields, Vec<ArrayRef>)> for ScalarValue {
-    fn from((fields, arrays): (Fields, Vec<ArrayRef>)) -> Self {
-        Self::from((fields, arrays, None))
-    }
-}
-
-// TODO: Remove this after changing to Scalar<T>
-// Wrapper for ScalarValue::Struct that checks the length of the arrays
-impl From<(Fields, Vec<ArrayRef>, Option<NullBuffer>)> for ScalarValue {
-    fn from(
-        (fields, arrays, nulls): (Fields, Vec<ArrayRef>, Option<NullBuffer>),
-    ) -> Self {
-        for arr in arrays.iter() {
-            assert_eq!(arr.len(), 1);
-        }
-        Self::Struct(Arc::new(StructArray::new(fields, arrays, nulls)))
-    }
-}
-
 macro_rules! impl_try_from {
     ($SCALAR:ident, $NATIVE:ident) => {
         impl TryFrom<ScalarValue> for $NATIVE {
@@ -3247,6 +3303,7 @@ mod tests {
     use arrow::datatypes::{ArrowNumericType, ArrowPrimitiveType};
     use arrow::util::pretty::pretty_format_columns;
     use arrow_buffer::Buffer;
+    use arrow_schema::Fields;
     use chrono::NaiveDate;
     use rand::Rng;
 
@@ -3266,31 +3323,33 @@ mod tests {
             ),
         ]);
 
-        let arrays = vec![boolean as ArrayRef, int as ArrayRef];
-        let fields = Fields::from(vec![
-            Field::new("b", DataType::Boolean, false),
-            Field::new("c", DataType::Int32, false),
-        ]);
-        let sv = ScalarValue::from((fields, arrays));
+        let sv = ScalarStructBuilder::new()
+            .with_array(Field::new("b", DataType::Boolean, false), boolean)
+            .with_array(Field::new("c", DataType::Int32, false), int)
+            .build()
+            .unwrap();
+
         let struct_arr = sv.to_array().unwrap();
         let actual = as_struct_array(&struct_arr).unwrap();
         assert_eq!(actual, &expected);
     }
 
     #[test]
-    #[should_panic(expected = "assertion `left == right` failed")]
+    #[should_panic(
+        expected = "Error building ScalarValue::Struct. Expected array with 
exactly one element, found array with 4 elements"
+    )]
     fn test_scalar_value_from_for_struct_should_panic() {
-        let fields = Fields::from(vec![
-            Field::new("bool", DataType::Boolean, false),
-            Field::new("i32", DataType::Int32, false),
-        ]);
-
-        let arrays = vec![
-            Arc::new(BooleanArray::from(vec![false, true, false, false])) as 
ArrayRef,
-            Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
-        ];
-
-        let _ = ScalarValue::from((fields, arrays));
+        let _ = ScalarStructBuilder::new()
+            .with_array(
+                Field::new("bool", DataType::Boolean, false),
+                Arc::new(BooleanArray::from(vec![false, true, false, false])),
+            )
+            .with_array(
+                Field::new("i32", DataType::Int32, false),
+                Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
+            )
+            .build()
+            .unwrap();
     }
 
     #[test]
diff --git a/datafusion/common/src/scalar/struct_builder.rs 
b/datafusion/common/src/scalar/struct_builder.rs
new file mode 100644
index 0000000000..926e100417
--- /dev/null
+++ b/datafusion/common/src/scalar/struct_builder.rs
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`ScalarStructBuilder`] for building [`ScalarValue::Struct`]
+
+use crate::error::_internal_err;
+use crate::{DataFusionError, Result, ScalarValue};
+use arrow::array::{ArrayRef, StructArray};
+use arrow::datatypes::{DataType, FieldRef, Fields};
+use arrow_schema::Field;
+use std::sync::Arc;
+
+/// Builder for [`ScalarValue::Struct`].
+///
+/// See examples on [`ScalarValue`]
+#[derive(Debug, Default)]
+pub struct ScalarStructBuilder {
+    fields: Vec<FieldRef>,
+    arrays: Vec<ArrayRef>,
+}
+
+impl ScalarStructBuilder {
+    /// Create a new `ScalarStructBuilder`
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Return a new [`ScalarValue::Struct`] with the specified fields and a
+    /// single null value
+    pub fn new_null(fields: impl IntoFields) -> ScalarValue {
+        DataType::Struct(fields.into()).try_into().unwrap()
+    }
+
+    /// Add the specified field and [`ArrayRef`] to the struct.
+    ///
+    /// Note the array should have a single row.
+    pub fn with_array(mut self, field: impl IntoFieldRef, value: ArrayRef) -> 
Self {
+        self.fields.push(field.into_field_ref());
+        self.arrays.push(value);
+        self
+    }
+
+    /// Add the specified field and `ScalarValue` to the struct.
+    pub fn with_scalar(self, field: impl IntoFieldRef, value: ScalarValue) -> 
Self {
+        // valid scalar value should not fail
+        let array = value.to_array().unwrap();
+        self.with_array(field, array)
+    }
+
+    /// Add a field with the specified name and value to the struct.
+    /// the field is created with the specified data type and as non nullable
+    pub fn with_name_and_scalar(self, name: &str, value: ScalarValue) -> Self {
+        let field = Field::new(name, value.data_type(), false);
+        self.with_scalar(field, value)
+    }
+
+    /// Return a [`ScalarValue::Struct`] with the fields and values added so 
far
+    ///
+    /// # Errors
+    ///
+    /// If the [`StructArray`] cannot be created (for example if there is a
+    /// mismatch between field types and arrays) or the arrays do not have
+    /// exactly one element.
+    pub fn build(self) -> Result<ScalarValue> {
+        let Self { fields, arrays } = self;
+
+        for array in &arrays {
+            if array.len() != 1 {
+                return _internal_err!(
+                    "Error building ScalarValue::Struct. \
+                Expected array with exactly one element, found array with {} 
elements",
+                    array.len()
+                );
+            }
+        }
+
+        let struct_array = StructArray::try_new(Fields::from(fields), arrays, 
None)?;
+        Ok(ScalarValue::Struct(Arc::new(struct_array)))
+    }
+}
+
+/// Trait for converting a type into a [`FieldRef`]
+///
+/// Used to avoid having to call `clone()` on a `FieldRef` when adding a field 
to
+/// a `ScalarStructBuilder`.
+///
+/// TODO potentially upstream this to arrow-rs so that we can
+/// use impl `Into<FieldRef>` instead
+pub trait IntoFieldRef {
+    fn into_field_ref(self) -> FieldRef;
+}
+
+impl IntoFieldRef for FieldRef {
+    fn into_field_ref(self) -> FieldRef {
+        self
+    }
+}
+
+impl IntoFieldRef for &FieldRef {
+    fn into_field_ref(self) -> FieldRef {
+        self.clone()
+    }
+}
+
+impl IntoFieldRef for Field {
+    fn into_field_ref(self) -> FieldRef {
+        FieldRef::new(self)
+    }
+}
+
+/// Trait for converting a type into a [`Fields`]
+///
+/// This avoids to avoid having to call clone() on an Arc'd `Fields` when 
adding
+/// a field to a `ScalarStructBuilder`
+///
+/// TODO potentially upstream this to arrow-rs so that we can
+/// use impl `Into<Fields>` instead
+pub trait IntoFields {
+    fn into(self) -> Fields;
+}
+
+impl IntoFields for Fields {
+    fn into(self) -> Fields {
+        self
+    }
+}
+
+impl IntoFields for &Fields {
+    fn into(self) -> Fields {
+        self.clone()
+    }
+}
+
+impl IntoFields for Vec<Field> {
+    fn into(self) -> Fields {
+        Fields::from(self)
+    }
+}
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs 
b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index b6d288da2c..68a318b5a6 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -21,7 +21,6 @@ use std::fmt::{self, Debug, Formatter};
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, FixedSizeListArray};
-use arrow::array::{BooleanArray, Int32Array};
 use arrow::csv::WriterBuilder;
 use arrow::datatypes::{
     DataType, Field, Fields, Int32Type, IntervalDayTimeType, 
IntervalMonthDayNanoType,
@@ -42,6 +41,7 @@ use 
datafusion_common::file_options::csv_writer::CsvWriterOptions;
 use datafusion_common::file_options::parquet_writer::ParquetWriterOptions;
 use datafusion_common::file_options::StatementOptions;
 use datafusion_common::parsers::CompressionTypeVariant;
+use datafusion_common::scalar::ScalarStructBuilder;
 use datafusion_common::{internal_err, not_impl_err, plan_err, 
FileTypeWriterOptions};
 use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError, 
ScalarValue};
 use datafusion_common::{FileType, Result};
@@ -932,17 +932,17 @@ fn round_trip_scalar_values() {
         ScalarValue::Binary(None),
         ScalarValue::LargeBinary(Some(b"bar".to_vec())),
         ScalarValue::LargeBinary(None),
-        ScalarValue::from((
-            vec![
+        ScalarStructBuilder::new()
+            .with_scalar(
                 Field::new("a", DataType::Int32, true),
+                ScalarValue::from(23i32),
+            )
+            .with_scalar(
                 Field::new("b", DataType::Boolean, false),
-            ]
-            .into(),
-            vec![
-                Arc::new(Int32Array::from(vec![Some(23)])) as ArrayRef,
-                Arc::new(BooleanArray::from(vec![Some(false)])) as ArrayRef,
-            ],
-        )),
+                ScalarValue::from(false),
+            )
+            .build()
+            .unwrap(),
         ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
             Field::new("a", DataType::Int32, true),
             Field::new("b", DataType::Boolean, false),

Reply via email to