This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 3f0963d020 Improve documentation on how to build `ScalarValue::Struct`
and add `ScalarStructBuilder` (#9229)
3f0963d020 is described below
commit 3f0963d020f308aa3168502830ccbe6bf7ba69b7
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Feb 15 07:25:47 2024 -0500
Improve documentation on how to build `ScalarValue::Struct` and add
`ScalarStructBuilder` (#9229)
* Improve documentation on how to build `ScalarValue::Struct` and add
`ScalarStructBuilder`
* Update datafusion/common/src/scalar/struct_builder.rs
* Improved docs
* update test
---------
Co-authored-by: comphead <[email protected]>
---
datafusion/common/src/{scalar.rs => scalar/mod.rs} | 181 ++++++++++++++-------
datafusion/common/src/scalar/struct_builder.rs | 152 +++++++++++++++++
.../proto/tests/cases/roundtrip_logical_plan.rs | 20 +--
3 files changed, 282 insertions(+), 71 deletions(-)
diff --git a/datafusion/common/src/scalar.rs
b/datafusion/common/src/scalar/mod.rs
similarity index 98%
rename from datafusion/common/src/scalar.rs
rename to datafusion/common/src/scalar/mod.rs
index 2395f8acc4..29107ab10e 100644
--- a/datafusion/common/src/scalar.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -15,7 +15,9 @@
// specific language governing permissions and limitations
// under the License.
-//! This module provides ScalarValue, an enum that can be used for storage of
single elements
+//! [`ScalarValue`]: stores single values
+
+mod struct_builder;
use std::borrow::Borrow;
use std::cmp::Ordering;
@@ -43,24 +45,45 @@ use arrow::{
compute::kernels::cast::{cast_with_options, CastOptions},
datatypes::{
i256, ArrowDictionaryKeyType, ArrowNativeType, ArrowTimestampType,
DataType,
- Field, Fields, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type,
+ Field, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type,
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit,
- IntervalYearMonthType, SchemaBuilder, TimeUnit,
TimestampMicrosecondType,
+ IntervalYearMonthType, TimeUnit, TimestampMicrosecondType,
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
DECIMAL128_MAX_PRECISION,
},
};
use arrow_array::cast::as_list_array;
use arrow_array::{ArrowNativeTypeOp, Scalar};
-use arrow_buffer::NullBuffer;
-/// A dynamically typed, nullable single value, (the single-valued counter-part
-/// to arrow's [`Array`])
+pub use struct_builder::ScalarStructBuilder;
+
+/// A dynamically typed, nullable single value.
+///
+/// While an arrow [`Array`]) stores one or more values of the same type, in a
+/// single column, a `ScalarValue` stores a single value of a single type, the
+/// equivalent of 1 row and one column.
+///
+/// ```text
+/// ┌────────┐
+/// │ value1 │
+/// │ value2 │ ┌────────┐
+/// │ value3 │ │ value2 │
+/// │ ... │ └────────┘
+/// │ valueN │
+/// └────────┘
+///
+/// Array ScalarValue
+///
+/// stores multiple, stores a single,
+/// possibly null, values of possible null, value
+/// the same type
+/// ```
///
/// # Performance
///
-/// In general, please use arrow [`Array`]s rather than [`ScalarValue`]
whenever
-/// possible, as it is far more efficient for multiple values.
+/// In general, performance will be better using arrow [`Array`]s rather than
+/// [`ScalarValue`], as it is far more efficient to process multiple values at
+/// once (vecctorized processing).
///
/// # Example
/// ```
@@ -99,6 +122,66 @@ use arrow_buffer::NullBuffer;
/// # }
/// ```
///
+/// # Nested Types
+///
+/// `List` / `LargeList` / `FixedSizeList` / `Struct` are represented as a
+/// single element array of the corresponding type.
+///
+/// ## Example: Creating [`ScalarValue::Struct`] using [`ScalarStructBuilder`]
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow::datatypes::{DataType, Field};
+/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder};
+/// // Build a struct like: {a: 1, b: "foo"}
+/// let field_a = Field::new("a", DataType::Int32, false);
+/// let field_b = Field::new("b", DataType::Utf8, false);
+///
+/// let s1 = ScalarStructBuilder::new()
+/// .with_scalar(field_a, ScalarValue::from(1i32))
+/// .with_scalar(field_b, ScalarValue::from("foo"))
+/// .build();
+/// ```
+///
+/// ## Example: Creating a null [`ScalarValue::Struct`] using
[`ScalarStructBuilder`]
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow::datatypes::{DataType, Field};
+/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder};
+/// // Build a struct representing a NULL value
+/// let fields = vec![
+/// Field::new("a", DataType::Int32, false),
+/// Field::new("b", DataType::Utf8, false),
+/// ];
+///
+/// let s1 = ScalarStructBuilder::new_null(fields);
+/// ```
+///
+/// ## Example: Creating [`ScalarValue::Struct`] directly
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow::datatypes::{DataType, Field, Fields};
+/// # use arrow_array::{ArrayRef, Int32Array, StructArray, StringArray};
+/// # use datafusion_common::ScalarValue;
+/// // Build a struct like: {a: 1, b: "foo"}
+/// // Field description
+/// let fields = Fields::from(vec![
+/// Field::new("a", DataType::Int32, false),
+/// Field::new("b", DataType::Utf8, false),
+/// ]);
+/// // one row arrays for each field
+/// let arrays: Vec<ArrayRef> = vec![
+/// Arc::new(Int32Array::from(vec![1])),
+/// Arc::new(StringArray::from(vec!["foo"])),
+/// ];
+/// // no nulls for this array
+/// let nulls = None;
+/// let arr = StructArray::new(fields, arrays, nulls);
+///
+/// // Create a ScalarValue::Struct directly
+/// let s1 = ScalarValue::Struct(Arc::new(arr));
+/// ```
+///
+///
/// # Further Reading
/// See [datatypes](https://arrow.apache.org/docs/python/api/datatypes.html)
for
/// details on datatypes and the
[format](https://github.com/apache/arrow/blob/master/format/Schema.fbs#L354-L375)
@@ -153,7 +236,8 @@ pub enum ScalarValue {
List(Arc<ListArray>),
/// The array must be a LargeListArray with length 1.
LargeList(Arc<LargeListArray>),
- /// Represents a single element of a [`StructArray`] as an [`ArrayRef`]
+ /// Represents a single element [`StructArray`] as an [`ArrayRef`]. See
+ /// [`ScalarValue`] for examples of how to create instances of this type.
Struct(Arc<StructArray>),
/// Date stored as a signed 32bit int days since UNIX epoch 1970-01-01
Date32(Option<i32>),
@@ -2679,20 +2763,13 @@ impl From<Option<&str>> for ScalarValue {
/// Wrapper to create ScalarValue::Struct for convenience
impl From<Vec<(&str, ScalarValue)>> for ScalarValue {
fn from(value: Vec<(&str, ScalarValue)>) -> Self {
- let (fields, scalars): (SchemaBuilder, Vec<_>) = value
- .into_iter()
- .map(|(name, scalar)| (Field::new(name, scalar.data_type(),
false), scalar))
- .unzip();
-
- let arrays = scalars
+ value
.into_iter()
- .map(|scalar| scalar.to_array().unwrap())
- .collect::<Vec<ArrayRef>>();
-
- let fields = fields.finish().fields;
- let struct_array = StructArray::try_new(fields, arrays, None).unwrap();
-
- Self::Struct(Arc::new(struct_array))
+ .fold(ScalarStructBuilder::new(), |builder, (name, value)| {
+ builder.with_name_and_scalar(name, value)
+ })
+ .build()
+ .unwrap()
}
}
@@ -2710,27 +2787,6 @@ impl From<String> for ScalarValue {
}
}
-// TODO: Remove this after changing to Scalar<T>
-// Wrapper for ScalarValue::Struct that checks the length of the arrays,
without nulls
-impl From<(Fields, Vec<ArrayRef>)> for ScalarValue {
- fn from((fields, arrays): (Fields, Vec<ArrayRef>)) -> Self {
- Self::from((fields, arrays, None))
- }
-}
-
-// TODO: Remove this after changing to Scalar<T>
-// Wrapper for ScalarValue::Struct that checks the length of the arrays
-impl From<(Fields, Vec<ArrayRef>, Option<NullBuffer>)> for ScalarValue {
- fn from(
- (fields, arrays, nulls): (Fields, Vec<ArrayRef>, Option<NullBuffer>),
- ) -> Self {
- for arr in arrays.iter() {
- assert_eq!(arr.len(), 1);
- }
- Self::Struct(Arc::new(StructArray::new(fields, arrays, nulls)))
- }
-}
-
macro_rules! impl_try_from {
($SCALAR:ident, $NATIVE:ident) => {
impl TryFrom<ScalarValue> for $NATIVE {
@@ -3247,6 +3303,7 @@ mod tests {
use arrow::datatypes::{ArrowNumericType, ArrowPrimitiveType};
use arrow::util::pretty::pretty_format_columns;
use arrow_buffer::Buffer;
+ use arrow_schema::Fields;
use chrono::NaiveDate;
use rand::Rng;
@@ -3266,31 +3323,33 @@ mod tests {
),
]);
- let arrays = vec![boolean as ArrayRef, int as ArrayRef];
- let fields = Fields::from(vec![
- Field::new("b", DataType::Boolean, false),
- Field::new("c", DataType::Int32, false),
- ]);
- let sv = ScalarValue::from((fields, arrays));
+ let sv = ScalarStructBuilder::new()
+ .with_array(Field::new("b", DataType::Boolean, false), boolean)
+ .with_array(Field::new("c", DataType::Int32, false), int)
+ .build()
+ .unwrap();
+
let struct_arr = sv.to_array().unwrap();
let actual = as_struct_array(&struct_arr).unwrap();
assert_eq!(actual, &expected);
}
#[test]
- #[should_panic(expected = "assertion `left == right` failed")]
+ #[should_panic(
+ expected = "Error building ScalarValue::Struct. Expected array with
exactly one element, found array with 4 elements"
+ )]
fn test_scalar_value_from_for_struct_should_panic() {
- let fields = Fields::from(vec![
- Field::new("bool", DataType::Boolean, false),
- Field::new("i32", DataType::Int32, false),
- ]);
-
- let arrays = vec![
- Arc::new(BooleanArray::from(vec![false, true, false, false])) as
ArrayRef,
- Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
- ];
-
- let _ = ScalarValue::from((fields, arrays));
+ let _ = ScalarStructBuilder::new()
+ .with_array(
+ Field::new("bool", DataType::Boolean, false),
+ Arc::new(BooleanArray::from(vec![false, true, false, false])),
+ )
+ .with_array(
+ Field::new("i32", DataType::Int32, false),
+ Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
+ )
+ .build()
+ .unwrap();
}
#[test]
diff --git a/datafusion/common/src/scalar/struct_builder.rs
b/datafusion/common/src/scalar/struct_builder.rs
new file mode 100644
index 0000000000..926e100417
--- /dev/null
+++ b/datafusion/common/src/scalar/struct_builder.rs
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`ScalarStructBuilder`] for building [`ScalarValue::Struct`]
+
+use crate::error::_internal_err;
+use crate::{DataFusionError, Result, ScalarValue};
+use arrow::array::{ArrayRef, StructArray};
+use arrow::datatypes::{DataType, FieldRef, Fields};
+use arrow_schema::Field;
+use std::sync::Arc;
+
+/// Builder for [`ScalarValue::Struct`].
+///
+/// See examples on [`ScalarValue`]
+#[derive(Debug, Default)]
+pub struct ScalarStructBuilder {
+ fields: Vec<FieldRef>,
+ arrays: Vec<ArrayRef>,
+}
+
+impl ScalarStructBuilder {
+ /// Create a new `ScalarStructBuilder`
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Return a new [`ScalarValue::Struct`] with the specified fields and a
+ /// single null value
+ pub fn new_null(fields: impl IntoFields) -> ScalarValue {
+ DataType::Struct(fields.into()).try_into().unwrap()
+ }
+
+ /// Add the specified field and [`ArrayRef`] to the struct.
+ ///
+ /// Note the array should have a single row.
+ pub fn with_array(mut self, field: impl IntoFieldRef, value: ArrayRef) ->
Self {
+ self.fields.push(field.into_field_ref());
+ self.arrays.push(value);
+ self
+ }
+
+ /// Add the specified field and `ScalarValue` to the struct.
+ pub fn with_scalar(self, field: impl IntoFieldRef, value: ScalarValue) ->
Self {
+ // valid scalar value should not fail
+ let array = value.to_array().unwrap();
+ self.with_array(field, array)
+ }
+
+ /// Add a field with the specified name and value to the struct.
+ /// the field is created with the specified data type and as non nullable
+ pub fn with_name_and_scalar(self, name: &str, value: ScalarValue) -> Self {
+ let field = Field::new(name, value.data_type(), false);
+ self.with_scalar(field, value)
+ }
+
+ /// Return a [`ScalarValue::Struct`] with the fields and values added so
far
+ ///
+ /// # Errors
+ ///
+ /// If the [`StructArray`] cannot be created (for example if there is a
+ /// mismatch between field types and arrays) or the arrays do not have
+ /// exactly one element.
+ pub fn build(self) -> Result<ScalarValue> {
+ let Self { fields, arrays } = self;
+
+ for array in &arrays {
+ if array.len() != 1 {
+ return _internal_err!(
+ "Error building ScalarValue::Struct. \
+ Expected array with exactly one element, found array with {}
elements",
+ array.len()
+ );
+ }
+ }
+
+ let struct_array = StructArray::try_new(Fields::from(fields), arrays,
None)?;
+ Ok(ScalarValue::Struct(Arc::new(struct_array)))
+ }
+}
+
+/// Trait for converting a type into a [`FieldRef`]
+///
+/// Used to avoid having to call `clone()` on a `FieldRef` when adding a field
to
+/// a `ScalarStructBuilder`.
+///
+/// TODO potentially upstream this to arrow-rs so that we can
+/// use impl `Into<FieldRef>` instead
+pub trait IntoFieldRef {
+ fn into_field_ref(self) -> FieldRef;
+}
+
+impl IntoFieldRef for FieldRef {
+ fn into_field_ref(self) -> FieldRef {
+ self
+ }
+}
+
+impl IntoFieldRef for &FieldRef {
+ fn into_field_ref(self) -> FieldRef {
+ self.clone()
+ }
+}
+
+impl IntoFieldRef for Field {
+ fn into_field_ref(self) -> FieldRef {
+ FieldRef::new(self)
+ }
+}
+
+/// Trait for converting a type into a [`Fields`]
+///
+/// This avoids to avoid having to call clone() on an Arc'd `Fields` when
adding
+/// a field to a `ScalarStructBuilder`
+///
+/// TODO potentially upstream this to arrow-rs so that we can
+/// use impl `Into<Fields>` instead
+pub trait IntoFields {
+ fn into(self) -> Fields;
+}
+
+impl IntoFields for Fields {
+ fn into(self) -> Fields {
+ self
+ }
+}
+
+impl IntoFields for &Fields {
+ fn into(self) -> Fields {
+ self.clone()
+ }
+}
+
+impl IntoFields for Vec<Field> {
+ fn into(self) -> Fields {
+ Fields::from(self)
+ }
+}
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index b6d288da2c..68a318b5a6 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -21,7 +21,6 @@ use std::fmt::{self, Debug, Formatter};
use std::sync::Arc;
use arrow::array::{ArrayRef, FixedSizeListArray};
-use arrow::array::{BooleanArray, Int32Array};
use arrow::csv::WriterBuilder;
use arrow::datatypes::{
DataType, Field, Fields, Int32Type, IntervalDayTimeType,
IntervalMonthDayNanoType,
@@ -42,6 +41,7 @@ use
datafusion_common::file_options::csv_writer::CsvWriterOptions;
use datafusion_common::file_options::parquet_writer::ParquetWriterOptions;
use datafusion_common::file_options::StatementOptions;
use datafusion_common::parsers::CompressionTypeVariant;
+use datafusion_common::scalar::ScalarStructBuilder;
use datafusion_common::{internal_err, not_impl_err, plan_err,
FileTypeWriterOptions};
use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError,
ScalarValue};
use datafusion_common::{FileType, Result};
@@ -932,17 +932,17 @@ fn round_trip_scalar_values() {
ScalarValue::Binary(None),
ScalarValue::LargeBinary(Some(b"bar".to_vec())),
ScalarValue::LargeBinary(None),
- ScalarValue::from((
- vec![
+ ScalarStructBuilder::new()
+ .with_scalar(
Field::new("a", DataType::Int32, true),
+ ScalarValue::from(23i32),
+ )
+ .with_scalar(
Field::new("b", DataType::Boolean, false),
- ]
- .into(),
- vec![
- Arc::new(Int32Array::from(vec![Some(23)])) as ArrayRef,
- Arc::new(BooleanArray::from(vec![Some(false)])) as ArrayRef,
- ],
- )),
+ ScalarValue::from(false),
+ )
+ .build()
+ .unwrap(),
ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int32, true),
Field::new("b", DataType::Boolean, false),