alamb commented on code in PR #17986:
URL: https://github.com/apache/datafusion/pull/17986#discussion_r2453147872
##########
datafusion/common/src/param_value.rs:
##########
@@ -16,22 +16,37 @@
// under the License.
use crate::error::{_plan_datafusion_err, _plan_err};
+use crate::metadata::{check_metadata_with_storage_equal, LiteralValue};
use crate::{Result, ScalarValue};
-use arrow::datatypes::DataType;
+use arrow::datatypes::{DataType, Field, FieldRef};
use std::collections::HashMap;
/// The parameter value corresponding to the placeholder
#[derive(Debug, Clone)]
pub enum ParamValues {
/// For positional query parameters, like `SELECT * FROM test WHERE a > $1 AND b = $2`
- List(Vec<ScalarValue>),
+ List(Vec<LiteralValue>),
/// For named query parameters, like `SELECT * FROM test WHERE a > $foo AND b = $goo`
- Map(HashMap<String, ScalarValue>),
+ Map(HashMap<String, LiteralValue>),
}
impl ParamValues {
- /// Verify parameter list length and type
+ /// Verify parameter list length and DataType
+ ///
+ /// Use [`ParamValues::verify_fields`] to ensure field metadata is
+ /// considered when computing type equality.
+ #[deprecated]
Review Comment:
Can we please add a note saying "use verify_fields" as part of the notice, as
described here:
https://datafusion.apache.org/contributor-guide/api-health.html#deprecation-guidelines
Something like
```suggestion
#[deprecated(since = "51.0.0", note = "Use verify_fields instead")]
```
##########
datafusion/expr/src/expr.rs:
##########
@@ -1370,13 +1142,21 @@ pub struct Placeholder {
/// The identifier of the parameter, including the leading `$` (e.g., `"$1"` or `"$foo"`)
pub id: String,
/// The type the parameter will be filled in with
- pub data_type: Option<DataType>,
+ pub field: Option<FieldRef>,
}
impl Placeholder {
/// Create a new Placeholder expression
pub fn new(id: String, data_type: Option<DataType>) -> Self {
- Self { id, data_type }
+ Self {
+ id,
+ field: data_type.map(|dt| Arc::new(Field::new("", dt, true))),
+ }
+ }
+
+ /// Create a new Placeholder expression from a Field
+ pub fn new_with_metadata(id: String, field: Option<FieldRef>) -> Self {
Review Comment:
I think this would be more consistent if it were named `new_with_field`
rather than `new_with_metadata`.
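For example (a sketch only -- the hunk is truncated at the signature, so the body is assumed):
```rust
/// Create a new Placeholder expression from a Field
pub fn new_with_field(id: String, field: Option<FieldRef>) -> Self {
    Self { id, field } // assumed body; mirrors the `new` constructor above
}
```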
##########
datafusion/sql/src/planner.rs:
##########
@@ -587,40 +589,70 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
})
}
- pub(crate) fn convert_data_type(&self, sql_type: &SQLDataType) -> Result<DataType> {
+ pub(crate) fn convert_data_type_to_field(
+ &self,
+ sql_type: &SQLDataType,
+ ) -> Result<FieldRef> {
// First check if any of the registered type_planner can handle this type
if let Some(type_planner) = self.context_provider.get_type_planner() {
if let Some(data_type) = type_planner.plan_type(sql_type)? {
- return Ok(data_type);
+ return Ok(Field::new("", data_type, true).into());
}
}
// If no type_planner can handle this type, use the default conversion
match sql_type {
SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) => {
// Arrays may be multi-dimensional.
- let inner_data_type = self.convert_data_type(inner_sql_type)?;
- Ok(DataType::new_list(inner_data_type, true))
+            let inner_data_type = self.convert_data_type_to_field(inner_sql_type)?;
+
+ // Lists are allowed to have an arbitrarily named field;
+ // however, a name other than 'item' will cause it to fail an
+ // == check against a more idiomatically created list in
+ // arrow-rs which causes issues. We use the low-level
+ // constructor here to preserve extension metadata from the
+ // child type.
+ Ok(Field::new(
+ "",
+ DataType::List(
+                    inner_data_type.as_ref().clone().with_name("item").into(),
Review Comment:
You can potentially avoid a copy using `Arc::unwrap_or_clone` here:
```suggestion
Arc::unwrap_or_clone(inner_data_type).with_name("item").into(),
```
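(`Arc::unwrap_or_clone` moves the inner `Field` out when the reference count is 1 and only clones when the `Arc` is shared, so the common case avoids the copy.)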
##########
datafusion/expr/src/expr.rs:
##########
@@ -1843,7 +1623,7 @@ impl Expr {
/// ```
/// # use datafusion_expr::col;
/// # use std::collections::HashMap;
- /// # use datafusion_expr::expr::FieldMetadata;
+ /// # use datafusion_common::metadata::FieldMetadata;
Review Comment:
one way we have tried to help users on upgrade is to leave a `pub use` when
we move the struct.
In this case, I think that means if you put this line in
`datafusion_expr::expr`:
```rust
pub use datafusion_common::metadata::FieldMetadata;
```
Existing code will still compile
##########
datafusion/expr/src/expr.rs:
##########
@@ -1370,13 +1142,21 @@ pub struct Placeholder {
/// The identifier of the parameter, including the leading `$` (e.g., `"$1"` or `"$foo"`)
pub id: String,
/// The type the parameter will be filled in with
- pub data_type: Option<DataType>,
+ pub field: Option<FieldRef>,
}
impl Placeholder {
/// Create a new Placeholder expression
pub fn new(id: String, data_type: Option<DataType>) -> Self {
Review Comment:
I think we should probably deprecate this one too, and direct users to the
one below.
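Something like (version and wording are just a sketch, mirroring the deprecation note suggested above):
```suggestion
    #[deprecated(since = "51.0.0", note = "Use Placeholder::new_with_field instead")]
    pub fn new(id: String, data_type: Option<DataType>) -> Self {
```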
##########
datafusion/proto/proto/datafusion.proto:
##########
@@ -413,6 +414,8 @@ message Wildcard {
message PlaceholderNode {
string id = 1;
datafusion_common.ArrowType data_type = 2;
Review Comment:
I recommend adding a comment here explaining that this doesn't serialize the
Field directly, to maintain backwards compatibility with older versions.
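Something like (wording is only a sketch):
```suggestion
  // The type is serialized as a bare ArrowType plus separate nullable and
  // metadata fields (rather than as a full Field) so that plans written by
  // older versions of DataFusion remain readable.
  datafusion_common.ArrowType data_type = 2;
```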
##########
datafusion/proto/src/logical_plan/from_proto.rs:
##########
@@ -628,12 +629,22 @@ pub fn parse_expr(
ExprType::Rollup(RollupNode { expr }) => Ok(Expr::GroupingSet(
GroupingSet::Rollup(parse_exprs(expr, registry, codec)?),
)),
- ExprType::Placeholder(PlaceholderNode { id, data_type }) => match data_type {
+ ExprType::Placeholder(PlaceholderNode {
+ id,
+ data_type,
+ nullable,
+ metadata,
+ }) => match data_type {
None => Ok(Expr::Placeholder(Placeholder::new(id.clone(), None))),
Review Comment:
This is a very clever way to keep the protobuf backwards compatible 👍
It won't faithfully preserve the field name, if there is one, however.
Perhaps we should also serialize the field name explicitly.
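For illustration, a hypothetical round trip (not the PR's actual code) showing the name being dropped:
```rust
use arrow::datatypes::{DataType, Field};

#[test]
fn placeholder_field_name_not_round_tripped() {
    // PlaceholderNode stores only the type, nullability, and metadata, so a
    // reconstructed field cannot recover the original name:
    let original = Field::new("price", DataType::Float64, true);
    let restored =
        Field::new("", original.data_type().clone(), original.is_nullable());
    assert_ne!(original.name(), restored.name()); // "price" vs ""
}
```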
##########
datafusion/expr/src/logical_plan/statement.rs:
##########
@@ -192,7 +201,7 @@ pub struct Prepare {
/// The name of the statement
pub name: String,
/// Data types of the parameters ([`Expr::Placeholder`])
- pub data_types: Vec<DataType>,
+ pub fields: Vec<FieldRef>,
Review Comment:
this is also a breaking API change (I don't have any suggestions about how
to avoid this or ways to make the upgrade easier)
##########
datafusion/sql/src/planner.rs:
##########
@@ -587,40 +589,70 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
})
}
- pub(crate) fn convert_data_type(&self, sql_type: &SQLDataType) -> Result<DataType> {
+ pub(crate) fn convert_data_type_to_field(
+ &self,
+ sql_type: &SQLDataType,
+ ) -> Result<FieldRef> {
// First check if any of the registered type_planner can handle this type
if let Some(type_planner) = self.context_provider.get_type_planner() {
if let Some(data_type) = type_planner.plan_type(sql_type)? {
- return Ok(data_type);
+ return Ok(Field::new("", data_type, true).into());
}
}
// If no type_planner can handle this type, use the default conversion
match sql_type {
SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) => {
// Arrays may be multi-dimensional.
- let inner_data_type = self.convert_data_type(inner_sql_type)?;
- Ok(DataType::new_list(inner_data_type, true))
+            let inner_data_type = self.convert_data_type_to_field(inner_sql_type)?;
Review Comment:
This confused me for a while -- I missed that `inner_data_type` is now a
`Field`.
Perhaps this would be easier to understand if the variable was named
`inner_field`.
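e.g. (later uses would need the same rename):
```suggestion
            let inner_field = self.convert_data_type_to_field(inner_sql_type)?;
```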
##########
datafusion/sql/src/planner.rs:
##########
@@ -587,40 +589,70 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
})
}
- pub(crate) fn convert_data_type(&self, sql_type: &SQLDataType) -> Result<DataType> {
+ pub(crate) fn convert_data_type_to_field(
+ &self,
+ sql_type: &SQLDataType,
+ ) -> Result<FieldRef> {
// First check if any of the registered type_planner can handle this type
if let Some(type_planner) = self.context_provider.get_type_planner() {
if let Some(data_type) = type_planner.plan_type(sql_type)? {
- return Ok(data_type);
+ return Ok(Field::new("", data_type, true).into());
}
}
// If no type_planner can handle this type, use the default conversion
match sql_type {
SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) => {
// Arrays may be multi-dimensional.
- let inner_data_type = self.convert_data_type(inner_sql_type)?;
- Ok(DataType::new_list(inner_data_type, true))
+            let inner_data_type = self.convert_data_type_to_field(inner_sql_type)?;
+
+ // Lists are allowed to have an arbitrarily named field;
+ // however, a name other than 'item' will cause it to fail an
+ // == check against a more idiomatically created list in
+ // arrow-rs which causes issues. We use the low-level
+ // constructor here to preserve extension metadata from the
+ // child type.
+ Ok(Field::new(
Review Comment:
It would be nice to start wrapping this pattern into helper methods /
extension traits.
I started playing around and was able to move a lot of the mechanics around
and simplify them:
- https://github.com/paleolimbot/datafusion/pull/1
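For example, a minimal sketch of the kind of helper that could absorb the list-wrapping above (names are illustrative, not taken from that branch):
```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, FieldRef};

/// Hypothetical extension trait for the "anonymous nullable field" pattern.
trait FieldExt {
    /// Rename the element to the idiomatic "item" (preserving its metadata)
    /// and wrap it in an anonymous, nullable List field.
    fn into_list_field(self) -> FieldRef;
}

impl FieldExt for FieldRef {
    fn into_list_field(self) -> FieldRef {
        let item = Arc::unwrap_or_clone(self).with_name("item");
        Arc::new(Field::new("", DataType::List(Arc::new(item)), true))
    }
}
```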
##########
datafusion/proto/src/logical_plan/mod.rs:
##########
@@ -888,9 +888,33 @@ impl AsLogicalPlan for LogicalPlanNode {
.iter()
.map(DataType::try_from)
.collect::<Result<_, _>>()?;
- LogicalPlanBuilder::from(input)
- .prepare(prepare.name.clone(), data_types)?
- .build()
+ let fields: Vec<Field> = prepare
+ .fields
+ .iter()
+ .map(Field::try_from)
+ .collect::<Result<_, _>>()?;
+
+ // If the fields are empty this may have been generated by an
Review Comment:
I don't think there is a precedent that I know of -- see
https://docs.rs/datafusion-proto/latest/datafusion_proto/#version-compatibility
It would be cool to have a way to test this, even if we don't guarantee it.
But given there is no precedent, I think we can file a follow-on ticket to
track this feature.
##########
datafusion/common/src/metadata.rs:
##########
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{collections::BTreeMap, sync::Arc};
+
+use arrow::datatypes::{DataType, Field};
+use hashbrown::HashMap;
+
+use crate::{error::_plan_err, DataFusionError, ScalarValue};
+
+/// A [`ScalarValue`] with optional [`FieldMetadata`]
+#[derive(Debug, Clone)]
+pub struct Literal {
+ pub value: ScalarValue,
+ pub metadata: Option<FieldMetadata>,
+}
+
+impl Literal {
+ /// Create a new Literal from a scalar value with optional [`FieldMetadata`]
+ pub fn new(value: ScalarValue, metadata: Option<FieldMetadata>) -> Self {
+ Self { value, metadata }
+ }
+
+ /// Access the underlying [ScalarValue] storage
+ pub fn value(&self) -> &ScalarValue {
+ &self.value
+ }
+
+ /// Access the [FieldMetadata] attached to this value, if any
+ pub fn metadata(&self) -> Option<&FieldMetadata> {
+ self.metadata.as_ref()
+ }
+
+ /// Consume self and return components
+ pub fn into_inner(self) -> (ScalarValue, Option<FieldMetadata>) {
+ (self.value, self.metadata)
+ }
+
+ /// Cast this literal's storage type
+ ///
+ /// This operation assumes that if the underlying [ScalarValue] can be cast
+ /// to a given type, any extension type represented by the metadata is also
+ /// valid.
+ pub fn cast_storage_to(
+ &self,
+ target_type: &DataType,
+ ) -> Result<Self, DataFusionError> {
+ let new_value = self.value().cast_to(target_type)?;
+ Ok(Literal::new(new_value, self.metadata.clone()))
+ }
+}
+
+/// Assert equality of data types where one or both sides may have field metadata
+///
+/// This currently compares absent metadata (e.g., one side was a DataType) and
+/// empty metadata (e.g., one side was a field where the field had no metadata)
+/// as equal and uses byte-for-byte comparison for the keys and values of the
+/// fields, even though this is potentially too strict for some cases (e.g.,
+/// extension types where extension metadata is represented by JSON, or cases
+/// where field metadata is orthogonal to the interpretation of the data type).
+///
+/// Returns a planning error with suitably formatted type representations if
+/// actual and expected are not equal.
+pub fn check_metadata_with_storage_equal(
+ actual: (
+ &DataType,
+ Option<&std::collections::HashMap<String, String>>,
+ ),
+ expected: (
+ &DataType,
+ Option<&std::collections::HashMap<String, String>>,
+ ),
+ what: &str,
+ context: &str,
+) -> Result<(), DataFusionError> {
+ if actual.0 != expected.0 {
+ return _plan_err!(
+ "Expected {what} of type {}, got {}{context}",
+ format_type_and_metadata(expected.0, expected.1),
+ format_type_and_metadata(actual.0, actual.1)
+ );
+ }
+
+ let metadata_equal = match (actual.1, expected.1) {
+ (None, None) => true,
+ (None, Some(expected_metadata)) => expected_metadata.is_empty(),
+ (Some(actual_metadata), None) => actual_metadata.is_empty(),
+ (Some(actual_metadata), Some(expected_metadata)) => {
+ actual_metadata == expected_metadata
+ }
+ };
+
+ if !metadata_equal {
+ return _plan_err!(
+ "Expected {what} of type {}, got {}{context}",
+ format_type_and_metadata(expected.0, expected.1),
+ format_type_and_metadata(actual.0, actual.1)
+ );
+ }
+
+ Ok(())
+}
+
+/// Given a data type represented by storage and optional metadata, generate
+/// a user-facing string
+///
+/// This function exists to reduce the number of Field debug strings that are
+/// used to communicate type information in error messages and plan explain
+/// renderings.
+pub fn format_type_and_metadata(
+ data_type: &DataType,
+ metadata: Option<&std::collections::HashMap<String, String>>,
+) -> String {
Review Comment:
I started a ticket to discuss what such an API might look like:
- https://github.com/apache/datafusion/issues/18223
##########
datafusion/expr/src/expr.rs:
##########
@@ -1370,13 +1142,21 @@ pub struct Placeholder {
/// The identifier of the parameter, including the leading `$` (e.g., `"$1"` or `"$foo"`)
pub id: String,
/// The type the parameter will be filled in with
- pub data_type: Option<DataType>,
+ pub field: Option<FieldRef>,
Review Comment:
I think this is a reasonable change, although to your point it will be a
breaking API change and may cause issues for downstream users.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]