This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 41b6a58561 Preserve field metadata across expressions in logical plans
(#6920)
41b6a58561 is described below
commit 41b6a58561c9da5bb1ca988ae82eb8708aadd489
Author: Dexter Duckworth <[email protected]>
AuthorDate: Thu Jul 13 16:28:04 2023 -0400
Preserve field metadata across expressions in logical plans (#6920)
* Added metadata handling to ExprSchemable.
- Column and Alias exprs now forward field metadata.
- All other expressions return empty metadata.
* Fixed bug in ExprSchema metadata method.
* Added test case for expr metadata.
---
datafusion/common/src/dfschema.rs | 22 +++++++++++++
datafusion/expr/src/expr_schema.rs | 63 ++++++++++++++++++++++++++++++++++++--
2 files changed, 83 insertions(+), 2 deletions(-)
diff --git a/datafusion/common/src/dfschema.rs
b/datafusion/common/src/dfschema.rs
index cb07f15b9d..8d9736eb64 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -581,6 +581,9 @@ pub trait ExprSchema: std::fmt::Debug {
/// What is the datatype of this column?
fn data_type(&self, col: &Column) -> Result<&DataType>;
+
+ /// Returns the column's optional metadata.
+ fn metadata(&self, col: &Column) -> Result<&HashMap<String, String>>;
}
// Implement `ExprSchema` for `Arc<DFSchema>`
@@ -592,6 +595,10 @@ impl<P: AsRef<DFSchema> + std::fmt::Debug> ExprSchema for
P {
fn data_type(&self, col: &Column) -> Result<&DataType> {
self.as_ref().data_type(col)
}
+
+ fn metadata(&self, col: &Column) -> Result<&HashMap<String, String>> {
+ ExprSchema::metadata(self.as_ref(), col)
+ }
}
impl ExprSchema for DFSchema {
@@ -602,6 +609,10 @@ impl ExprSchema for DFSchema {
fn data_type(&self, col: &Column) -> Result<&DataType> {
Ok(self.field_from_column(col)?.data_type())
}
+
+ fn metadata(&self, col: &Column) -> Result<&HashMap<String, String>> {
+ Ok(self.field_from_column(col)?.metadata())
+ }
}
/// DFField wraps an Arrow field and adds an optional qualifier
@@ -661,6 +672,10 @@ impl DFField {
self.field.is_nullable()
}
+ pub fn metadata(&self) -> &HashMap<String, String> {
+ self.field.metadata()
+ }
+
/// Returns a string to the `DFField`'s qualified name
pub fn qualified_name(&self) -> String {
if let Some(qualifier) = &self.qualifier {
@@ -708,6 +723,13 @@ impl DFField {
self.field = f.into();
self
}
+
+ /// Return field with new metadata
+ pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
+ let f = self.field().as_ref().clone().with_metadata(metadata);
+ self.field = f.into();
+ self
+ }
}
impl From<FieldRef> for DFField {
diff --git a/datafusion/expr/src/expr_schema.rs
b/datafusion/expr/src/expr_schema.rs
index 76f37e4d6c..2ef2951d49 100644
--- a/datafusion/expr/src/expr_schema.rs
+++ b/datafusion/expr/src/expr_schema.rs
@@ -26,6 +26,7 @@ use crate::{LogicalPlan, Projection, Subquery};
use arrow::compute::can_cast_types;
use arrow::datatypes::DataType;
use datafusion_common::{Column, DFField, DFSchema, DataFusionError,
ExprSchema, Result};
+use std::collections::HashMap;
use std::sync::Arc;
/// trait to allow expr to typable with respect to a schema
@@ -36,6 +37,9 @@ pub trait ExprSchemable {
/// given a schema, return the nullability of the expr
fn nullable<S: ExprSchema>(&self, input_schema: &S) -> Result<bool>;
+ /// given a schema, return the expr's optional metadata
+ fn metadata<S: ExprSchema>(&self, schema: &S) -> Result<HashMap<String,
String>>;
+
/// convert to a field with respect to a schema
fn to_field(&self, input_schema: &DFSchema) -> Result<DFField>;
@@ -275,6 +279,14 @@ impl ExprSchemable for Expr {
}
}
+ fn metadata<S: ExprSchema>(&self, schema: &S) -> Result<HashMap<String,
String>> {
+ match self {
+ Expr::Column(c) => Ok(schema.metadata(c)?.clone()),
+ Expr::Alias(Alias { expr, .. }) => expr.metadata(schema),
+ _ => Ok(HashMap::new()),
+ }
+ }
+
/// Returns a [arrow::datatypes::Field] compatible with this expression.
///
/// So for example, a projected expression `col(c1) + col(c2)` is
@@ -286,12 +298,14 @@ impl ExprSchemable for Expr {
&c.name,
self.get_type(input_schema)?,
self.nullable(input_schema)?,
- )),
+ )
+ .with_metadata(self.metadata(input_schema)?)),
_ => Ok(DFField::new_unqualified(
&self.display_name()?,
self.get_type(input_schema)?,
self.nullable(input_schema)?,
- )),
+ )
+ .with_metadata(self.metadata(input_schema)?)),
}
}
@@ -465,11 +479,46 @@ mod tests {
);
}
+ #[test]
+ fn test_expr_metadata() {
+ let mut meta = HashMap::new();
+ meta.insert("bar".to_string(), "buzz".to_string());
+ let expr = col("foo");
+ let schema = MockExprSchema::new()
+ .with_data_type(DataType::Int32)
+ .with_metadata(meta.clone());
+
+ // col and alias should be metadata-preserving
+ assert_eq!(meta, expr.metadata(&schema).unwrap());
+ assert_eq!(meta, expr.clone().alias("bar").metadata(&schema).unwrap());
+
+ // cast should drop input metadata since the type has changed
+ assert_eq!(
+ HashMap::new(),
+ expr.clone()
+ .cast_to(&DataType::Int64, &schema)
+ .unwrap()
+ .metadata(&schema)
+ .unwrap()
+ );
+
+ let schema = DFSchema::new_with_metadata(
+ vec![DFField::new_unqualified("foo", DataType::Int32, true)
+ .with_metadata(meta.clone())],
+ HashMap::new(),
+ )
+ .unwrap();
+
+ // verify to_field method populates metadata
+ assert_eq!(&meta, expr.to_field(&schema).unwrap().metadata());
+ }
+
#[derive(Debug)]
struct MockExprSchema {
nullable: bool,
data_type: DataType,
error_on_nullable: bool,
+ metadata: HashMap<String, String>,
}
impl MockExprSchema {
@@ -478,6 +527,7 @@ mod tests {
nullable: false,
data_type: DataType::Null,
error_on_nullable: false,
+ metadata: HashMap::new(),
}
}
@@ -495,6 +545,11 @@ mod tests {
self.error_on_nullable = error_on_nullable;
self
}
+
+ fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
+ self.metadata = metadata;
+ self
+ }
}
impl ExprSchema for MockExprSchema {
@@ -509,5 +564,9 @@ mod tests {
fn data_type(&self, _col: &Column) -> Result<&DataType> {
Ok(&self.data_type)
}
+
+ fn metadata(&self, _col: &Column) -> Result<&HashMap<String, String>> {
+ Ok(&self.metadata)
+ }
}
}