adriangb commented on code in PR #22239:
URL: https://github.com/apache/datafusion/pull/22239#discussion_r3293847746
##########
datafusion/functions/src/core/getfield.rs:
##########
@@ -249,6 +252,96 @@ fn extract_single_field(base: ColumnarValue, name:
ScalarValue) -> Result<Column
}
}
+/// Try to simplify a `get_field` call whose base is an inline struct
+/// constructor by resolving the field access at plan time.
+///
+/// Handles both struct constructors:
+/// * `named_struct('a', x, 'b', y)` — fields are looked up by name.
+/// * `struct(x, y)` — fields are positional and named `c0`, `c1`, ...
+///
+/// For example:
+/// * `get_field(named_struct('min', a, 'max', b), 'max')` => `b`
+/// * `get_field(struct(a, b), 'c1')` => `b`
+///
+/// `args` is the (already flattened) argument list of the `get_field` call:
+/// `[base, field_name, rest_of_path...]`. When extra path elements remain
+/// after resolving the first one (`get_field(named_struct('s', inner), 's',
'k')`),
+/// the resolved value is re-wrapped in a `get_field` call for the remaining
+/// path so the simplifier can recurse into it on the next pass.
+///
+/// Returns `None` — leaving the expression untouched — whenever the rewrite
+/// cannot be proven safe, e.g. a non-literal field name, a `named_struct`
+/// with a non-literal field name (which might shadow the requested field at
+/// runtime), or a field the constructor does not produce.
+fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option<Expr> {
+ let [base, field_name, rest @ ..] = args else {
+ return None;
+ };
+
+ // The accessed field name must be a non-empty string literal.
+ let Expr::Literal(field_name, _) = field_name else {
+ return None;
+ };
+ let field_name = field_name
+ .try_as_str()
+ .flatten()
+ .filter(|s| !s.is_empty())?;
+
+ let Expr::ScalarFunction(ScalarFunction {
+ func,
+ args: ctor_args,
+ }) = base
+ else {
+ return None;
+ };
+
+ let value = if func.inner().is::<NamedStructFunc>() {
+ // named_struct(name1, value1, name2, value2, ...)
+ if !ctor_args.len().is_multiple_of(2) {
+ return None;
+ }
+ let mut matched = None;
+ for pair in ctor_args.chunks_exact(2) {
+ // Every name must be a literal string: a non-literal name could
+ // evaluate to `field_name` at runtime and shadow a later match,
+ // so we cannot safely pick a field unless all names are known.
+ let Expr::Literal(name, _) = &pair[0] else {
+ return None;
+ };
+ let name = name.try_as_str().flatten()?;
+ // `column_by_name` resolves to the first match, so do the same.
+ if matched.is_none() && name == field_name {
+ matched = Some(&pair[1]);
+ }
+ }
+ matched?.clone()
+ } else if func.inner().is::<StructFunc>() {
+ // struct(value0, value1, ...) produces fields named c0, c1, ...
+ let index: usize = field_name.strip_prefix('c')?.parse().ok()?;
+ // Reject non-canonical spellings (e.g. "c01") that name no real field.
+ if format!("c{index}") != field_name {
+ return None;
+ }
+ ctor_args.get(index)?.clone()
+ } else {
+ return None;
+ };
+
+ if rest.is_empty() {
+ return Some(value);
+ }
+
+ // Remaining path elements: re-wrap as get_field(value, rest...) and let
+ // the simplifier resolve the rest on a subsequent pass.
+ let mut new_args = Vec::with_capacity(rest.len() + 1);
+ new_args.push(value);
+ new_args.extend_from_slice(rest);
+ Some(Expr::ScalarFunction(ScalarFunction::new_udf(
+ Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new())),
+ new_args,
+ )))
+}
Review Comment:
Done in cd20ebbb60 — added a `get_field_udf()` helper backed by a `OnceLock`
and routed both construction sites (the re-wrap path here and the flatten path
in `simplify`) through it.
##########
datafusion/functions/src/core/getfield.rs:
##########
@@ -249,6 +252,96 @@ fn extract_single_field(base: ColumnarValue, name:
ScalarValue) -> Result<Column
}
}
+/// Try to simplify a `get_field` call whose base is an inline struct
+/// constructor by resolving the field access at plan time.
+///
+/// Handles both struct constructors:
+/// * `named_struct('a', x, 'b', y)` — fields are looked up by name.
+/// * `struct(x, y)` — fields are positional and named `c0`, `c1`, ...
+///
+/// For example:
+/// * `get_field(named_struct('min', a, 'max', b), 'max')` => `b`
+/// * `get_field(struct(a, b), 'c1')` => `b`
+///
+/// `args` is the (already flattened) argument list of the `get_field` call:
+/// `[base, field_name, rest_of_path...]`. When extra path elements remain
+/// after resolving the first one (`get_field(named_struct('s', inner), 's',
'k')`),
+/// the resolved value is re-wrapped in a `get_field` call for the remaining
+/// path so the simplifier can recurse into it on the next pass.
+///
+/// Returns `None` — leaving the expression untouched — whenever the rewrite
+/// cannot be proven safe, e.g. a non-literal field name, a `named_struct`
+/// with a non-literal field name (which might shadow the requested field at
+/// runtime), or a field the constructor does not produce.
+fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option<Expr> {
+ let [base, field_name, rest @ ..] = args else {
+ return None;
+ };
+
+ // The accessed field name must be a non-empty string literal.
+ let Expr::Literal(field_name, _) = field_name else {
+ return None;
+ };
+ let field_name = field_name
+ .try_as_str()
+ .flatten()
+ .filter(|s| !s.is_empty())?;
+
+ let Expr::ScalarFunction(ScalarFunction {
+ func,
+ args: ctor_args,
+ }) = base
+ else {
+ return None;
+ };
+
+ let value = if func.inner().is::<NamedStructFunc>() {
+ // named_struct(name1, value1, name2, value2, ...)
+ if !ctor_args.len().is_multiple_of(2) {
+ return None;
+ }
+ let mut matched = None;
+ for pair in ctor_args.chunks_exact(2) {
+ // Every name must be a literal string: a non-literal name could
+ // evaluate to `field_name` at runtime and shadow a later match,
+ // so we cannot safely pick a field unless all names are known.
+ let Expr::Literal(name, _) = &pair[0] else {
+ return None;
+ };
+ let name = name.try_as_str().flatten()?;
+ // `column_by_name` resolves to the first match, so do the same.
+ if matched.is_none() && name == field_name {
+ matched = Some(&pair[1]);
+ }
+ }
+ matched?.clone()
+ } else if func.inner().is::<StructFunc>() {
+ // struct(value0, value1, ...) produces fields named c0, c1, ...
+ let index: usize = field_name.strip_prefix('c')?.parse().ok()?;
+ // Reject non-canonical spellings (e.g. "c01") that name no real field.
+ if format!("c{index}") != field_name {
+ return None;
+ }
+ ctor_args.get(index)?.clone()
+ } else {
+ return None;
+ };
+
+ if rest.is_empty() {
+ return Some(value);
+ }
+
+ // Remaining path elements: re-wrap as get_field(value, rest...) and let
+ // the simplifier resolve the rest on a subsequent pass.
+ let mut new_args = Vec::with_capacity(rest.len() + 1);
+ new_args.push(value);
+ new_args.extend_from_slice(rest);
+ Some(Expr::ScalarFunction(ScalarFunction::new_udf(
+ Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new())),
+ new_args,
+ )))
+}
+
impl GetFieldFunc {
pub fn new() -> Self {
Self {
Review Comment:
Done in 4f70d80edc — added a paragraph to the function doc comment noting
that replacing the access drops the other fields' expressions so they are no
longer evaluated (e.g. `get_field(named_struct('a', 1/0, 'b', x), 'b')` =>
`x`), which matches the optimizer's immutable-expression contract.
##########
datafusion/functions/src/core/getfield.rs:
##########
@@ -249,6 +252,96 @@ fn extract_single_field(base: ColumnarValue, name:
ScalarValue) -> Result<Column
}
}
+/// Try to simplify a `get_field` call whose base is an inline struct
+/// constructor by resolving the field access at plan time.
+///
+/// Handles both struct constructors:
+/// * `named_struct('a', x, 'b', y)` — fields are looked up by name.
+/// * `struct(x, y)` — fields are positional and named `c0`, `c1`, ...
+///
+/// For example:
+/// * `get_field(named_struct('min', a, 'max', b), 'max')` => `b`
+/// * `get_field(struct(a, b), 'c1')` => `b`
+///
+/// `args` is the (already flattened) argument list of the `get_field` call:
+/// `[base, field_name, rest_of_path...]`. When extra path elements remain
+/// after resolving the first one (`get_field(named_struct('s', inner), 's',
'k')`),
+/// the resolved value is re-wrapped in a `get_field` call for the remaining
+/// path so the simplifier can recurse into it on the next pass.
+///
+/// Returns `None` — leaving the expression untouched — whenever the rewrite
+/// cannot be proven safe, e.g. a non-literal field name, a `named_struct`
+/// with a non-literal field name (which might shadow the requested field at
+/// runtime), or a field the constructor does not produce.
+fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option<Expr> {
+ let [base, field_name, rest @ ..] = args else {
+ return None;
+ };
+
+ // The accessed field name must be a non-empty string literal.
+ let Expr::Literal(field_name, _) = field_name else {
+ return None;
+ };
+ let field_name = field_name
+ .try_as_str()
+ .flatten()
+ .filter(|s| !s.is_empty())?;
+
+ let Expr::ScalarFunction(ScalarFunction {
+ func,
+ args: ctor_args,
+ }) = base
+ else {
+ return None;
+ };
+
+ let value = if func.inner().is::<NamedStructFunc>() {
+ // named_struct(name1, value1, name2, value2, ...)
+ if !ctor_args.len().is_multiple_of(2) {
+ return None;
+ }
+ let mut matched = None;
+ for pair in ctor_args.chunks_exact(2) {
+ // Every name must be a literal string: a non-literal name could
+ // evaluate to `field_name` at runtime and shadow a later match,
+ // so we cannot safely pick a field unless all names are known.
+ let Expr::Literal(name, _) = &pair[0] else {
+ return None;
+ };
+ let name = name.try_as_str().flatten()?;
+ // `column_by_name` resolves to the first match, so do the same.
+ if matched.is_none() && name == field_name {
+ matched = Some(&pair[1]);
+ }
+ }
+ matched?.clone()
+ } else if func.inner().is::<StructFunc>() {
+ // struct(value0, value1, ...) produces fields named c0, c1, ...
+ let index: usize = field_name.strip_prefix('c')?.parse().ok()?;
+ // Reject non-canonical spellings (e.g. "c01") that name no real field.
+ if format!("c{index}") != field_name {
+ return None;
+ }
+ ctor_args.get(index)?.clone()
Review Comment:
Done in 9276b31f8f — expanded the comment to call out that bailing on a
non-literal name is a deliberate conservative approximation, applied even when
a matching literal field was already seen earlier in the loop, since a later
dynamic name could still resolve to the same field and pick a different value
at runtime.
##########
datafusion/functions/src/core/getfield.rs:
##########
@@ -828,4 +924,181 @@ mod tests {
let args = vec![ExpressionPlacement::Literal,
ExpressionPlacement::Literal];
assert_eq!(func.placement(&args), ExpressionPlacement::KeepInPlace);
}
+
+ // --- get_field over struct constructor simplification
--------------------
+
+ use datafusion_common::Column;
+ use datafusion_expr::simplify::SimplifyContext;
+
+ /// A non-empty string literal expression.
+ fn lit_str(s: &str) -> Expr {
+ Expr::Literal(ScalarValue::Utf8(Some(s.to_string())), None)
+ }
+
+ /// A column reference expression.
+ fn col(name: &str) -> Expr {
+ Expr::Column(Column::from_name(name))
+ }
+
+ fn scalar_fn(udf: ScalarUDF, args: Vec<Expr>) -> Expr {
+ Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(udf), args))
+ }
+
+ /// `named_struct(name1, value1, name2, value2, ...)`.
+ fn named_struct(pairs: Vec<(&str, Expr)>) -> Expr {
+ let args = pairs
+ .into_iter()
+ .flat_map(|(name, value)| [lit_str(name), value])
+ .collect();
+ scalar_fn(ScalarUDF::new_from_impl(NamedStructFunc::new()), args)
+ }
+
+ /// `struct(value0, value1, ...)`.
+ fn struct_fn(values: Vec<Expr>) -> Expr {
+ scalar_fn(ScalarUDF::new_from_impl(StructFunc::new()), values)
+ }
+
+ /// `get_field(args...)`.
+ fn get_field(args: Vec<Expr>) -> Expr {
+ scalar_fn(ScalarUDF::new_from_impl(GetFieldFunc::new()), args)
+ }
+
+ /// Run `GetFieldFunc::simplify` once and return the rewritten expression,
+ /// panicking if the input was left unchanged.
+ fn simplified(args: Vec<Expr>) -> Expr {
+ match GetFieldFunc::new()
+ .simplify(args, &SimplifyContext::default())
+ .unwrap()
+ {
+ ExprSimplifyResult::Simplified(expr) => expr,
+ ExprSimplifyResult::Original(args) => {
+ panic!("expected the expression to be simplified, got
{args:?}")
+ }
+ }
+ }
+
+ /// Assert that `GetFieldFunc::simplify` leaves the arguments unchanged.
+ fn assert_not_simplified(args: Vec<Expr>) {
+ match GetFieldFunc::new()
+ .simplify(args.clone(), &SimplifyContext::default())
+ .unwrap()
+ {
+ ExprSimplifyResult::Original(unchanged) => assert_eq!(unchanged,
args),
+ ExprSimplifyResult::Simplified(expr) => {
+ panic!("expected no simplification, got {expr:?}")
+ }
+ }
+ }
+
+ #[test]
+ fn simplify_get_field_named_struct_returns_matching_value() {
+ // get_field(named_struct('min', a, 'max', b), 'max') => b
+ let expr = get_field(vec![
+ named_struct(vec![("min", col("a")), ("max", col("b"))]),
+ lit_str("max"),
+ ]);
+ let Expr::ScalarFunction(ScalarFunction { args, .. }) = expr else {
+ unreachable!()
+ };
+ assert_eq!(simplified(args), col("b"));
+ }
+
Review Comment:
Done in bd4e2782b9 — the test now builds `args` directly and passes it to
`simplified(args)`, matching the neighboring tests.
##########
datafusion/functions/src/core/getfield.rs:
##########
@@ -249,6 +252,96 @@ fn extract_single_field(base: ColumnarValue, name:
ScalarValue) -> Result<Column
}
}
+/// Try to simplify a `get_field` call whose base is an inline struct
+/// constructor by resolving the field access at plan time.
+///
+/// Handles both struct constructors:
+/// * `named_struct('a', x, 'b', y)` — fields are looked up by name.
+/// * `struct(x, y)` — fields are positional and named `c0`, `c1`, ...
+///
+/// For example:
+/// * `get_field(named_struct('min', a, 'max', b), 'max')` => `b`
+/// * `get_field(struct(a, b), 'c1')` => `b`
+///
+/// `args` is the (already flattened) argument list of the `get_field` call:
+/// `[base, field_name, rest_of_path...]`. When extra path elements remain
+/// after resolving the first one (`get_field(named_struct('s', inner), 's',
'k')`),
+/// the resolved value is re-wrapped in a `get_field` call for the remaining
+/// path so the simplifier can recurse into it on the next pass.
+///
+/// Returns `None` — leaving the expression untouched — whenever the rewrite
+/// cannot be proven safe, e.g. a non-literal field name, a `named_struct`
+/// with a non-literal field name (which might shadow the requested field at
+/// runtime), or a field the constructor does not produce.
+fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option<Expr> {
+ let [base, field_name, rest @ ..] = args else {
+ return None;
+ };
+
+ // The accessed field name must be a non-empty string literal.
+ let Expr::Literal(field_name, _) = field_name else {
+ return None;
+ };
+ let field_name = field_name
+ .try_as_str()
+ .flatten()
+ .filter(|s| !s.is_empty())?;
+
Review Comment:
Done in 52e7567b55 — added `simplify_get_field_null_field_name_left_alone`
covering the `ScalarValue::Utf8(None)` case, asserting the expression is left
unchanged.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]