adriangb commented on code in PR #22239:
URL: https://github.com/apache/datafusion/pull/22239#discussion_r3293847812


##########
datafusion/functions/src/core/getfield.rs:
##########
@@ -249,6 +252,96 @@ fn extract_single_field(base: ColumnarValue, name: 
ScalarValue) -> Result<Column
     }
 }
 
+/// Try to simplify a `get_field` call whose base is an inline struct
+/// constructor by resolving the field access at plan time.
+///
+/// Handles both struct constructors:
+/// * `named_struct('a', x, 'b', y)` — fields are looked up by name.
+/// * `struct(x, y)` — fields are positional and named `c0`, `c1`, ...
+///
+/// For example:
+/// * `get_field(named_struct('min', a, 'max', b), 'max')` => `b`
+/// * `get_field(struct(a, b), 'c1')` => `b`
+///
+/// `args` is the (already flattened) argument list of the `get_field` call:
+/// `[base, field_name, rest_of_path...]`. When extra path elements remain
+/// after resolving the first one (`get_field(named_struct('s', inner), 's', 
'k')`),
+/// the resolved value is re-wrapped in a `get_field` call for the remaining
+/// path so the simplifier can recurse into it on the next pass.
+///
+/// Returns `None` — leaving the expression untouched — whenever the rewrite
+/// cannot be proven safe, e.g. a non-literal field name, a `named_struct`
+/// with a non-literal field name (which might shadow the requested field at
+/// runtime), or a field the constructor does not produce.
+fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option<Expr> {
+    let [base, field_name, rest @ ..] = args else {
+        return None;
+    };
+
+    // The accessed field name must be a non-empty string literal.
+    let Expr::Literal(field_name, _) = field_name else {
+        return None;
+    };
+    let field_name = field_name
+        .try_as_str()
+        .flatten()
+        .filter(|s| !s.is_empty())?;
+
+    let Expr::ScalarFunction(ScalarFunction {
+        func,
+        args: ctor_args,
+    }) = base
+    else {
+        return None;
+    };
+
+    let value = if func.inner().is::<NamedStructFunc>() {
+        // named_struct(name1, value1, name2, value2, ...)
+        if !ctor_args.len().is_multiple_of(2) {
+            return None;
+        }
+        let mut matched = None;
+        for pair in ctor_args.chunks_exact(2) {
+            // Every name must be a literal string: a non-literal name could
+            // evaluate to `field_name` at runtime and shadow a later match,
+            // so we cannot safely pick a field unless all names are known.
+            let Expr::Literal(name, _) = &pair[0] else {
+                return None;
+            };
+            let name = name.try_as_str().flatten()?;
+            // `column_by_name` resolves to the first match, so do the same.
+            if matched.is_none() && name == field_name {
+                matched = Some(&pair[1]);
+            }
+        }
+        matched?.clone()
+    } else if func.inner().is::<StructFunc>() {
+        // struct(value0, value1, ...) produces fields named c0, c1, ...
+        let index: usize = field_name.strip_prefix('c')?.parse().ok()?;
+        // Reject non-canonical spellings (e.g. "c01") that name no real field.
+        if format!("c{index}") != field_name {
+            return None;
+        }
+        ctor_args.get(index)?.clone()

Review Comment:
   Done — expanded the comment to call out that bailing on a non-literal field 
name is a deliberate conservative approximation rather than an accidental 
missed optimization.
   
   To be precise about the rationale: a non-literal name appearing *after* the 
first literal match can't change the result, since `column_by_name` returns the 
first match. The bail is only strictly *required* when the non-literal name 
appears *before* the first match (it could shadow it at runtime); applying it 
unconditionally is the conservative-for-simplicity choice. Reworded the comment 
accordingly in f799333b0f (force-pushed; new branch tip b2e0978382).



##########
datafusion/functions/src/core/getfield.rs:
##########
@@ -249,6 +252,96 @@ fn extract_single_field(base: ColumnarValue, name: 
ScalarValue) -> Result<Column
     }
 }
 
+/// Try to simplify a `get_field` call whose base is an inline struct
+/// constructor by resolving the field access at plan time.
+///
+/// Handles both struct constructors:
+/// * `named_struct('a', x, 'b', y)` — fields are looked up by name.
+/// * `struct(x, y)` — fields are positional and named `c0`, `c1`, ...
+///
+/// For example:
+/// * `get_field(named_struct('min', a, 'max', b), 'max')` => `b`
+/// * `get_field(struct(a, b), 'c1')` => `b`
+///
+/// `args` is the (already flattened) argument list of the `get_field` call:
+/// `[base, field_name, rest_of_path...]`. When extra path elements remain
+/// after resolving the first one (`get_field(named_struct('s', inner), 's', 
'k')`),
+/// the resolved value is re-wrapped in a `get_field` call for the remaining
+/// path so the simplifier can recurse into it on the next pass.
+///
+/// Returns `None` — leaving the expression untouched — whenever the rewrite
+/// cannot be proven safe, e.g. a non-literal field name, a `named_struct`
+/// with a non-literal field name (which might shadow the requested field at
+/// runtime), or a field the constructor does not produce.
+fn simplify_get_field_over_struct_constructor(args: &[Expr]) -> Option<Expr> {
+    let [base, field_name, rest @ ..] = args else {
+        return None;
+    };
+
+    // The accessed field name must be a non-empty string literal.
+    let Expr::Literal(field_name, _) = field_name else {
+        return None;
+    };
+    let field_name = field_name
+        .try_as_str()
+        .flatten()
+        .filter(|s| !s.is_empty())?;
+
+    let Expr::ScalarFunction(ScalarFunction {
+        func,
+        args: ctor_args,
+    }) = base
+    else {
+        return None;
+    };
+
+    let value = if func.inner().is::<NamedStructFunc>() {
+        // named_struct(name1, value1, name2, value2, ...)
+        if !ctor_args.len().is_multiple_of(2) {
+            return None;
+        }
+        let mut matched = None;
+        for pair in ctor_args.chunks_exact(2) {
+            // Every name must be a literal string: a non-literal name could
+            // evaluate to `field_name` at runtime and shadow a later match,
+            // so we cannot safely pick a field unless all names are known.
+            let Expr::Literal(name, _) = &pair[0] else {
+                return None;
+            };
+            let name = name.try_as_str().flatten()?;
+            // `column_by_name` resolves to the first match, so do the same.
+            if matched.is_none() && name == field_name {
+                matched = Some(&pair[1]);
+            }
+        }
+        matched?.clone()
+    } else if func.inner().is::<StructFunc>() {
+        // struct(value0, value1, ...) produces fields named c0, c1, ...
+        let index: usize = field_name.strip_prefix('c')?.parse().ok()?;
+        // Reject non-canonical spellings (e.g. "c01") that name no real field.
+        if format!("c{index}") != field_name {
+            return None;
+        }
+        ctor_args.get(index)?.clone()

Review Comment:
   Correction: my first reply (and the original wording in 9276b31f8f) had the 
rationale backwards. A *later* dynamic name cannot pick a different value, 
because `column_by_name` returns the first match and a non-literal name after 
an already-found match can never precede it — so bailing there is a deliberate 
conservative approximation we accept for simplicity, *not* a correctness 
requirement. The bail is only strictly required when the non-literal name 
appears *before* the first match. Reworded the comment accordingly in 
f799333b0f (force-pushed; new branch tip b2e0978382).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to