Jefffrey commented on code in PR #21262:
URL: https://github.com/apache/datafusion/pull/21262#discussion_r3226121281
##########
datafusion/core/src/execution/session_state.rs:
##########
@@ -1054,6 +1054,37 @@ pub struct SessionStateBuilder {
physical_optimizer_rules: Option<Vec<Arc<dyn PhysicalOptimizerRule + Send
+ Sync>>>,
}
+fn collect_registry_values_by_canonical_name<T: ?Sized, F>(
+ entries: HashMap<String, Arc<T>>,
+ canonical_name: F,
+) -> Vec<Arc<T>>
+where
+ F: Fn(&Arc<T>) -> String,
+{
+ // First, collect canonical key mappings (if present), which represent the
+ // current authoritative binding for a canonical name.
+ let mut by_name = std::collections::BTreeMap::new();
Review Comment:
I think relying on a btreemap like this still has issues, because when we
iterate on it later we iterate based on the total key ordering, not based on
insertion time. That can lead to issues where based on the names of the UDFs
the roundtripped state is still incorrect. For example see this test case:
```rust
#[test]
fn test123() -> Result<()> {
let mut state = SessionStateBuilder::new().build();
let udf1 = datafusion_expr::create_udf(
"z_to_char",
vec![DataType::Utf8],
DataType::Utf8,
datafusion_expr::Volatility::Immutable,
Arc::new(|_args| {
Ok(datafusion_expr::ColumnarValue::Scalar(
datafusion_common::ScalarValue::Utf8(None),
))
}),
)
.with_aliases(["z_date_format"]);
let udf2 = datafusion_expr::create_udf(
"to_character",
vec![DataType::Utf8],
DataType::Utf8,
datafusion_expr::Volatility::Immutable,
Arc::new(|_args| {
Ok(datafusion_expr::ColumnarValue::Scalar(
datafusion_common::ScalarValue::Utf8(None),
))
}),
)
.with_aliases(["z_to_char"]);
state.register_udf(Arc::new(udf1))?;
state.register_udf(Arc::new(udf2))?;
let roundtrip =
SessionStateBuilder::new_from_existing(state.clone());
let roundtrip = roundtrip.build();
dbg!(state.scalar_functions(), roundtrip.scalar_functions());
assert_eq!(state.scalar_functions(), roundtrip.scalar_functions());
Ok(())
}
```
The before state has this mapping:
```
[datafusion/core/src/execution/session_state.rs:2668:9]
state.scalar_functions() = {
"z_date_format": ScalarUDF {
inner: AliasedScalarUDFImpl {
inner: SimpleScalarUDF {
name: "z_to_char",
signature: Signature {
type_signature: Exact(
[
Utf8,
],
),
volatility: Immutable,
parameter_names: None,
},
return_type: Utf8,
fun: "<FUNC>",
},
aliases: [
"z_date_format",
],
},
},
"to_character": ScalarUDF {
inner: AliasedScalarUDFImpl {
inner: SimpleScalarUDF {
name: "to_character",
signature: Signature {
type_signature: Exact(
[
Utf8,
],
),
volatility: Immutable,
parameter_names: None,
},
return_type: Utf8,
fun: "<FUNC>",
},
aliases: [
"z_to_char",
],
},
},
"z_to_char": ScalarUDF {
inner: AliasedScalarUDFImpl {
inner: SimpleScalarUDF {
name: "to_character",
signature: Signature {
type_signature: Exact(
[
Utf8,
],
),
volatility: Immutable,
parameter_names: None,
},
return_type: Utf8,
fun: "<FUNC>",
},
aliases: [
"z_to_char",
],
},
},
}
```
Meanwhile the roundtrip has this mapping:
```
[datafusion/core/src/execution/session_state.rs:2668:9]
roundtrip.scalar_functions() = {
"to_character": ScalarUDF {
inner: AliasedScalarUDFImpl {
inner: SimpleScalarUDF {
name: "to_character",
signature: Signature {
type_signature: Exact(
[
Utf8,
],
),
volatility: Immutable,
parameter_names: None,
},
return_type: Utf8,
fun: "<FUNC>",
},
aliases: [
"z_to_char",
],
},
},
"z_to_char": ScalarUDF {
inner: AliasedScalarUDFImpl {
inner: SimpleScalarUDF {
name: "z_to_char",
signature: Signature {
type_signature: Exact(
[
Utf8,
],
),
volatility: Immutable,
parameter_names: None,
},
return_type: Utf8,
fun: "<FUNC>",
},
aliases: [
"z_date_format",
],
},
},
"z_date_format": ScalarUDF {
inner: AliasedScalarUDFImpl {
inner: SimpleScalarUDF {
name: "z_to_char",
signature: Signature {
type_signature: Exact(
[
Utf8,
],
),
volatility: Immutable,
parameter_names: None,
},
return_type: Utf8,
fun: "<FUNC>",
},
aliases: [
"z_date_format",
],
},
},
}
```
See how `z_to_char` now points to the `z_to_char` UDF instead of the
`to_character` UDF.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]