alamb commented on code in PR #9980:
URL: https://github.com/apache/arrow-datafusion/pull/9980#discussion_r1556394912
##########
datafusion/functions-array/src/lib.rs:
##########
@@ -139,13 +326,16 @@ pub fn register_all(registry: &mut dyn FunctionRegistry)
-> Result<()> {
replace::array_replace_all_udf(),
replace::array_replace_udf(),
];
+
+ // TODO: Remove. Define in array_functions directly
functions.into_iter().try_for_each(|udf| {
Review Comment:
I think you could avoid the duplication here by looping over the elements of
`array_functions` and instantiating them all
For example (untested)
```rust
array_functions()
.values()
.try_for_each(|udf_factory| {
let udf = (udf_factory)();
..
});
```
##########
datafusion/functions-array/src/lib.rs:
##########
@@ -98,8 +103,190 @@ pub mod expr_fn {
pub use super::string::string_to_array;
}
+pub type ScalarFactory = Box<dyn Fn() -> Arc<ScalarUDF> + Send + Sync>;
+
+// HashMap Singleton for UDFs
+//
+// Replace register_all with our built-in functions
+// Replace scalar_functions: HashMap<String, Arc<ScalarUDF>> in SessionState
+pub fn array_functions() -> &'static Mutex<HashMap<String, ScalarFactory>> {
+ static FUNCTIONS: OnceLock<Mutex<HashMap<String, ScalarFactory>>> =
OnceLock::new();
+ FUNCTIONS.get_or_init(|| {
+ let mut functions = HashMap::new();
+ functions.insert(
+ String::from("array_to_string"),
+ Box::new(string::array_to_string_udf) as _,
+ );
+ functions.insert(
+ String::from("string_to_array"),
+ Box::new(string::string_to_array_udf) as _,
+ );
+ functions.insert(String::from("range"), Box::new(range::range_udf) as
_);
+ functions.insert(
+ String::from("gen_series"),
+ Box::new(range::gen_series_udf) as _,
+ );
+ functions.insert(
+ String::from("array_dims"),
+ Box::new(dimension::array_dims_udf) as _,
+ );
+ functions.insert(
+ String::from("cardinality"),
+ Box::new(cardinality::cardinality_udf) as _,
+ );
+ functions.insert(
+ String::from("array_ndims"),
+ Box::new(dimension::array_ndims_udf) as _,
+ );
+ functions.insert(
+ String::from("array_append"),
+ Box::new(concat::array_append_udf) as _,
+ );
+ functions.insert(
+ String::from("array_prepend"),
+ Box::new(concat::array_prepend_udf) as _,
+ );
+ functions.insert(
+ String::from("array_concat"),
+ Box::new(concat::array_concat_udf) as _,
+ );
+ functions.insert(
+ String::from("array_except"),
+ Box::new(except::array_except_udf) as _,
+ );
+ functions.insert(
+ String::from("array_element"),
+ Box::new(extract::array_element_udf) as _,
+ );
+ functions.insert(
+ String::from("array_pop_back"),
+ Box::new(extract::array_pop_back_udf) as _,
+ );
+ functions.insert(
+ String::from("array_pop_front"),
+ Box::new(extract::array_pop_front_udf) as _,
+ );
+ functions.insert(
+ String::from("array_slice"),
+ Box::new(extract::array_slice_udf) as _,
+ );
+ functions.insert(
+ String::from("make_array"),
+ Box::new(make_array::make_array_udf) as _,
+ );
+
+ // TODO: Rewrite for other functions as well
+ array_has_aliases().into_iter().for_each(|alias| {
+ functions.insert(alias, Box::new(array_has::array_has_udf) as _);
+ });
+
+ functions.insert(
+ String::from("array_has_all"),
+ Box::new(array_has::array_has_all_udf) as _,
+ );
+ functions.insert(
+ String::from("array_has_any"),
+ Box::new(array_has::array_has_any_udf) as _,
+ );
+ functions.insert(
+ String::from("array_empty"),
+ Box::new(empty::array_empty_udf) as _,
+ );
+ functions.insert(
+ String::from("array_length"),
+ Box::new(length::array_length_udf) as _,
+ );
+ functions.insert(String::from("flatten"),
Box::new(flatten::flatten_udf) as _);
+ functions.insert(
+ String::from("array_sort"),
+ Box::new(sort::array_sort_udf) as _,
+ );
+ functions.insert(
+ String::from("array_repeat"),
+ Box::new(repeat::array_repeat_udf) as _,
+ );
+ functions.insert(
+ String::from("array_resize"),
+ Box::new(resize::array_resize_udf) as _,
+ );
+ functions.insert(
+ String::from("array_reverse"),
+ Box::new(reverse::array_reverse_udf) as _,
+ );
+ functions.insert(
+ String::from("array_distinct"),
+ Box::new(set_ops::array_distinct_udf) as _,
+ );
+ functions.insert(
+ String::from("array_intersect"),
+ Box::new(set_ops::array_intersect_udf) as _,
+ );
+ functions.insert(
+ String::from("array_union"),
+ Box::new(set_ops::array_union_udf) as _,
+ );
+ functions.insert(
+ String::from("array_position"),
+ Box::new(position::array_position_udf) as _,
+ );
+ functions.insert(
+ String::from("array_positions"),
+ Box::new(position::array_positions_udf) as _,
+ );
+ functions.insert(
+ String::from("array_remove"),
+ Box::new(remove::array_remove_udf) as _,
+ );
+ functions.insert(
+ String::from("array_remove_all"),
+ Box::new(remove::array_remove_all_udf) as _,
+ );
+ functions.insert(
+ String::from("array_remove_n"),
+ Box::new(remove::array_remove_n_udf) as _,
+ );
+ functions.insert(
+ String::from("array_replace"),
+ Box::new(replace::array_replace_udf) as _,
+ );
+ functions.insert(
+ String::from("array_replace_all"),
+ Box::new(replace::array_replace_all_udf) as _,
+ );
+ functions.insert(
+ String::from("array_replace_n"),
+ Box::new(replace::array_replace_n_udf) as _,
+ );
+
+ // TODO: Add more builtin functions here
+ Mutex::new(functions)
+ })
+}
+
+// Get an UDF by name
+//
+// Replace with `get_udf`
+// fn get_function_meta(&self, name: &str) -> Option<Arc<ScalarUDF>> {
+// self.state.scalar_functions().get(name).cloned()
+// }
+pub fn get_array_udf(name: &str) -> Option<Arc<ScalarUDF>> {
+ array_functions().lock().unwrap().get(name).map(|f| f())
+}
+
+/// Register a single new UDF, so the user can register their own functions
+///
+/// Repalce old regsiter_udf
+pub fn register_array_udf(name: &str, udf: ScalarFactory) ->
Option<ScalarFactory> {
Review Comment:
I really don't like that this is some global mutable state that can be
changed by the user. I think this will be very confusing: it may have
unintended side effects, and it is not clear whether users should register
their functions with the SessionState or with this global map.
I think an interface that is much simpler to reason about would be one where
the list of functions in `datafusion/functions` is immutable. For example, something like
```rust
pub fn array_functions() -> &'static HashMap<String, ScalarFactory> {
```
##########
datafusion/functions-array/src/lib.rs:
##########
@@ -98,8 +103,190 @@ pub mod expr_fn {
pub use super::string::string_to_array;
}
+pub type ScalarFactory = Box<dyn Fn() -> Arc<ScalarUDF> + Send + Sync>;
+
+// HashMap Singleton for UDFs
+//
+// Replace register_all with our built-in functions
+// Replace scalar_functions: HashMap<String, Arc<ScalarUDF>> in SessionState
+pub fn array_functions() -> &'static Mutex<HashMap<String, ScalarFactory>> {
Review Comment:
Should we name it to reflect the fact the map is of factories (and not
functions)?
Something like
```suggestion
pub fn array_scalar_factories() -> &'static Mutex<HashMap<String,
ScalarFactory>> {
```
🤔
##########
datafusion/functions-array/src/array_has.rs:
##########
@@ -53,6 +53,16 @@ make_udf_function!(ArrayHasAny,
array_has_any_udf // internal function name
);
+// TODO: Macro-ify aliases for all functions
+pub fn array_has_aliases() -> Vec<String> {
Review Comment:
The downsides with this approach:
1. Every time this function is called, a new Vec of Strings gets
made, which is likely close to the same cost as instantiating the struct in the
first place
2. This will be more expensive if the struct is ever actually called
3. There are now two Vecs of arrays that need to be kept in sync
##########
datafusion/core/src/execution/context/mod.rs:
##########
@@ -1329,6 +1329,8 @@ pub struct SessionState {
table_functions: HashMap<String, Arc<TableFunction>>,
/// Scalar functions that are registered with the context
scalar_functions: HashMap<String, Arc<ScalarUDF>>,
+ // Need to implement Clone for this, so probably not a good idea.
+ // scalar_functions_impl: HashMap<String, ScalarFactory>,
Review Comment:
It isn't clear to me if @viirya 's usecase needs to defer udf creation in
all cases. If the goal is really to avoid instantiating *all* `ScalarUDF`s, I
think a better approach would be to change the signature of
`FunctionRegistry`
However, we may be able to avoid changing session state at all (and simply
have an API to get the function factories in addition to registering all
functions with the SessionState)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]