Jefffrey commented on code in PR #20440:
URL: https://github.com/apache/datafusion/pull/20440#discussion_r2832258368
##########
datafusion/functions-nested/src/set_ops.rs:
##########
@@ -64,6 +65,13 @@ make_udf_expr_and_func!(
array_distinct_udf
);
+make_udf_expr_and_func!(
Review Comment:
This should probably go in a separate file; I'm not sure it fits with the set
operator functions.
##########
datafusion/functions-nested/src/set_ops.rs:
##########
@@ -582,6 +590,254 @@ fn general_array_distinct<OffsetSize: OffsetSizeTrait>(
)?))
}
+#[user_doc(
+ doc_section(label = "Array Functions"),
+ description = "Returns an array of structs created by combining the
elements of each input array at the same index. If the arrays have different
lengths, shorter arrays are padded with NULLs.",
+ syntax_example = "arrays_zip(array1, array2[, ..., array_n])",
+ sql_example = r#"```sql
+> select arrays_zip([1, 2, 3], ['a', 'b', 'c']);
++---------------------------------------------------+
+| arrays_zip([1, 2, 3], ['a', 'b', 'c']) |
++---------------------------------------------------+
+| [{c0: 1, c1: a}, {c0: 2, c1: b}, {c0: 3, c1: c}] |
++---------------------------------------------------+
+> select arrays_zip([1, 2], [3, 4, 5]);
++---------------------------------------------------+
+| arrays_zip([1, 2], [3, 4, 5]) |
++---------------------------------------------------+
+| [{c0: 1, c1: 3}, {c0: 2, c1: 4}, {c0: , c1: 5}] |
++---------------------------------------------------+
+```"#,
+ argument(name = "array1", description = "First array expression."),
+ argument(name = "array2", description = "Second array expression."),
+ argument(name = "array_n", description = "Subsequent array expressions.")
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct ArraysZip {
+ signature: Signature,
+ aliases: Vec<String>,
+}
+
+impl Default for ArraysZip {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl ArraysZip {
+ pub fn new() -> Self {
+ Self {
+ signature: Signature::variadic_any(Volatility::Immutable),
+ aliases: vec![String::from("list_zip")],
+ }
+ }
+}
+
+impl ScalarUDFImpl for ArraysZip {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "arrays_zip"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ if arg_types.is_empty() {
+ return exec_err!("arrays_zip requires at least two arguments");
+ }
+
+ let mut fields = Vec::with_capacity(arg_types.len());
+ for (i, arg_type) in arg_types.iter().enumerate() {
+ let element_type = match arg_type {
+ List(field) | LargeList(field) => field.data_type().clone(),
Review Comment:
Do we need to consider `FixedSizeList` for completeness?
##########
datafusion/functions-nested/src/set_ops.rs:
##########
@@ -582,6 +590,254 @@ fn general_array_distinct<OffsetSize: OffsetSizeTrait>(
)?))
}
+#[user_doc(
+ doc_section(label = "Array Functions"),
+ description = "Returns an array of structs created by combining the
elements of each input array at the same index. If the arrays have different
lengths, shorter arrays are padded with NULLs.",
+ syntax_example = "arrays_zip(array1, array2[, ..., array_n])",
+ sql_example = r#"```sql
+> select arrays_zip([1, 2, 3], ['a', 'b', 'c']);
++---------------------------------------------------+
+| arrays_zip([1, 2, 3], ['a', 'b', 'c']) |
++---------------------------------------------------+
+| [{c0: 1, c1: a}, {c0: 2, c1: b}, {c0: 3, c1: c}] |
++---------------------------------------------------+
+> select arrays_zip([1, 2], [3, 4, 5]);
++---------------------------------------------------+
+| arrays_zip([1, 2], [3, 4, 5]) |
++---------------------------------------------------+
+| [{c0: 1, c1: 3}, {c0: 2, c1: 4}, {c0: , c1: 5}] |
++---------------------------------------------------+
+```"#,
+ argument(name = "array1", description = "First array expression."),
+ argument(name = "array2", description = "Second array expression."),
+ argument(name = "array_n", description = "Subsequent array expressions.")
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct ArraysZip {
+ signature: Signature,
+ aliases: Vec<String>,
+}
+
+impl Default for ArraysZip {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl ArraysZip {
+ pub fn new() -> Self {
+ Self {
+ signature: Signature::variadic_any(Volatility::Immutable),
+ aliases: vec![String::from("list_zip")],
+ }
+ }
+}
+
+impl ScalarUDFImpl for ArraysZip {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "arrays_zip"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ if arg_types.is_empty() {
+ return exec_err!("arrays_zip requires at least two arguments");
+ }
+
+ let mut fields = Vec::with_capacity(arg_types.len());
+ for (i, arg_type) in arg_types.iter().enumerate() {
+ let element_type = match arg_type {
+ List(field) | LargeList(field) => field.data_type().clone(),
+ Null => Null,
+ dt => {
+ return exec_err!("arrays_zip expects array arguments, got
{dt}");
+ }
+ };
+ fields.push(Field::new(format!("c{i}"), element_type, true));
+ }
+
+ Ok(List(Arc::new(Field::new_list_field(
+ DataType::Struct(Fields::from(fields)),
+ true,
+ ))))
+ }
+
+ fn invoke_with_args(
+ &self,
+ args: datafusion_expr::ScalarFunctionArgs,
+ ) -> Result<ColumnarValue> {
+ make_scalar_function(arrays_zip_inner)(&args.args)
+ }
+
+ fn aliases(&self) -> &[String] {
+ &self.aliases
+ }
+
+ fn documentation(&self) -> Option<&Documentation> {
+ self.doc()
+ }
+}
+
+/// Core implementation for arrays_zip.
+///
+/// Takes N list arrays and produces a list of structs where each struct
+/// has one field per input array. If arrays within a row have different
+/// lengths, shorter arrays are padded with NULLs.
+fn arrays_zip_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+ if args.len() < 2 {
+ return exec_err!("arrays_zip requires at least two arguments");
+ }
+
+ let num_rows = args[0].len();
+
+ // Extract element type from each list argument
+ let mut list_arrays: Vec<Option<&ListArray>> =
Vec::with_capacity(args.len());
+ let mut element_types: Vec<DataType> = Vec::with_capacity(args.len());
+
+ for (i, arg) in args.iter().enumerate() {
+ match arg.data_type() {
+ List(field) => {
+ let list_arr = as_list_array(arg)?;
+ element_types.push(field.data_type().clone());
+ list_arrays.push(Some(list_arr));
+ }
+ Null => {
+ element_types.push(Null);
+ list_arrays.push(None);
+ }
+ dt => {
Review Comment:
We accept `LargeList` in `return_type`, but we don't support it here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]