comphead commented on code in PR #22473: URL: https://github.com/apache/datafusion/pull/22473#discussion_r3367967955
########## datafusion/sqllogictest/test_files/spark/array/arrays_zip.slt: ########## @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +## Spark `arrays_zip` returns a list of structs. Field naming rules +## (per SPARK-35876 / `DataFrameFunctionsSuite#"dataframe arrays_zip function"`): +## - Column reference -> use the column name (`a`, `b`, ...) +## - Aliased expression -> use the alias (`x`, `y`, ...) +## - Anything else (literals, -> use a 0-based ordinal (`0`, `1`, ...) +## function calls, etc.) +## +## DataFusion's native `arrays_zip` uses 1-based ordinals; the Spark variant is +## installed by `SessionStateBuilderSpark::with_spark_features` together with +## the analyzer-time `SparkArraysZipRewrite` rule that captures the field name +## for each argument before optimizer passes can rename it. + +# Spark docs example: arrays_zip(array(1, 2, 3), array(2, 3, 4), array(3, 4, 5)) +query ? +SELECT arrays_zip(array(1, 2, 3), array(2, 3, 4), array(3, 4, 5)); +---- +[{0: 1, 1: 2, 2: 3}, {0: 2, 1: 3, 2: 4}, {0: 3, 1: 4, 2: 5}] + +# Spark df1: equal-length integer arrays. +# Seq(9001, 9002, 9003), Seq(4, 5, 6) +query ? 
+SELECT arrays_zip(array(9001, 9002, 9003), array(4, 5, 6)); +---- +[{0: 9001, 1: 4}, {0: 9002, 1: 5}, {0: 9003, 1: 6}] + +# Spark df2: three arrays with mixed element types (string, boolean, int). +# Seq("a", "b"), Seq(true, false), Seq(10, 11) +query ? +SELECT arrays_zip(array('a', 'b'), array(true, false), array(10, 11)); +---- +[{0: a, 1: true, 2: 10}, {0: b, 1: false, 2: 11}] + +# Spark df3: shorter first array padded with NULL. +# Seq("a", "b"), Seq(4, 5, 6) +query ? +SELECT arrays_zip(array('a', 'b'), array(4, 5, 6)); +---- +[{0: a, 1: 4}, {0: b, 1: 5}, {0: NULL, 1: 6}] + +# Spark df4: NULL inside an array plus shorter second array. +# Seq("a", "b", null), Seq(4L) +query ? +SELECT arrays_zip(array('a', 'b', NULL), array(arrow_cast(4, 'Int64'))); +---- +[{0: a, 1: 4}, {0: b, 1: NULL}, {0: NULL, 1: NULL}] + +# Spark df5: four arrays exercising single-element, single-null, empty, and all-null cases. +# Seq(-1), Seq(null), Seq(), Seq(null, null) +query ? +SELECT arrays_zip( + array(-1), + array(arrow_cast(NULL, 'Int32')), + arrow_cast(make_array(), 'List(Int32)'), + array(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32')) +); +---- +[{0: -1, 1: NULL, 2: NULL, 3: NULL}, {0: NULL, 1: NULL, 2: NULL, 3: NULL}] + +# Spark df7: nested arrays zipped with doubles. +# Seq(Seq(1, 2, 3), Seq(4, 5)), Seq(1.1, 2.2) +query ? +SELECT arrays_zip(array(array(1, 2, 3), array(4, 5)), array(1.1, 2.2)); +---- +[{0: [1, 2, 3], 1: 1.1}, {0: [4, 5], 1: 2.2}] + +# SPARK-24633: arrays_zip with many single-element arrays merges into one row. +# (0 to 5).map(x => array(id + x)) on spark.range(1) → Row(Seq(Row(0, 1, 2, 3, 4, 5))) +query ? +SELECT arrays_zip(array(0), array(1), array(2), array(3), array(4), array(5)); +---- +[{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}] + +# Both arguments are NULL list → result is NULL. +query ? +SELECT arrays_zip(arrow_cast(NULL, 'List(Int32)'), arrow_cast(NULL, 'List(Int32)')); +---- +NULL + +# Single argument: still produces a 0-based struct. +query ? 
+SELECT arrays_zip(array(1, 2, 3)); +---- +[{0: 1}, {0: 2}, {0: 3}] + +# Column-level: multiple rows with different lengths. Struct fields take the +# column names from the outer SELECT (Spark SPARK-35876 semantics). +query ? +SELECT arrays_zip(a, b) +FROM (VALUES ([1, 2], [10, 20]), ([3, 4, 5], [30]), ([6], [60, 70])) AS t(a, b); +---- +[{a: 1, b: 10}, {a: 2, b: 20}] +[{a: 3, b: 30}, {a: 4, b: NULL}, {a: 5, b: NULL}] +[{a: 6, b: 60}, {a: NULL, b: 70}] + +# Column-level: NULL rows in the input. +query ? +SELECT arrays_zip(a, b) +FROM (VALUES ([1, 2], [10, 20]), (null, [30, 40]), ([5, 6], null)) AS t(a, b); +---- +[{a: 1, b: 10}, {a: 2, b: 20}] +[{a: NULL, b: 30}, {a: NULL, b: 40}] +[{a: 5, b: NULL}, {a: 6, b: NULL}] + +# Aliased input columns: struct fields take the aliases the inner SELECT +# assigned. +query ? +SELECT arrays_zip(x, y) +FROM (SELECT a AS x, b AS y FROM (VALUES ([1, 2], [10, 20])) AS t(a, b)); +---- +[{x: 1, y: 10}, {x: 2, y: 20}] + +# Mixed: column reference + literal expression. Column keeps its name; the +# literal-built array falls back to a 0-based ordinal. +query ? Review Comment: That's a nice test. I just checked, and Spark does the same: ``` scala> spark.sql("select arrays_zip(a, array(3, 4, 5)) from (select array(1, 2, 3) a, array(3, 4, 5))").printSchema root |-- arrays_zip(a, array(3, 4, 5)): array (nullable = false) | |-- element: struct (containsNull = false) | | |-- a: integer (nullable = true) | | |-- 1: integer (nullable = true) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
