Re: [PR] [SPARK-42746][SQL] Implement LISTAGG function [spark]

via GitHub Wed, 20 Nov 2024 00:12:58 -0800


cloud-fan commented on code in PR #48748:
URL: https://github.com/apache/spark/pull/48748#discussion_r1849772666



##########
sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala:
##########
@@ -161,6 +161,267 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
     }
   }
 
+  private[this] def hexToBytes(s: String): Array[Byte] = {
+    val byteArray = BigInt(s, 16).toByteArray
+    if (byteArray.length > 1 && byteArray(0) == 0) {
+      // remove sign byte for positive numbers if exists
+      byteArray.tail
+    } else {
+      byteArray
+    }
+  }
+
+  test("listagg function") {
+    withTempView("df", "df2") {
+      Seq(("a", "b"), ("a", "c"), ("b", "c"), ("b", "d"), (null, null)).toDF("a", "b")
+        .createOrReplaceTempView("df")
+      checkAnswer(
+        sql("select listagg(b) from df group by a"),
+        Row(null) :: Row("bc") :: Row("cd") :: Nil)
+
+      checkAnswer(
+        sql("select string_agg(b) from df group by a"),
+        Row(null) :: Row("bc") :: Row("cd") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(b, null) from df group by a"),
+        Row(null) :: Row("bc") :: Row("cd") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(b) from df where 1 != 1"),
+        Row(null) :: Nil)
+
+      checkAnswer(
+        sql("select listagg(b, '|') from df group by a"),
+        Row("b|c") :: Row("c|d") :: Row(null) :: Nil)
+
+      checkAnswer(
+        spark.sql("select listagg(b, :param || ' ') from df group by a", Map("param" -> ",")),
+        Row("b, c") :: Row("c, d") :: Row(null) :: Nil)
+
+      checkAnswer(
+        sql("select listagg(a) from df"),
+        Row("aabb") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(distinct a) from df"),
+        Row("ab") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(a) within group (order by a) from df"),
+        Row("aabb") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(a) within group (order by a desc) from df"),
+        Row("bbaa") :: Nil)
+
+      checkAnswer(
+        sql("""select listagg(a) within group (order by a desc) over (partition by b) from df"""),
+        Row("a") :: Row("ba") :: Row("ba") :: Row("b") :: Row(null) :: Nil)
+
+      checkAnswer(
+        sql("select listagg(a) within group (order by b) from df"),
+        Row("aabb") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(a) within group (order by b desc) from df"),
+        Row("baba") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(a, '|') within group (order by b desc) from df"),
+        Row("b|a|b|a") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(a) within group (order by b desc, a asc) from df"),
+        Row("baba") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(a) within group (order by b desc, a desc) from df"),
+        Row("bbaa") :: Nil)
+
+      checkAnswer(
+        sql("select listagg(c1)from values (X'DEAD'), (X'BEEF') as t(c1)"),
+        Row(hexToBytes("DEADBEEF")) :: Nil)
+
+      checkAnswer(
+        sql("select listagg(c1, null)from values (X'DEAD'), (X'BEEF') as t(c1)"),
+        Row(hexToBytes("DEADBEEF")) :: Nil)
+
+      checkAnswer(
+        sql("select listagg(c1, X'42')from values (X'DEAD'), (X'BEEF') as t(c1)"),
+        Row(hexToBytes("DEAD42BEEF")) :: Nil)
+
+      checkError(
+        exception = intercept[AnalysisException] {
+          sql("select listagg(c1) from values (array('a', 'b')) as t(c1)")
+        },
+        condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
+        parameters = Map(
+          "sqlExpr" -> "\"listagg(c1, NULL)\"",
+          "paramIndex" -> "first",
+          "requiredType" -> "(\"STRING\" or \"BINARY\")",
+          "inputSql" -> "\"c1\"",
+          "inputType" -> "\"ARRAY<STRING>\""),
+        context = ExpectedContext(
+          fragment = "listagg(c1)",
+          start = 7,
+          stop = 17))
+
+      checkError(
+        exception = intercept[AnalysisException] {
+          sql("select listagg(c1, ', ')from values (X'DEAD'), (X'BEEF') as t(c1)")
+        },
+        condition = "DATATYPE_MISMATCH.DATA_DIFF_TYPES",
+        parameters = Map(
+          "sqlExpr" -> "\"listagg(c1, , )\"",
+          "functionName" -> "`listagg`",
+          "dataType" -> "(\"BINARY\" or \"STRING\")"),
+        context = ExpectedContext(
+          fragment = "listagg(c1, ', ')",
+          start = 7,
+          stop = 23))
+
+      checkError(
+        exception = intercept[AnalysisException] {
+          sql("select listagg(b, a) from df group by a")
+        },
+        condition = "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
+        parameters = Map(
+          "sqlExpr" -> "\"listagg(b, a)\"",
+          "inputName" -> "`delimiter`",
+          "inputType" -> "\"STRING\"",
+          "inputExpr" -> "\"a\""),
+        context = ExpectedContext(
+          fragment = "listagg(b, a)",
+          start = 7,
+          stop = 19))
+
+      checkAnswer(
+        sql("select listagg(a) over (order by a) from df"),
+        Row(null) :: Row("aa") :: Row("aa") :: Row("aabb") :: Row("aabb") :: Nil)
+
+      checkError(
+        exception = intercept[AnalysisException] {
+          sql("select listagg(a) within group (order by a) over (order by a) from df")
+        },
+        condition = "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
+        parameters = Map("aggFunc" -> "\"listagg(a, NULL, a)\""),
+        context = ExpectedContext(
+          fragment = "listagg(a) within group (order by a) over (order by a)",
+          start = 7,
+          stop = 60))
+
+      checkError(
+        exception = intercept[AnalysisException] {
+          sql("select string_agg(a) within group (order by a) over (order by a) from df")
+        },
+        condition = "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
+        parameters = Map("aggFunc" -> "\"listagg(a, NULL, a)\""),
+        context = ExpectedContext(
+          fragment = "string_agg(a) within group (order by a) over (order by a)",
+          start = 7,
+          stop = 63))
+
+      checkError(
+        exception = intercept[AnalysisException] {
+          sql("select listagg(distinct a) over (order by a) from df")
+        },
+        condition = "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED",
+        parameters = Map("windowExpr" ->
+          ("\"listagg(DISTINCT a, NULL) " +
+          "OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\"")),
+        context = ExpectedContext(
+          fragment = "listagg(distinct a) over (order by a)",
+          start = 7,
+          stop = 43))
+
+      checkAnswer(
+        sql("select listagg(distinct a) within group (order by a DESC) from df"),
+        Row("ba") :: Nil)
+
+      checkError(
+        exception = intercept[AnalysisException] {
+          sql("select listagg(distinct a) within group (order by b) from df")
+        },
+        condition = "FUNCTION_AND_ORDER_EXPRESSION_MISMATCH",
+        parameters = Map(
+          "functionName" -> "`listagg`",
+          "functionArgs" -> "\"a\"",
+          "orderExpr" -> "\"b\""),
+        context = ExpectedContext(
+          fragment = "listagg(distinct a) within group (order by b)",
+          start = 7,
+          stop = 51))
+
+      checkError(
+        exception = intercept[AnalysisException] {
+          sql("select listagg(distinct a) within group (order by a, b) from df")
+        },
+        condition = "FUNCTION_AND_ORDER_EXPRESSION_MISMATCH",
+        parameters = Map(
+          "functionName" -> "`listagg`",
+          "functionArgs" -> "\"a\"",
+          "orderExpr" -> "\"a\", \"b\""),
+        context = ExpectedContext(
+          fragment = "listagg(distinct a) within group (order by a, b)",
+          start = 7,
+          stop = 54))
+
+      Seq((1, true), (2, false), (3, false)).toDF("a", "b").createOrReplaceTempView("df2")
+
+      checkAnswer(
+        sql("select listagg(a), listagg(b, ',') from df2"),
+        Row("123", "true,false,false") :: Nil)
+    }
+  }
+
+  test("listagg collation test") {
+    checkAnswer(
+      sql("select listagg(c1) within group (order by c1 collate utf8_binary)" +
+        " from values ('a'), ('A'), ('b'), ('B') as t(c1)"),
+      Row("ABab") :: Nil)
+
+    checkAnswer(
+      sql("select listagg(c1) within group (order by c1 collate utf8_lcase)" +
+        " from values ('a'), ('A'), ('b'), ('B') as t(c1)"),
+      Row("aAbB") :: Nil)
+
+    checkAnswer(
+      sql("select listagg(DISTINCT c1 collate utf8_binary)" +
+        " from values ('a'), ('A'), ('b'), ('B') as t(c1)"),
+      Row("aAbB") :: Nil)
+
+    checkAnswer(
+      sql("select listagg(DISTINCT c1 collate utf8_lcase)" +
+        " from values ('a'), ('A'), ('b'), ('B') as t(c1)"),
+      Row("ab") :: Nil)
+
+    checkAnswer(
+      sql("select listagg(DISTINCT c1 collate utf8_lcase)" +
+        " within group (order by c1 collate utf8_lcase)" +
+        " from values ('a'), ('B'), ('b'), ('A') as t(c1)"),
+      Row("aB") :: Nil)
+
+    checkError(
+      exception = intercept[AnalysisException] {
+        sql(
+          """select listagg(DISTINCT c1 collate utf8_lcase)
+            | within group (order by c1 collate utf8_binary)
+            | from values ('a'), ('b'), ('A'), ('B') as t(c1)""".stripMargin)

Review Comment:
   Do we still need these tests if they are already covered by the golden files?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-42746][SQL] Implement LISTAGG function [spark]

Reply via email to