[GitHub] [spark] HyukjinKwon commented on a diff in pull request #36150: [SPARK-38864][SQL] Add melt / unpivot to Dataset

GitBox Tue, 24 May 2022 17:23:34 -0700


HyukjinKwon commented on code in PR #36150:
URL: https://github.com/apache/spark/pull/36150#discussion_r881077122



##########
sql/core/src/test/scala/org/apache/spark/sql/MeltSuite.scala:
##########
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import scala.reflect.ClassTag
+
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types.{IntegerType, LongType, StringType, 
StructField, StructType}
+
+/**
+ * Comprehensive tests for Melt.of(), which is used by Dataset.melt.
+ */
+class MeltSuite extends QueryTest
+  with SharedSparkSession {
+  import testImplicits._
+
+  lazy val meltWideDataDs: Dataset[WideData] = Seq(  // wide input fixture; WideData is declared outside this excerpt
+    WideData(1, "one", "One", Some(1), Some(1L)),  // every field populated
+    WideData(2, "two", null, None, Some(2L)),  // second string and the Option[Int] field missing
+    WideData(3, null, "three", Some(3), None),  // first string and the Option[Long] field missing
+    WideData(4, null, null, None, None)  // only the first (id) field populated
+  ).toDS()
+
+  val meltedWideDataRows = Seq(  // expected (id, variable name, value) rows for melting str1 and str2, nulls kept
+    Row(1, "str1", "one"),
+    Row(1, "str2", "One"),
+    Row(2, "str1", "two"),
+    Row(2, "str2", null),
+    Row(3, "str1", null),
+    Row(3, "str2", "three"),
+    Row(4, "str1", null),
+    Row(4, "str2", null)
+  )
+
+  val meltedWideDataWithoutIdRows: Seq[Row] =  // meltedWideDataRows without the leading id: (variable name, value)
+    meltedWideDataRows.map(row => Row(row.getString(1), row.getString(2)))
+
+  private def assertException[T <: Exception : ClassTag](func: => 
Any)(message: String): Unit = {
+    val exception = intercept[T] { func }  // evaluates the by-name `func`; the test fails unless it throws a T
+    assert(exception.getMessage === message)  // the message must equal `message` exactly, not merely contain it
+  }
+
+  test("melt without ids or values") {
+    // do not drop nulls: no ids or values given, so both selected columns are expected to be melted
+    checkAnswer(
+      Melt.of(meltWideDataDs.select($"str1", $"str2"), Seq.empty),
+      meltedWideDataWithoutIdRows
+    )
+
+    // drop nulls: rows whose value (index 1, there is no id column) is null are expected to be absent
+    checkAnswer(
+      Melt.of(meltWideDataDs.select($"str1", $"str2"), Seq.empty, dropNulls = 
true),
+      meltedWideDataWithoutIdRows.filter(row => !row.isNullAt(1))
+    )
+  }
+
+  test("melt without ids") {
+    // do not drop nulls: same expectation as with omitted values, but str1 and str2 are named explicitly
+    checkAnswer(
+      Melt.of(meltWideDataDs.select($"str1", $"str2"), Seq.empty, Seq("str1", 
"str2")),
+      meltedWideDataWithoutIdRows
+    )
+
+    // drop nulls: rows whose value (index 1) is null are expected to be absent
+    checkAnswer(
+      Melt.of(meltWideDataDs.select($"str1", $"str2"),
+        Seq.empty, Seq("str1", "str2"), dropNulls = true),
+      meltedWideDataWithoutIdRows.filter(row => !row.isNullAt(1))
+    )
+  }
+
+  test("melt with single id") {
+    // do not drop nulls: id is carried through as the first column, str1 and str2 are melted
+    checkAnswer(
+      Melt.of(meltWideDataDs, Seq("id"), Seq("str1", "str2")),
+      meltedWideDataRows
+    )
+
+    // drop nulls: rows whose value (index 2, after id and variable name) is null are expected to be absent
+    checkAnswer(
+      Melt.of(meltWideDataDs, Seq("id"), Seq("str1", "str2"), dropNulls = 
true),
+      meltedWideDataRows.filter(row => !row.isNullAt(2))
+    )
+  }
+
+  test("melt with two ids") {
+    val meltedRows = Seq(  // expected rows: (id, int1, variable name, value); int1 may itself be null
+      Row(1, 1, "str1", "one"),
+      Row(1, 1, "str2", "One"),
+      Row(2, null, "str1", "two"),
+      Row(2, null, "str2", null),
+      Row(3, 3, "str1", null),
+      Row(3, 3, "str2", "three"),
+      Row(4, null, "str1", null),
+      Row(4, null, "str2", null)
+    )
+
+    // do not drop nulls: both id columns are carried through in the given order
+    checkAnswer(
+      Melt.of(meltWideDataDs, Seq("id", "int1"), Seq("str1", "str2")),
+      meltedRows
+    )
+
+    // drop nulls: only a null value (index 3) removes a row; a null int1 id does not
+    checkAnswer(
+      Melt.of(meltWideDataDs, Seq("id", "int1"), Seq("str1", "str2"), 
dropNulls = true),
+      meltedRows.filter(row => !row.isNullAt(3))
+    )
+  }
+
+  test("melt without values") {
+    // do not drop nulls: values are omitted, so all non-id columns (str1, str2) are expected to be melted
+    checkAnswer(
+      Melt.of(meltWideDataDs.select($"id", $"str1", $"str2"), Seq("id")),
+      meltedWideDataRows
+    )
+
+    // do drop nulls: rows whose value (index 2) is null are expected to be absent
+    checkAnswer(
+      Melt.of(meltWideDataDs.select($"id", $"str1", $"str2"), Seq("id"), 
dropNulls = true),
+      meltedWideDataRows.filter(row => !row.isNullAt(2))
+    )
+  }
+
+  test("melt with variable / value value columns") {
+    // with value column `variable` and `value`: str1/str2 are renamed to these names and melted as values
+    checkAnswer(
+      Melt.of(meltWideDataDs.withColumnRenamed("str1", "variable")
+        .withColumnRenamed("str2", "value"),
+        Seq("id"), Seq("variable", "value")),
+      meltedWideDataRows.map(row => Row(  // same rows, with the variable name mapped to the renamed column
+        row.getInt(0),
+        row.getString(1) match {
+          case "str1" => "variable"
+          case "str2" => "value"
+        },
+        row.getString(2)
+      ))
+    )
+
+    // with un-referenced column `variable` and `value`: int1/long1 are renamed but are neither ids nor values
+    checkAnswer(
+      Melt.of(meltWideDataDs.withColumnRenamed("int1", "variable")
+        .withColumnRenamed("long1", "value"),
+        Seq("id"), Seq("str1", "str2")),
+      meltedWideDataRows
+    )
+  }
+
+  test("melt with incompatible value types") {  // using str1 and int1 together as values is expected to be rejected
+    assertException[IllegalArgumentException] {
+      Melt.of(meltWideDataDs, Seq("id"), Seq("str1", "int1"))
+    }("All values must be of compatible types, " +
+      "types StringType and IntegerType are not compatible")
+  }
+
+  test("melt with compatible value types") {
+    val df = Melt.of(meltWideDataDs, Seq("id"), Seq("int1", "long1"))  // int1 and long1 melted into one value column
+
+    assert(df.schema === StructType(Seq(  // the shared value column is expected to be LongType and nullable
+      StructField("id", IntegerType, nullable = false),
+      StructField("variable", StringType, nullable = false),
+      StructField("value", LongType, nullable = true)
+    )))
+
+    val meltedRows = Seq(  // expected (id, variable name, value) rows; int1 values appear as Long
+      Row(1, "int1", 1L),
+      Row(1, "long1", 1L),
+      Row(2, "int1", null),
+      Row(2, "long1", 2L),
+      Row(3, "int1", 3L),
+      Row(3, "long1", null),
+      Row(4, "int1", null),
+      Row(4, "long1", null)
+    )
+
+    // do not drop nulls: reuses the `df` whose schema was checked above
+    checkAnswer(
+      df,
+      meltedRows
+    )
+
+    // drop nulls: rows whose value (index 2) is null are expected to be absent
+    checkAnswer(
+      Melt.of(meltWideDataDs, Seq("id"), Seq("int1", "long1"), dropNulls = 
true),
+      meltedRows.filter(row => !row.isNullAt(2))
+    )
+  }
+
+  test("melt with invalid arguments") {
+    // melting with column in both ids and values: the expected message lists every clashing column (str1, int1)
+    assertException[IllegalArgumentException] {
+      Melt.of(meltWideDataDs, Seq("str1", "int1"), Seq("str1", "str2", "int1", 
"long1"))
+    }("A column cannot be both an id and a value column: str1, int1")
+
+    // melting with empty list of value columns
+    // where potential value columns are of incompatible types (the non-id columns appear to be used as values)
+    assertException[IllegalArgumentException] {
+      Melt.of(meltWideDataDs, Seq.empty, Seq.empty)
+    }("All values must be of compatible types, " +
+      "types StringType and LongType are not compatible")
+    assertException[IllegalArgumentException] {
+      Melt.of(meltWideDataDs, Seq("id"), Seq.empty)
+    }("All values must be of compatible types, " +
+      "types StringType and IntegerType are not compatible")
+
+    // melting without giving values and no non-id columns: nothing is left to melt
+    assertException[IllegalArgumentException] {
+      Melt.of(meltWideDataDs.select("id"), Seq("id"))
+    }("The dataset has no non-id columns to melt")
+
+    // melting with id column `variable`: clashes with the variable column name used when none is given
+    assertException[IllegalArgumentException] {
+      Melt.of(
+        meltWideDataDs.withColumnRenamed("id", "variable"),
+        Seq("variable"), Seq("str1", "str2")
+      )
+    }("Column name for variable column (variable) must not exist among id 
columns: variable")
+    checkAnswer(  // the same input is expected to work once other output column names (var, val) are chosen
+      Melt.of(meltWideDataDs.withColumnRenamed("id", "variable"),
+        Seq("variable"), Seq("str1", "str2"), variableColumnName = "var", 
valueColumnName = "val"),
+      meltedWideDataRows
+    )
+
+    // melting with id column `value`: clashes with the value column name used when none is given
+    assertException[IllegalArgumentException] {
+      Melt.of(meltWideDataDs.withColumnRenamed("id", "value"),
+        Seq("value"), Seq("str1", "str2"))
+    }("Column name for value column (value) must not exist among id columns: 
value")
+    checkAnswer(  // the same input is expected to work once other output column names (var, val) are chosen
+      Melt.of(meltWideDataDs.withColumnRenamed("id", "value"),
+        Seq("value"), Seq("str1", "str2"), variableColumnName = "var", 
valueColumnName = "val"),
+      meltedWideDataRows
+    )
+
+  }
+
+  test("melt with dot and backtick") {
+    val df = meltWideDataDs  // give id, str1 and str2 names that contain a dot
+      .withColumnRenamed("id", "an.id")
+      .withColumnRenamed("str1", "str.one")
+      .withColumnRenamed("str2", "str.two")
+    checkAnswer(  // backtick-quoted names are expected to resolve to the dotted top-level columns
+      Melt.of(df, Seq("`an.id`"), Seq("`str.one`", "`str.two`")),
+      meltedWideDataRows.map(row => Row(  // variable names are expected without backticks
+        row.getInt(0),
+        row.getString(1) match {
+          case "str1" => "str.one"
+          case "str2" => "str.two"
+        },
+        row.getString(2)
+      ))
+    )
+
+    // without backticks, this references struct fields, which do not exist
+    assertException[IllegalArgumentException] {
+      Melt.of(df, Seq("an.id"), Seq("str.one", "str.two")).collect()
+    }("Unknown columns: an.id, str.one, str.two, dataset has: an.id, str.one, 
str.two, int1, long1")
+  }
+
+  /** TODO: Would be nice to melt on struct fields.

Review Comment:
   Let's file a separate JIRA for this, and change this comment to `TODO(SPARK-XXXX): ... ` so that it references the new ticket.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] HyukjinKwon commented on a diff in pull request #36150: [SPARK-38864][SQL] Add melt / unpivot to Dataset

Reply via email to