[GitHub] [spark] EnricoMi commented on a diff in pull request #36150: [SPARK-38864][SQL] Add unpivot / melt to Dataset

GitBox Mon, 18 Jul 2022 03:20:49 -0700


EnricoMi commented on code in PR #36150:
URL: https://github.com/apache/spark/pull/36150#discussion_r923187336



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala:
##########
@@ -863,6 +868,36 @@ class Analyzer(override val catalogManager: CatalogManager)
     }
   }
 
+  object ResolveMelt extends Rule[LogicalPlan] {
+    def apply(plan: LogicalPlan): LogicalPlan = 
plan.resolveOperatorsWithPruning(
+      _.containsPattern(MELT), ruleId) {
+
+      // once children and ids are resolved, we can determine values, if none were given
+      case m: Melt if m.childrenResolved && m.ids.forall(_.resolved) && 
m.values.isEmpty =>
+        m.copy(values = m.child.output.diff(m.ids))

Review Comment:
   Now I got it:
   ```
   df.unpivot(Array($"id" * 2), Array.empty, "var", "val")
   df.unpivot(Array($"id".as("uid")), Array.empty, "var", "val")
   ```
   Yes, in both cases all columns will be unpivoted (become values), including the "id"
   column, because the given ids are expressions and do not match any output column of `df`.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala:
##########
@@ -1245,6 +1245,44 @@ case class Pivot(
   override protected def withNewChildInternal(newChild: LogicalPlan): Pivot = 
copy(child = newChild)
 }
 
+/**
+ * A constructor for creating an Unpivot, which will later be converted to an 
[[Expand]]
+ * during the query analysis.
+ *
+ * An empty values array will be replaced during analysis with all resolved outputs of child except
+ * the ids. This expansion makes it easy to unpivot all non-id columns.
+ *
+ * @see `org.apache.spark.sql.catalyst.analysis.Analyzer.ResolveUnpivot`
+ *
+ * The type of the value column is derived from all value columns during 
analysis once all values
+ * are resolved. All values' types have to be compatible, otherwise the result 
value column cannot
+ * be assigned the individual values and an AnalysisException is thrown.
+ *
+ * @see 
`org.apache.spark.sql.catalyst.analysis.TypeCoercionBase.UnpivotCoercion`
+ *
+ * @param ids                Id columns
+ * @param values             Value columns to unpivot
+ * @param variableColumnName Name of the variable column
+ * @param valueColumnName    Name of the value column
+ * @param valueType          Type of value column once known
+ * @param child              Child operator
+ */
+case class Unpivot(
+    ids: Seq[NamedExpression],
+    values: Seq[NamedExpression],
+    variableColumnName: String,
+    valueColumnName: String,
+    valueType: Option[DataType],

Review Comment:
   Right, `valueType` is not needed: it can be derived from `values.head.dataType`,
   because `values` is never empty once resolved. I have reworked that in
   61c8a3b36ddbbb05e8c066c05b78724eaae201cf



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala:
##########
@@ -92,6 +92,19 @@ private[sql] object QueryCompilationErrors extends 
QueryErrorsBase {
         pivotVal.toString, pivotVal.dataType.simpleString, 
pivotCol.dataType.catalogString))
   }
 
+  def unpivotRequiresValueColumns(ids: Seq[NamedExpression]): Throwable = {
+    new AnalysisException(
+      errorClass = "UNPIVOT_REQUIRES_VALUE_COLUMNS",
+      messageParameters = Array(ids.map(id => 
toSQLId(id.toString)).mkString(", ")))
+  }
+
+  def unpivotValDataTypeMismatchError(values: Seq[NamedExpression]): Throwable 
= {
+    val dataTypes = values.map(_.dataType).toSet.map((dt: DataType) => 
toSQLType(dt))

Review Comment:
   Added three columns per datatype to the error: 
ad8b5d3ccfb7b31a0577fe2fe042d16ee14e9c79



##########
sql/core/src/test/scala/org/apache/spark/sql/DatasetUnpivotSuite.scala:
##########
@@ -0,0 +1,488 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.errors.QueryErrorsSuiteBase
+import org.apache.spark.sql.functions.{length, struct, sum}
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types._
+
+/**
+ * Comprehensive tests for Dataset.unpivot.
+ */
+class DatasetUnpivotSuite extends QueryTest
+  with QueryErrorsSuiteBase
+  with SharedSparkSession {
+  import testImplicits._
+
+  lazy val wideDataDs: Dataset[WideData] = Seq(
+    WideData(1, "one", "One", Some(1), Some(1L)),
+    WideData(2, "two", null, None, Some(2L)),
+    WideData(3, null, "three", Some(3), None),
+    WideData(4, null, null, None, None)
+  ).toDS()
+
+  val longDataRows = Seq(
+    Row(1, "str1", "one"),
+    Row(1, "str2", "One"),
+    Row(2, "str1", "two"),
+    Row(2, "str2", null),
+    Row(3, "str1", null),
+    Row(3, "str2", "three"),
+    Row(4, "str1", null),
+    Row(4, "str2", null)
+  )
+
+  val longDataWithoutIdRows: Seq[Row] =
+    longDataRows.map(row => Row(row.getString(1), row.getString(2)))
+
+  val longSchema: StructType = StructType(Seq(
+    StructField("id", IntegerType, nullable = false),
+    StructField("var", StringType, nullable = false),
+    StructField("val", StringType, nullable = true)
+  ))
+
+  lazy val wideStructDataDs: DataFrame = wideDataDs.select(
+    struct($"id").as("an"),
+    struct(
+      $"str1".as("one"),
+      $"str2".as("two")
+    ).as("str")
+  )
+  val longStructDataRows: Seq[Row] = longDataRows.map(row =>
+    Row(
+      row.getInt(0),
+      row.getString(1) match {
+        case "str1" => "one"
+        case "str2" => "two"
+      },
+      row.getString(2))
+  )
+
+  test("overloaded unpivot without values") {
+    val ds = wideDataDs.select($"id", $"str1", $"str2")
+    checkAnswer(
+      ds.unpivot(Array($"id"), "var", "val"),
+      ds.unpivot(Array($"id"), Array.empty, "var", "val"))
+  }
+
+  test("unpivot with single id") {
+    val unpivoted = wideDataDs
+      .unpivot(
+        Array($"id"),
+        Array($"str1", $"str2"),
+        variableColumnName = "var",
+        valueColumnName = "val")
+    assert(unpivoted.schema === longSchema)
+    checkAnswer(unpivoted, longDataRows)
+  }
+
+  test("unpivot with two ids") {
+    val unpivotedRows = Seq(
+      Row(1, 1, "str1", "one"),
+      Row(1, 1, "str2", "One"),
+      Row(2, null, "str1", "two"),
+      Row(2, null, "str2", null),
+      Row(3, 3, "str1", null),
+      Row(3, 3, "str2", "three"),
+      Row(4, null, "str1", null),
+      Row(4, null, "str2", null))
+
+    val unpivoted = wideDataDs
+      .unpivot(
+        Array($"id", $"int1"),
+        Array($"str1", $"str2"),
+        variableColumnName = "var",
+        valueColumnName = "val")
+    assert(unpivoted.schema === StructType(Seq(
+      StructField("id", IntegerType, nullable = false),
+      StructField("int1", IntegerType, nullable = true),
+      StructField("var", StringType, nullable = false),
+      StructField("val", StringType, nullable = true))))
+    checkAnswer(unpivoted, unpivotedRows)
+  }
+
+  test("unpivot without ids") {
+    val unpivoted = wideDataDs
+      .unpivot(
+        Array.empty,
+        Array($"str1", $"str2"),
+        variableColumnName = "var",
+        valueColumnName = "val")
+    assert(unpivoted.schema === StructType(Seq(
+      StructField("var", StringType, nullable = false),
+      StructField("val", StringType, nullable = true))))
+    checkAnswer(unpivoted, longDataWithoutIdRows)
+  }
+
+  test("unpivot without values") {
+    val unpivoted = wideDataDs.select($"id", $"str1", $"str2")
+      .unpivot(
+        Array($"id"),
+        variableColumnName = "var",
+        valueColumnName = "val")
+    assert(unpivoted.schema === longSchema)
+    checkAnswer(unpivoted, longDataRows)
+
+    val unpivoted2 = wideDataDs.select($"id", $"str1", $"str2")
+      .unpivot(
+        Array($"id"),
+        Array.empty,
+        variableColumnName = "var",
+        valueColumnName = "val")
+    assert(unpivoted2.schema === longSchema)
+    checkAnswer(unpivoted2, longDataRows)
+  }
+
+  test("unpivot without ids or values") {
+    val unpivoted = wideDataDs.select($"str1", $"str2")
+      .unpivot(
+        Array.empty,
+        Array.empty,
+        variableColumnName = "var",
+        valueColumnName = "val")
+    assert(unpivoted.schema === StructType(Seq(
+      StructField("var", StringType, nullable = false),
+      StructField("val", StringType, nullable = true))))
+    checkAnswer(unpivoted, longDataWithoutIdRows)
+  }
+
+  test("unpivot with star values") {
+    val unpivoted = wideDataDs.select($"str1", $"str2")
+      .unpivot(
+        Array.empty,
+        Array($"*"),
+        variableColumnName = "var",
+        valueColumnName = "val")
+    assert(unpivoted.schema === StructType(Seq(
+      StructField("var", StringType, nullable = false),
+      StructField("val", StringType, nullable = true))))
+    checkAnswer(unpivoted, longDataWithoutIdRows)
+  }
+
+  test("unpivot with id and star values") {
+    val unpivoted = wideDataDs.select($"id", $"int1", $"long1")
+      .unpivot(
+        Array($"id"),
+        Array($"*"),
+        variableColumnName = "var",
+        valueColumnName = "val")
+
+    assert(unpivoted.schema === StructType(Seq(
+      StructField("id", IntegerType, nullable = false),
+      StructField("var", StringType, nullable = false),
+      StructField("val", LongType, nullable = true))))
+
+    checkAnswer(unpivoted, wideDataDs.collect().flatMap { row => Seq(
+      Row(row.id, "id", row.id),
+      Row(row.id, "int1", row.int1.orNull),
+      Row(row.id, "long1", row.long1.orNull)
+    )})
+  }
+
+  test("unpivot with expressions") {
+    // ids and values are all expressions (computed)
+    val unpivoted = wideDataDs
+      .unpivot(
+        Array(($"id" * 10).as("primary"), $"str1".as("secondary")),
+        Array(($"int1" + $"long1").as("sum"), length($"str2").as("len")),
+        variableColumnName = "var",
+        valueColumnName = "val")
+
+    assert(unpivoted.schema === StructType(Seq(
+      StructField("primary", IntegerType, nullable = false),
+      StructField("secondary", StringType, nullable = true),
+      StructField("var", StringType, nullable = false),
+      StructField("val", LongType, nullable = true))))
+
+    checkAnswer(unpivoted, wideDataDs.collect().flatMap { row =>
+      Seq(
+        Row(
+          row.id * 10,
+          row.str1,
+          "sum",
+          // sum of int1 and long1 when both are set, or null otherwise
+          row.int1.flatMap(i => row.long1.map(l => i + l)).orNull),
+        Row(
+          row.id * 10,
+          row.str1,
+          "len",
+          // length of str2 if set, or null otherwise
+          Option(row.str2).map(_.length).orNull)
+      )
+    })
+  }
+
+  test("unpivot with variable / value columns") {
+    // value columns have the same names as the variable and value columns (`var` and `val`)
+    val unpivoted = wideDataDs
+      .withColumnRenamed("str1", "var")
+      .withColumnRenamed("str2", "val")
+      .unpivot(
+        Array($"id"),
+        Array($"var", $"val"),
+        variableColumnName = "var",
+        valueColumnName = "val")
+    checkAnswer(unpivoted, longDataRows.map(row => Row(
+      row.getInt(0),
+      row.getString(1) match {
+        case "str1" => "var"
+        case "str2" => "val"
+      },
+      row.getString(2))))
+  }
+
+  test("unpivot with incompatible value types") {
+    val e = intercept[AnalysisException] {
+      wideDataDs.unpivot(
+        Array($"id"),
+        Array($"str1", $"int1"),
+        variableColumnName = "var",
+        valueColumnName = "val"
+      )
+    }
+    checkErrorClass(
+      exception = e,
+      errorClass = "UNPIVOT_VALUE_DATA_TYPE_MISMATCH",
+      msg = "Unpivot value columns must share a least common type, " +
+        "some types do not: \\[\"STRING\", \"INT\"\\];(\n.*)*",
+      matchMsg = true)
+  }
+
+  test("unpivot with compatible value types") {
+    val unpivoted = wideDataDs.unpivot(
+      Array($"id"),
+      Array($"int1", $"long1"),
+      variableColumnName = "var",
+      valueColumnName = "val")
+    assert(unpivoted.schema === StructType(Seq(
+      StructField("id", IntegerType, nullable = false),
+      StructField("var", StringType, nullable = false),
+      StructField("val", LongType, nullable = true)
+    )))
+
+    val unpivotedRows = Seq(
+      Row(1, "int1", 1L),
+      Row(1, "long1", 1L),
+      Row(2, "int1", null),
+      Row(2, "long1", 2L),
+      Row(3, "int1", 3L),
+      Row(3, "long1", null),
+      Row(4, "int1", null),
+      Row(4, "long1", null)
+    )
+    checkAnswer(unpivoted, unpivotedRows)
+  }
+
+  test("unpivot and drop nulls") {
+    checkAnswer(
+      wideDataDs
+        .unpivot(Array($"id"), Array($"str1", $"str2"), "var", "val")
+        .where($"val".isNotNull),
+      longDataRows.filter(_.getString(2) != null))
+  }
+
+  test("unpivot with invalid arguments") {
+    // unpivoting where id column does not exist
+    val e1 = intercept[AnalysisException] {
+      wideDataDs.unpivot(
+        Array($"1", $"2"),
+        Array($"str1", $"str2"),
+        variableColumnName = "var",
+        valueColumnName = "val"
+      )
+    }
+    checkErrorClass(
+      exception = e1,
+      errorClass = "UNRESOLVED_COLUMN",
+      msg = "A column or function parameter with name `1` cannot be 
resolved\\. " +
+        "Did you mean one of the following\\? \\[`id`, `int1`, `str1`, `str2`, 
`long1`\\];(\n.*)*",
+      matchMsg = true)
+
+    // unpivoting where value column does not exist
+    val e2 = intercept[AnalysisException] {
+      wideDataDs.unpivot(
+        Array($"id"),
+        Array($"does", $"not", $"exist"),
+        variableColumnName = "var",
+        valueColumnName = "val"
+      )
+    }
+    checkErrorClass(
+      exception = e2,
+      errorClass = "UNRESOLVED_COLUMN",
+      msg = "A column or function parameter with name `does` cannot be 
resolved\\. " +
+        "Did you mean one of the following\\? \\[`id`, `int1`, `long1`, 
`str1`, `str2`\\];(\n.*)*",
+      matchMsg = true)
+
+    // unpivoting with empty list of value columns
+    // where potential value columns are of incompatible types
+    val e3 = intercept[AnalysisException] {
+      wideDataDs.unpivot(
+        Array.empty,
+        Array.empty,
+        variableColumnName = "var",
+        valueColumnName = "val"
+      )
+    }
+    checkErrorClass(
+      exception = e3,
+      errorClass = "UNPIVOT_VALUE_DATA_TYPE_MISMATCH",
+      msg = "Unpivot value columns must share a least common type, " +
+        "some types do not: \\[\"INT\", \"STRING\", \"BIGINT\"\\];(\n.*)*",
+      matchMsg = true)
+
+    // unpivoting with star id columns so that no value columns are left
+    val e4 = intercept[AnalysisException] {
+      wideDataDs.unpivot(
+        Array($"*"),
+        Array.empty,
+        variableColumnName = "var",
+        valueColumnName = "val"
+      )
+    }
+    checkErrorClass(
+      exception = e4,
+      errorClass = "UNPIVOT_REQUIRES_VALUE_COLUMNS",
+      msg = "At least one value column needs to be specified for UNPIVOT, " +
+        "all columns specified as ids;(\\n.*)*",
+      matchMsg = true)
+
+    // unpivoting with star value columns
+    // where potential value columns are of incompatible types
+    val e5 = intercept[AnalysisException] {
+      wideDataDs.unpivot(
+        Array.empty,
+        Array($"*"),
+        variableColumnName = "var",
+        valueColumnName = "val"
+      )
+    }
+    checkErrorClass(
+      exception = e5,
+      errorClass = "UNPIVOT_VALUE_DATA_TYPE_MISMATCH",
+      msg = "Unpivot value columns must share a least common type, " +
+        "some types do not: \\[\"INT\", \"STRING\", \"BIGINT\"\\];(\n.*)*",
+      matchMsg = true)
+
+    // unpivoting without giving values and no non-id columns
+    val e6 = intercept[AnalysisException] {
+      wideDataDs.select($"id", $"str1", $"str2").unpivot(
+        Array($"id", $"str1", $"str2"),
+        Array.empty,
+        variableColumnName = "var",
+        valueColumnName = "val"
+      )
+    }
+    checkErrorClass(
+      exception = e6,
+      errorClass = "UNPIVOT_REQUIRES_VALUE_COLUMNS",
+      msg = "At least one value column needs to be specified for UNPIVOT, " +
+        "all columns specified as ids;(\\n.*)*",
+      matchMsg = true)
+  }
+
+  test("unpivot after pivot") {
+    // see test "pivot courses" in DataFramePivotSuite
+    val pivoted = courseSales.groupBy("year").pivot("course", Array("dotNET", 
"Java"))
+      .agg(sum($"earnings"))
+    val unpivoted = pivoted.unpivot(Array($"year"), "course", "earnings")
+    val expected = courseSales.groupBy("year", "course").sum("earnings")
+    checkAnswer(unpivoted, expected)
+  }
+
+  test("unpivot of unpivot") {
+    checkAnswer(
+      wideDataDs
+        .unpivot(Array($"id"), Array($"str1", $"str2"), "var", "val")
+        .unpivot(Array($"id"), Array($"var", $"val"), "col", "value"),
+      longDataRows.flatMap(row => Seq(
+        Row(row.getInt(0), "var", row.getString(1)),
+        Row(row.getInt(0), "val", row.getString(2)))))
+  }
+
+  test("unpivot with dot and backtick") {
+    val ds = wideDataDs
+      .withColumnRenamed("id", "an.id")
+      .withColumnRenamed("str1", "str.one")
+      .withColumnRenamed("str2", "str.two")
+
+    val unpivoted = ds.unpivot(
+        Array($"`an.id`"),
+        Array($"`str.one`", $"`str.two`"),
+        variableColumnName = "var",
+        valueColumnName = "val")
+    checkAnswer(unpivoted, longDataRows.map(row => Row(
+        row.getInt(0),
+        row.getString(1) match {
+          case "str1" => "str.one"
+          case "str2" => "str.two"
+        },
+        row.getString(2))))
+
+    // without backticks, this references struct fields, which do not exist
+    val e = intercept[AnalysisException] {
+      ds.unpivot(
+        Array($"an.id"),
+        Array($"str.one", $"str.two"),
+        variableColumnName = "var",
+        valueColumnName = "val"
+      )
+    }
+    checkErrorClass(
+      exception = e,
+      errorClass = "UNRESOLVED_COLUMN",
+      // expected message is wrong: https://issues.apache.org/jira/browse/SPARK-39783
+      msg = "A column or function parameter with name `an`\\.`id` cannot be 
resolved\\. " +
+        "Did you mean one of the following\\? " +
+        "\\[`an`.`id`, `int1`, `long1`, `str`.`one`, `str`.`two`\\];(\n.*)*",
+      matchMsg = true)
+  }
+
+  test("unpivot with struct fields") {
+    checkAnswer(
+      wideStructDataDs.unpivot(
+        Array($"an.id"),
+        Array($"str.one", $"str.two"),
+        "var",
+        "val"),
+      longStructDataRows)
+  }
+
+  test("unpivot with struct ids star") {
+    checkAnswer(
+      wideStructDataDs.unpivot(
+        Array($"an.*"),
+        Array($"str.one", $"str.two"),
+        "var",
+        "val"),
+      longStructDataRows)
+  }
+
+  test("unpivot with struct values star") {
+    checkAnswer(
+      wideStructDataDs.unpivot(
+        Array($"an.id"),
+        Array($"str.*"),
+        "var",
+        "val"),
+      longStructDataRows)
+  }
+}

Review Comment:
   added: 666cbbb6ef663b93aaef347e7c77443681d280b6



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] EnricoMi commented on a diff in pull request #36150: [SPARK-38864][SQL] Add unpivot / melt to Dataset

Reply via email to