Repository: spark
Updated Branches:
  refs/heads/master fe6de16f7 -> e7082caeb


[SPARK-15550][SQL] Dataset.show() should show contents of nested products as rows

## What changes were proposed in this pull request?

This PR addresses two related issues:

1. `Dataset.showString()` should show case classes/Java beans at all levels as 
rows, while the current master code only handles top-level ones.

2. `Dataset.showString()` should show the full contents produced by the 
underlying query plan

   A Dataset is only a view of the underlying query plan. Columns not 
referenced by the encoder are still reachable using methods like `Dataset.col`, 
so it probably makes more sense to show the full contents of the query plan 
(see the sketch after this list).
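
   A rough illustration of both points (a minimal sketch, not part of this 
patch; `Inner`/`Outer` are hypothetical stand-ins for the `ClassData`/`NestedStruct` 
classes used in the new tests, and `spark.implicits._` is assumed to be in scope):

   ```scala
   case class Inner(a: String, b: Int)
   case class Outer(f: Inner)

   // 1. Nested products are rendered as rows at every level:
   //    column `f` shows up as "[foo,1]" rather than the object's toString.
   Seq(Outer(Inner("foo", 1)), Outer(Inner("bar", 2))).toDS().show()

   // 2. The Dataset is only a view of the underlying plan, so a column the
   //    encoder never references (`c` here) is still displayed and still
   //    reachable through Dataset.col.
   val df = Seq((1, "foo", "extra"), (2, "bar", "extra")).toDF("b", "a", "c")
   val ds = df.as[Inner]          // the encoder only needs `a` and `b`
   ds.show()                      // still shows columns b, a, and c
   ds.select(ds.col("c")).show()  // `c` remains addressable
   ```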

## How was this patch tested?

Two new test cases are added in `DatasetSuite` to check `.showString()` output.

Author: Cheng Lian <l...@databricks.com>

Closes #13331 from liancheng/spark-15550-ds-show.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7082cae
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7082cae
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7082cae

Branch: refs/heads/master
Commit: e7082caeb4a53c1ee172d136894eece1ac880f65
Parents: fe6de16
Author: Cheng Lian <l...@databricks.com>
Authored: Thu May 26 16:23:48 2016 -0700
Committer: Cheng Lian <l...@databricks.com>
Committed: Thu May 26 16:23:48 2016 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/sql/Dataset.scala    | 10 +--
 .../org/apache/spark/sql/DatasetSuite.scala     | 68 ++++++++++++++------
 2 files changed, 52 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e7082cae/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 961ae32..85f0cf8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -237,19 +237,13 @@ class Dataset[T] private[sql](
    */
   private[sql] def showString(_numRows: Int, truncate: Boolean = true): String = {
     val numRows = _numRows.max(0)
-    val takeResult = take(numRows + 1)
+    val takeResult = toDF().take(numRows + 1)
     val hasMoreData = takeResult.length > numRows
     val data = takeResult.take(numRows)
 
     // For array values, replace Seq and Array with square brackets
     // For cells that are beyond 20 characters, replace it with the first 17 and "..."
-    val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map {
-      case r: Row => r
-      case tuple: Product => Row.fromTuple(tuple)
-      case definedByCtor: DefinedByConstructorParams =>
-        Row.fromSeq(ScalaReflection.getConstructorParameterValues(definedByCtor))
-      case o => Row(o)
-    }.map { row =>
+    val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row =>
       row.toSeq.map { cell =>
         val str = cell match {
           case null => "null"

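The key change above is collecting through the untyped DataFrame: `toDF().take(...)` 
returns generic `Row`s whose schema matches the full logical plan, so the old pattern 
match over `Product`/`DefinedByConstructorParams` is no longer needed and nested 
structs print as rows. A minimal sketch of the difference (not part of the patch; 
assumes a `SparkSession` with its implicits imported):

```scala
case class Inner(a: String, b: Int)

val ds = Seq(Inner("foo", 1)).toDS()

ds.take(1)          // Array(Inner(foo,1)) -- typed objects
ds.toDF().take(1)   // Array([foo,1])      -- generic Rows over the plan's schema
```
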
http://git-wip-us.apache.org/repos/asf/spark/blob/e7082cae/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 05de79e..32320a6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -22,9 +22,8 @@ import java.sql.{Date, Timestamp}
 
 import scala.language.postfixOps
 
-import org.scalatest.words.MatcherWords.be
-
 import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder}
+import org.apache.spark.sql.catalyst.util.sideBySide
 import org.apache.spark.sql.execution.streaming.MemoryStream
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.test.SharedSQLContext
@@ -217,7 +216,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS()
     checkDataset(
       ds.filter(_._1 == "b").select(expr("_1").as[String]),
-      ("b"))
+      "b")
   }
 
   test("foreach") {
@@ -436,20 +435,6 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     assert(ds.toString == "[_1: int, _2: int]")
   }
 
-  test("showString: Kryo encoder") {
-    implicit val kryoEncoder = Encoders.kryo[KryoData]
-    val ds = Seq(KryoData(1), KryoData(2)).toDS()
-
-    val expectedAnswer = """+-----------+
-                           ||      value|
-                           |+-----------+
-                           ||KryoData(1)|
-                           ||KryoData(2)|
-                           |+-----------+
-                           |""".stripMargin
-    assert(ds.showString(10) === expectedAnswer)
-  }
-
   test("Kryo encoder") {
     implicit val kryoEncoder = Encoders.kryo[KryoData]
     val ds = Seq(KryoData(1), KryoData(2)).toDS()
@@ -677,7 +662,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   }
 
   test("dataset.rdd with generic case class") {
-    val ds = Seq(Generic(1, 1.0), Generic(2, 2.0)).toDS
+    val ds = Seq(Generic(1, 1.0), Generic(2, 2.0)).toDS()
     val ds2 = ds.map(g => Generic(g.id, g.value))
     assert(ds.rdd.map(r => r.id).count === 2)
     assert(ds2.rdd.map(r => r.id).count === 2)
@@ -731,6 +716,53 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     val df = Seq(1 -> 2).toDF("a", "b")
     checkAnswer(df.map(row => row)(RowEncoder(df.schema)).select("b", "a"), Row(2, 1))
   }
+
+  private def checkShowString[T](ds: Dataset[T], expected: String): Unit = {
+    val numRows = expected.split("\n").length - 4
+    val actual = ds.showString(numRows, truncate = true)
+
+    if (expected != actual) {
+      fail(
+        "Dataset.showString() gives wrong result:\n\n" + sideBySide(
+          "== Expected ==\n" + expected,
+          "== Actual ==\n" + actual
+        ).mkString("\n")
+      )
+    }
+  }
+
+  test("SPARK-15550 Dataset.show() should show contents of the underlying 
logical plan") {
+    val df = Seq((1, "foo", "extra"), (2, "bar", "extra")).toDF("b", "a", "c")
+    val ds = df.as[ClassData]
+    val expected =
+      """+---+---+-----+
+        ||  b|  a|    c|
+        |+---+---+-----+
+        ||  1|foo|extra|
+        ||  2|bar|extra|
+        |+---+---+-----+
+        |""".stripMargin
+
+    checkShowString(ds, expected)
+  }
+
+  test("SPARK-15550 Dataset.show() should show inner nested products as rows") 
{
+    val ds = Seq(
+      NestedStruct(ClassData("foo", 1)),
+      NestedStruct(ClassData("bar", 2))
+    ).toDS()
+
+    val expected =
+      """+-------+
+        ||      f|
+        |+-------+
+        ||[foo,1]|
+        ||[bar,2]|
+        |+-------+
+        |""".stripMargin
+
+    checkShowString(ds, expected)
+  }
 }
 
 case class Generic[T](id: T, value: Double)

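A note on the `checkShowString` helper added above: the expected table string 
contains three border lines plus one header line around the data rows, which is 
why the data row count is the total line count minus 4. A standalone check of 
that arithmetic (illustration only, not part of the patch):

```scala
val expected =
  """+---+---+-----+
    ||  b|  a|    c|
    |+---+---+-----+
    ||  1|foo|extra|
    ||  2|bar|extra|
    |+---+---+-----+
    |""".stripMargin

// 6 lines total (split drops the trailing empty segment) minus 4 = 2 data rows
val numRows = expected.split("\n").length - 4
assert(numRows == 2)
```
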
