spark git commit: [SPARK-7993] [SQL] Improved DataFrame.show() output

rxin Fri, 12 Jun 2015 10:41:55 -0700

Repository: spark
Updated Branches:
  refs/heads/master 71cc17bfa -> 19834fa91



[SPARK-7993] [SQL] Improved DataFrame.show() output

Closes #6633

Author: akhilthatipamula <[email protected]>
Author: zsxwing <[email protected]>

Closes #6784 from zsxwing/pr6633 and squashes the following commits:

5da1c51 [zsxwing] Address comments and add unit tests
17eab7b [akhilthatipamula] refactored code
19874b3 [akhilthatipamula] Update DataFrame.scala
0a76a5e [akhilthatipamula] Optimised showString()
e3dd03f [akhilthatipamula] Modified showString() method
a21012b [akhilthatipamula] improved the show()
4bb742f [akhilthatipamula] Modified dataframe.show() method


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/19834fa9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/19834fa9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/19834fa9

Branch: refs/heads/master
Commit: 19834fa9184f0365a160bcb54bcd33eaa87c70dc
Parents: 71cc17b
Author: akhilthatipamula <[email protected]>
Authored: Fri Jun 12 10:40:28 2015 -0700
Committer: Reynold Xin <[email protected]>
Committed: Fri Jun 12 10:40:28 2015 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/sql/DataFrame.scala  | 30 +++++++++---
 .../org/apache/spark/sql/DataFrameSuite.scala   | 51 ++++++++++++++++++++
 2 files changed, 75 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/19834fa9/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 59f64dd..f041fd3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -169,23 +169,34 @@ class DataFrame private[sql](
 
   /**
    * Internal API for Python
-   * @param numRows Number of rows to show
+   * @param _numRows Number of rows to show
    */
-  private[sql] def showString(numRows: Int): String = {
+  private[sql] def showString(_numRows: Int): String = {
+    val numRows = _numRows.max(0)
     val sb = new StringBuilder
-    val data = take(numRows)
+    val takeResult = take(numRows + 1)
+    val hasMoreData = takeResult.length > numRows
+    val data = takeResult.take(numRows)
     val numCols = schema.fieldNames.length
 
+    // For array values, replace Seq and Array with square brackets
     // For cells that are beyond 20 characters, replace it with the first 17 and "..."
     val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row =>
       row.toSeq.map { cell =>
-        val str = if (cell == null) "null" else cell.toString
+        val str = cell match {
+          case null => "null"
+          case array: Array[_] => array.mkString("[", ", ", "]")
+          case seq: Seq[_] => seq.mkString("[", ", ", "]")
+          case _ => cell.toString
+        }
         if (str.length > 20) str.substring(0, 17) + "..." else str
       }: Seq[String]
     }
 
+    // Initialise the width of each column to a minimum value of '3'
+    val colWidths = Array.fill(numCols)(3)
+
     // Compute the width of each column
-    val colWidths = Array.fill(numCols)(0)
     for (row <- rows) {
       for ((cell, i) <- row.zipWithIndex) {
         colWidths(i) = math.max(colWidths(i), cell.length)
@@ -197,7 +208,7 @@ class DataFrame private[sql](
 
     // column names
     rows.head.zipWithIndex.map { case (cell, i) =>
-      StringUtils.leftPad(cell.toString, colWidths(i))
+      StringUtils.leftPad(cell, colWidths(i))
     }.addString(sb, "|", "|", "|\n")
 
     sb.append(sep)
@@ -210,6 +221,13 @@ class DataFrame private[sql](
     }
 
     sb.append(sep)
+
+    // For Data that has more than "numRows" records
+    if (hasMoreData) {
+      val rowsString = if (numRows == 1) "row" else "rows"
+      sb.append(s"only showing top $numRows ${rowsString}\n")
+    }
+
     sb.toString()
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/19834fa9/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index bb8621a..84835c0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -469,12 +469,63 @@ class DataFrameSuite extends QueryTest {
     testData.select($"*").show(1000)
   }
 
+  test("showString(negative)") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           |+---+-----+
+                           |only showing top 0 rows
+                           |""".stripMargin
+    assert(testData.select($"*").showString(-1) === expectedAnswer)
+  }
+
+  test("showString(0)") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           |+---+-----+
+                           |only showing top 0 rows
+                           |""".stripMargin
+    assert(testData.select($"*").showString(0) === expectedAnswer)
+  }
+
+  test("showString: array") {
+    val df = Seq(
+      (Array(1, 2, 3), Array(1, 2, 3)),
+      (Array(2, 3, 4), Array(2, 3, 4))
+    ).toDF()
+    val expectedAnswer = """+---------+---------+
+                           ||       _1|       _2|
+                           |+---------+---------+
+                           ||[1, 2, 3]|[1, 2, 3]|
+                           ||[2, 3, 4]|[2, 3, 4]|
+                           |+---------+---------+
+                           |""".stripMargin
+    assert(df.showString(10) === expectedAnswer)
+  }
+
+  test("showString: minimum column width") {
+    val df = Seq(
+      (1, 1),
+      (2, 2)
+    ).toDF()
+    val expectedAnswer = """+---+---+
+                           || _1| _2|
+                           |+---+---+
+                           ||  1|  1|
+                           ||  2|  2|
+                           |+---+---+
+                           |""".stripMargin
+    assert(df.showString(10) === expectedAnswer)
+  }
+
   test("SPARK-7319 showString") {
     val expectedAnswer = """+---+-----+
                            ||key|value|
                            |+---+-----+
                            ||  1|    1|
                            |+---+-----+
+                           |only showing top 1 row
                            |""".stripMargin
     assert(testData.select($"*").showString(1) === expectedAnswer)
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-7993] [SQL] Improved DataFrame.show() output

Reply via email to