spark git commit: [SPARK-12398] Smart truncation of DataFrame / Dataset toString

marmbrus Mon, 21 Dec 2015 12:46:39 -0800

Repository: spark
Updated Branches:
  refs/heads/master 1920d72a1 -> 474eb21a3



[SPARK-12398] Smart truncation of DataFrame / Dataset toString

When a DataFrame or Dataset has a long schema, we should intelligently truncate 
to avoid flooding the screen with unreadable information.
// Standard output
[a: int, b: int]

// Truncate many top level fields
[a: int, b: string ... 10 more fields]

// Truncate long inner structs
[a: struct<a: int ... 10 more fields>]

Author: Dilip Biswal <dbis...@us.ibm.com>

Closes #10373 from dilipbiswal/spark-12398.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/474eb21a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/474eb21a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/474eb21a

Branch: refs/heads/master
Commit: 474eb21a30f7ee898f76a625a5470c8245af1d22
Parents: 1920d72
Author: Dilip Biswal <dbis...@us.ibm.com>
Authored: Mon Dec 21 12:46:06 2015 -0800
Committer: Michael Armbrust <mich...@databricks.com>
Committed: Mon Dec 21 12:46:06 2015 -0800

----------------------------------------------------------------------
 .../org/apache/spark/sql/types/DataType.scala   |  3 ++
 .../org/apache/spark/sql/types/StructType.scala | 17 +++++++++
 .../apache/spark/sql/execution/Queryable.scala  | 15 +++++++-
 .../org/apache/spark/sql/DataFrameSuite.scala   | 39 ++++++++++++++++++++
 4 files changed, 73 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/474eb21a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 4b54c31..b0c43c4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -66,6 +66,9 @@ abstract class DataType extends AbstractDataType {
   /** Readable string representation for the type. */
   def simpleString: String = typeName
 
+  /** Readable string representation for the type with truncation */
+  private[sql] def simpleString(maxNumberFields: Int): String = simpleString
+
   /**
    * Check if `this` and `other` are the same data type when ignoring 
nullability
    * (`StructField.nullable`, `ArrayType.containsNull`, and 
`MapType.valueContainsNull`).

http://git-wip-us.apache.org/repos/asf/spark/blob/474eb21a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index 9778df2..d568022 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -278,6 +278,23 @@ case class StructType(fields: Array[StructField]) extends 
DataType with Seq[Stru
     s"struct<${fieldTypes.mkString(",")}>"
   }
 
+  private[sql] override def simpleString(maxNumberFields: Int): String = {
+    val builder = new StringBuilder
+    val fieldTypes = fields.take(maxNumberFields).map {
+      case f => s"${f.name}: ${f.dataType.simpleString(maxNumberFields)}"
+    }
+    builder.append("struct<")
+    builder.append(fieldTypes.mkString(", "))
+    if (fields.length > 2) {
+      if (fields.length - fieldTypes.size == 1) {
+        builder.append(" ... 1 more field")
+      } else {
+        builder.append(" ... " + (fields.length - 2) + " more fields")
+      }
+    }
+    builder.append(">").toString()
+  }
+
   /**
    * Merges with another schema (`StructType`).  For a struct field A from 
`this` and a struct field
    * B from `that`,

http://git-wip-us.apache.org/repos/asf/spark/blob/474eb21a/sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala
index b397d42..3f391fd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala
@@ -31,7 +31,20 @@ private[sql] trait Queryable {
 
   override def toString: String = {
     try {
-      schema.map(f => s"${f.name}: ${f.dataType.simpleString}").mkString("[", 
", ", "]")
+      val builder = new StringBuilder
+      val fields = schema.take(2).map {
+        case f => s"${f.name}: ${f.dataType.simpleString(2)}"
+      }
+      builder.append("[")
+      builder.append(fields.mkString(", "))
+      if (schema.length > 2) {
+        if (schema.length - fields.size == 1) {
+          builder.append(" ... 1 more field")
+        } else {
+          builder.append(" ... " + (schema.length - 2) + " more fields")
+        }
+      }
+      builder.append("]").toString()
     } catch {
       case NonFatal(e) =>
         s"Invalid tree; ${e.getMessage}:\n$queryExecution"

http://git-wip-us.apache.org/repos/asf/spark/blob/474eb21a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 4c3e12a..1a0f1b6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -1177,4 +1177,43 @@ class DataFrameSuite extends QueryTest with 
SharedSQLContext {
     val primitiveUDF = udf((i: Int) => i * 2)
     checkAnswer(df.select(primitiveUDF($"age")), Row(44) :: Row(null) :: Nil)
   }
+
+  test("SPARK-12398 truncated toString") {
+    val df1 = Seq((1L, "row1")).toDF("id", "name")
+    assert(df1.toString() === "[id: bigint, name: string]")
+
+    val df2 = Seq((1L, "c2", false)).toDF("c1", "c2", "c3")
+    assert(df2.toString === "[c1: bigint, c2: string ... 1 more field]")
+
+    val df3 = Seq((1L, "c2", false, 10)).toDF("c1", "c2", "c3", "c4")
+    assert(df3.toString === "[c1: bigint, c2: string ... 2 more fields]")
+
+    val df4 = Seq((1L, Tuple2(1L, "val"))).toDF("c1", "c2")
+    assert(df4.toString === "[c1: bigint, c2: struct<_1: bigint, _2: string>]")
+
+    val df5 = Seq((1L, Tuple2(1L, "val"), 20.0)).toDF("c1", "c2", "c3")
+    assert(df5.toString === "[c1: bigint, c2: struct<_1: bigint, _2: string> 
... 1 more field]")
+
+    val df6 = Seq((1L, Tuple2(1L, "val"), 20.0, 1)).toDF("c1", "c2", "c3", 
"c4")
+    assert(df6.toString === "[c1: bigint, c2: struct<_1: bigint, _2: string> 
... 2 more fields]")
+
+    val df7 = Seq((1L, Tuple3(1L, "val", 2), 20.0, 1)).toDF("c1", "c2", "c3", 
"c4")
+    assert(
+      df7.toString ===
+        "[c1: bigint, c2: struct<_1: bigint, _2: string ... 1 more field> ... 
2 more fields]")
+
+    val df8 = Seq((1L, Tuple7(1L, "val", 2, 3, 4, 5, 6), 20.0, 1)).toDF("c1", 
"c2", "c3", "c4")
+    assert(
+      df8.toString ===
+        "[c1: bigint, c2: struct<_1: bigint, _2: string ... 5 more fields> ... 
2 more fields]")
+
+    val df9 =
+      Seq((1L, Tuple4(1L, Tuple4(1L, 2L, 3L, 4L), 2L, 3L), 20.0, 
1)).toDF("c1", "c2", "c3", "c4")
+    assert(
+      df9.toString ===
+        "[c1: bigint, c2: struct<_1: bigint," +
+          " _2: struct<_1: bigint," +
+          " _2: bigint ... 2 more fields> ... 2 more fields> ... 2 more 
fields]")
+
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-12398] Smart truncation of DataFrame / Dataset toString

Reply via email to