Repository: spark
Updated Branches:
refs/heads/master c22810004 -> 3cb3ccce1
[SPARK-21196] Split codegen info of query plan into sequence
codegen info of query plan can be very long.
In the debugging console or on a web page, it would be more readable if the
subtrees and their corresponding codegen were split into a sequence.
Example:
```scala
codegenStringSeq(sql("select 1").queryExecution.executedPlan)
```
The example will return a Seq[(String, String)] of length 1, containing the
subtree as a string and the corresponding generated code.
The subtree as string:
> *Project [1 AS 1#0]
> +- Scan OneRowRelation[]
The generated code:
```java
/* 001 */ public Object generate(Object[] references) {
/* 002 */ return new GeneratedIterator(references);
/* 003 */ }
/* 004 */
/* 005 */ final class GeneratedIterator extends
org.apache.spark.sql.execution.BufferedRowIterator {
/* 006 */ private Object[] references;
/* 007 */ private scala.collection.Iterator[] inputs;
/* 008 */ private scala.collection.Iterator inputadapter_input;
/* 009 */ private UnsafeRow project_result;
/* 010 */ private
org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder project_holder;
/* 011 */ private
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter
project_rowWriter;
/* 012 */
/* 013 */ public GeneratedIterator(Object[] references) {
/* 014 */ this.references = references;
/* 015 */ }
/* 016 */
/* 017 */ public void init(int index, scala.collection.Iterator[] inputs) {
/* 018 */ partitionIndex = index;
/* 019 */ this.inputs = inputs;
/* 020 */ inputadapter_input = inputs[0];
/* 021 */ project_result = new UnsafeRow(1);
/* 022 */ project_holder = new
org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(project_result,
0);
/* 023 */ project_rowWriter = new
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(project_holder,
1);
/* 024 */
/* 025 */ }
/* 026 */
/* 027 */ protected void processNext() throws java.io.IOException {
/* 028 */ while (inputadapter_input.hasNext() && !stopEarly()) {
/* 029 */ InternalRow inputadapter_row = (InternalRow)
inputadapter_input.next();
/* 030 */ project_rowWriter.write(0, 1);
/* 031 */ append(project_result);
/* 032 */ if (shouldStop()) return;
/* 033 */ }
/* 034 */ }
/* 035 */
/* 036 */ }
```
## What changes were proposed in this pull request?
Add a method codegenToSeq that splits the codegen info of a query plan into a sequence.
## How was this patch tested?
unit test
cloud-fan gatorsmile
Please review http://spark.apache.org/contributing.html before opening a pull
request.
Author: Wang Gengliang <[email protected]>
Closes #18409 from gengliangwang/codegen.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3cb3ccce
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3cb3ccce
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3cb3ccce
Branch: refs/heads/master
Commit: 3cb3ccce120fa9f0273133912624b877b42d95fd
Parents: c228100
Author: Wang Gengliang <[email protected]>
Authored: Tue Jun 27 17:24:46 2017 +0800
Committer: Wenchen Fan <[email protected]>
Committed: Tue Jun 27 17:24:46 2017 +0800
----------------------------------------------------------------------
.../spark/sql/execution/QueryExecution.scala | 9 +++++
.../spark/sql/execution/debug/package.scala | 35 +++++++++++++++-----
.../sql/execution/debug/DebuggingSuite.scala | 7 ++++
3 files changed, 43 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/3cb3ccce/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
----------------------------------------------------------------------
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
index c7cac33..9533144 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
@@ -245,5 +245,14 @@ class QueryExecution(val sparkSession: SparkSession, val
logical: LogicalPlan) {
println(org.apache.spark.sql.execution.debug.codegenString(executedPlan))
// scalastyle:on println
}
+
+ /**
+ * Get WholeStageCodegenExec subtrees and the codegen in a query plan
+ *
+ * @return Sequence of WholeStageCodegen subtrees and corresponding codegen
+ */
+ def codegenToSeq(): Seq[(String, String)] = {
+ org.apache.spark.sql.execution.debug.codegenStringSeq(executedPlan)
+ }
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/3cb3ccce/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
----------------------------------------------------------------------
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index 0395c43..a717cbd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -50,7 +50,31 @@ package object debug {
// scalastyle:on println
}
+ /**
+ * Get WholeStageCodegenExec subtrees and the codegen in a query plan into
one String
+ *
+ * @param plan the query plan for codegen
+ * @return single String containing all WholeStageCodegen subtrees and
corresponding codegen
+ */
def codegenString(plan: SparkPlan): String = {
+ val codegenSeq = codegenStringSeq(plan)
+ var output = s"Found ${codegenSeq.size} WholeStageCodegen subtrees.\n"
+ for (((subtree, code), i) <- codegenSeq.zipWithIndex) {
+ output += s"== Subtree ${i + 1} / ${codegenSeq.size} ==\n"
+ output += subtree
+ output += "\nGenerated code:\n"
+ output += s"${code}\n"
+ }
+ output
+ }
+
+ /**
+ * Get WholeStageCodegenExec subtrees and the codegen in a query plan
+ *
+ * @param plan the query plan for codegen
+ * @return Sequence of WholeStageCodegen subtrees and corresponding codegen
+ */
+ def codegenStringSeq(plan: SparkPlan): Seq[(String, String)] = {
val codegenSubtrees = new
collection.mutable.HashSet[WholeStageCodegenExec]()
plan transform {
case s: WholeStageCodegenExec =>
@@ -58,15 +82,10 @@ package object debug {
s
case s => s
}
- var output = s"Found ${codegenSubtrees.size} WholeStageCodegen subtrees.\n"
- for ((s, i) <- codegenSubtrees.toSeq.zipWithIndex) {
- output += s"== Subtree ${i + 1} / ${codegenSubtrees.size} ==\n"
- output += s
- output += "\nGenerated code:\n"
- val (_, source) = s.doCodeGen()
- output += s"${CodeFormatter.format(source)}\n"
+ codegenSubtrees.toSeq.map { subtree =>
+ val (_, source) = subtree.doCodeGen()
+ (subtree.toString, CodeFormatter.format(source))
}
- output
}
/**
http://git-wip-us.apache.org/repos/asf/spark/blob/3cb3ccce/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala
----------------------------------------------------------------------
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala
index 4fc52c9..adcaf2d 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala
@@ -38,4 +38,11 @@ class DebuggingSuite extends SparkFunSuite with
SharedSQLContext {
assert(res.contains("Subtree 2 / 2"))
assert(res.contains("Object[]"))
}
+
+ test("debugCodegenStringSeq") {
+ val res =
codegenStringSeq(spark.range(10).groupBy("id").count().queryExecution.executedPlan)
+ assert(res.length == 2)
+ assert(res.forall{ case (subtree, code) =>
+ subtree.contains("Range") && code.contains("Object[]")})
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]