spark git commit: [SPARK-13981][SQL] Defer evaluating variables within Filter operator.

rxin Mon, 28 Mar 2016 20:33:24 -0700

Repository: spark
Updated Branches:
  refs/heads/master 27d4ef0c6 -> 4a55c3363



[SPARK-13981][SQL] Defer evaluating variables within Filter operator.

## What changes were proposed in this pull request?

This improves the Filter codegen for NULLs by deferring the loading of values
for attributes checked by IsNotNull until after the null check has passed.
Instead of generating code like:

boolean isNull = ...
int value = ...
if (isNull) continue;

we will generate:
boolean isNull = ...
if (isNull) continue;
int value = ...

This is useful since retrieving the values can be non-trivial (they can be
dictionary encoded, among other things). This currently only works when the
attribute comes from the column batch, but it could be extended to other cases
in the future.

## How was this patch tested?

On tpcds q55, this fixes the regression from introducing the IsNotNull 
predicates.

```
TPCDS Snappy:                       Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)
--------------------------------------------------------------------------------
q55                                      4564 / 5036         25.2          39.6
q55                                      4064 / 4340         28.3          35.3
```

Author: Nong Li <[email protected]>

Closes #11792 from nongli/spark-13981.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4a55c336
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4a55c336
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4a55c336

Branch: refs/heads/master
Commit: 4a55c336397d3f138c6f5735675ec7cb272827f5
Parents: 27d4ef0
Author: Nong Li <[email protected]>
Authored: Mon Mar 28 20:32:58 2016 -0700
Committer: Reynold Xin <[email protected]>
Committed: Mon Mar 28 20:32:58 2016 -0700

----------------------------------------------------------------------
 .../spark/sql/execution/basicOperators.scala    | 77 ++++++++++++++++----
 1 file changed, 61 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/4a55c336/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index 70e04d0..fca6627 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, 
ExprCode, ExpressionCanonicalizer}
 import org.apache.spark.sql.catalyst.plans.physical._
-import org.apache.spark.sql.execution.metric.{LongSQLMetricValue, SQLMetrics}
+import org.apache.spark.sql.execution.metric.SQLMetrics
 import org.apache.spark.sql.types.LongType
 import org.apache.spark.util.random.PoissonSampler
 
@@ -79,16 +79,20 @@ case class Filter(condition: Expression, child: SparkPlan)
 
   // Split out all the IsNotNulls from condition.
   private val (notNullPreds, otherPreds) = 
splitConjunctivePredicates(condition).partition {
-    case IsNotNull(a) if child.output.contains(a) => true
+    case IsNotNull(a) if child.output.exists(_.semanticEquals(a)) => true
     case _ => false
   }
 
   // The columns that will be filtered out by `IsNotNull` could be considered as 
not nullable.
   private val notNullAttributes = notNullPreds.flatMap(_.references)
 
+  // Mark this as empty. We'll evaluate the input during doConsume(). We don't 
want to evaluate
+  // all the variables at the beginning to take advantage of short circuiting.
+  override def usedInputs: AttributeSet = AttributeSet.empty
+
   override def output: Seq[Attribute] = {
     child.output.map { a =>
-      if (a.nullable && notNullAttributes.contains(a)) {
+      if (a.nullable && notNullAttributes.exists(_.semanticEquals(a))) {
         a.withNullability(false)
       } else {
         a
@@ -110,39 +114,80 @@ case class Filter(condition: Expression, child: SparkPlan)
   override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: 
ExprCode): String = {
     val numOutput = metricTerm(ctx, "numOutputRows")
 
-    // filter out the nulls
-    val filterOutNull = notNullAttributes.map { a =>
-      val idx = child.output.indexOf(a)
-      s"if (${input(idx).isNull}) continue;"
-    }.mkString("\n")
+    /**
+     * Generates code for `c`, using `in` for input attributes and `attrs` for 
nullability.
+     */
+    def genPredicate(c: Expression, in: Seq[ExprCode], attrs: Seq[Attribute]): 
String = {
+      val bound = BindReferences.bindReference(c, attrs)
+      val evaluated = evaluateRequiredVariables(child.output, in, c.references)
 
-    ctx.currentVars = input
-    val predicates = otherPreds.map { e =>
-      val bound = ExpressionCanonicalizer.execute(
-        BindReferences.bindReference(e, output))
-      val ev = bound.gen(ctx)
+      // Generate the code for the predicate.
+      val ev = ExpressionCanonicalizer.execute(bound).gen(ctx)
       val nullCheck = if (bound.nullable) {
         s"${ev.isNull} || "
       } else {
         s""
       }
+
       s"""
+         |$evaluated
          |${ev.code}
          |if (${nullCheck}!${ev.value}) continue;
        """.stripMargin
+    }
+
+    ctx.currentVars = input
+
+    // To generate the predicates we will follow this algorithm.
+    // For each predicate that is not IsNotNull, we will generate them one by 
one loading attributes
+    // as necessary. For each of those attributes, if there is an IsNotNull 
predicate we will generate
+    // that check *before* the predicate. After all of these predicates, we 
will generate the
+    // remaining IsNotNull checks that were not part of other predicates.
+    // This has the property of not doing redundant IsNotNull checks and 
taking better advantage of
+    // short-circuiting, not loading attributes until they are needed.
+    // This is very perf sensitive.
+    // TODO: revisit this. We can consider reordering predicates as well.
+    val generatedIsNotNullChecks = new Array[Boolean](notNullPreds.length)
+    val generated = otherPreds.map { c =>
+      val nullChecks = c.references.map { r =>
+        val idx = notNullPreds.indexWhere { n => 
n.asInstanceOf[IsNotNull].child.semanticEquals(r)}
+        if (idx != -1 && !generatedIsNotNullChecks(idx)) {
+          generatedIsNotNullChecks(idx) = true
+          // Use the child's output. The nullability is what the child 
produced.
+          genPredicate(notNullPreds(idx), input, child.output)
+        } else {
+          ""
+        }
+      }.mkString("\n").trim
+
+      // Here we use *this* operator's output with this output's nullability 
since we already
+      // enforced them with the IsNotNull checks above.
+      s"""
+         |$nullChecks
+         |${genPredicate(c, input, output)}
+       """.stripMargin.trim
+    }.mkString("\n")
+
+    val nullChecks = notNullPreds.zipWithIndex.map { case (c, idx) =>
+      if (!generatedIsNotNullChecks(idx)) {
+        genPredicate(c, input, child.output)
+      } else {
+        ""
+      }
     }.mkString("\n")
 
     // Reset the isNull to false for the not-null columns, then the followed 
operators could
     // generate better code (remove dead branches).
     val resultVars = input.zipWithIndex.map { case (ev, i) =>
-      if (notNullAttributes.contains(child.output(i))) {
+      if (notNullAttributes.exists(_.semanticEquals(child.output(i)))) {
         ev.isNull = "false"
       }
       ev
     }
+
     s"""
-       |$filterOutNull
-       |$predicates
+       |$generated
+       |$nullChecks
        |$numOutput.add(1);
        |${consume(ctx, resultVars)}
      """.stripMargin


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-13981][SQL] Defer evaluating variables within Filter operator.

Reply via email to