Repository: spark Updated Branches: refs/heads/master aa6564f37 -> 14cf61e90
[SPARK-16331][SQL] Reduce code generation time ## What changes were proposed in this pull request? During the code generation, a `LocalRelation` often has a huge `Vector` object as `data`. In the simple example below, a `LocalRelation` has a Vector with 1000000 elements of `UnsafeRow`. ``` val numRows = 1000000 val ds = (1 to numRows).toDS().persist() benchmark.addCase("filter+reduce") { iter => ds.filter(a => (a & 1) == 0).reduce(_ + _) } ``` At `TreeNode.transformChildren`, all elements of the vector is unnecessarily iterated to check whether any children exist in the vector since `Vector` is Traversable. This part significantly increases code generation time. This patch avoids this overhead by checking the number of children before iterating all elements; `LocalRelation` does not have children since it extends `LeafNode`. The performance of the above example ``` without this patch Java HotSpot(TM) 64-Bit Server VM 1.8.0_91-b14 on Mac OS X 10.11.5 Intel(R) Core(TM) i5-5257U CPU 2.70GHz compilationTime: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ filter+reduce 4426 / 4533 0.2 4426.0 1.0X with this patch compilationTime: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ filter+reduce 3117 / 3391 0.3 3116.6 1.0X ``` ## How was this patch tested? using existing unit tests Author: Hiroshi Inoue <inoue...@jp.ibm.com> Closes #14000 from inouehrs/compilation-time-reduction. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/14cf61e9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/14cf61e9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/14cf61e9 Branch: refs/heads/master Commit: 14cf61e909598d9f6b9c3b920de7299e9bc828e0 Parents: aa6564f Author: Hiroshi Inoue <inoue...@jp.ibm.com> Authored: Thu Jun 30 21:47:44 2016 -0700 Committer: Reynold Xin <r...@databricks.com> Committed: Thu Jun 30 21:47:44 2016 -0700 ---------------------------------------------------------------------- .../spark/sql/catalyst/trees/TreeNode.scala | 82 ++++++++++---------- 1 file changed, 43 insertions(+), 39 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/14cf61e9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 072445a..8bce404 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -315,25 +315,9 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { protected def transformChildren( rule: PartialFunction[BaseType, BaseType], nextOperation: (BaseType, PartialFunction[BaseType, BaseType]) => BaseType): BaseType = { - var changed = false - val newArgs = mapProductIterator { - case arg: TreeNode[_] if containsChild(arg) => - val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) - if (!(newChild fastEquals arg)) { - changed = true - newChild - } else { - arg - } - case Some(arg: TreeNode[_]) if containsChild(arg) => - val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) - if (!(newChild fastEquals arg)) { - changed = true - Some(newChild) - } else { - Some(arg) - } - case m: Map[_, _] => m.mapValues { + if (children.nonEmpty) { + var changed = false + val newArgs = mapProductIterator { case arg: TreeNode[_] if containsChild(arg) => val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) if (!(newChild fastEquals arg)) { @@ -342,33 +326,53 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { } else { arg } - case other => other - }.view.force // `mapValues` is lazy and we need to force it to materialize - case d: DataType => d // Avoid unpacking Structs - case args: Traversable[_] => args.map { - case arg: TreeNode[_] if containsChild(arg) => + case Some(arg: TreeNode[_]) if containsChild(arg) => val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) if (!(newChild fastEquals arg)) { changed = true - newChild + Some(newChild) } else { - arg + Some(arg) } - case tuple @ (arg1: TreeNode[_], arg2: TreeNode[_]) => - val newChild1 = nextOperation(arg1.asInstanceOf[BaseType], rule) - val newChild2 = nextOperation(arg2.asInstanceOf[BaseType], rule) - if (!(newChild1 fastEquals arg1) || !(newChild2 fastEquals arg2)) { - changed = true - (newChild1, newChild2) - } else { - tuple - } - case other => other + case m: Map[_, _] => m.mapValues { + case arg: TreeNode[_] if containsChild(arg) => + val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) + if (!(newChild fastEquals arg)) { + changed = true + newChild + } else { + arg + } + case other => other + }.view.force // `mapValues` is lazy and we need to force it to materialize + case d: DataType => d // Avoid unpacking Structs + case args: Traversable[_] => args.map { + case arg: TreeNode[_] if containsChild(arg) => + val newChild = nextOperation(arg.asInstanceOf[BaseType], rule) + if (!(newChild fastEquals arg)) { + changed = true + newChild + } else { + arg + } + case tuple@(arg1: TreeNode[_], arg2: TreeNode[_]) => + val newChild1 = nextOperation(arg1.asInstanceOf[BaseType], rule) + val newChild2 = nextOperation(arg2.asInstanceOf[BaseType], rule) + if (!(newChild1 fastEquals arg1) || !(newChild2 fastEquals arg2)) { + changed = true + (newChild1, newChild2) + } else { + tuple + } + case other => other + } + case nonChild: AnyRef => nonChild + case null => null } - case nonChild: AnyRef => nonChild - case null => null + if (changed) makeCopy(newArgs) else this + } else { + this } - if (changed) makeCopy(newArgs) else this } /** --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org