Repository: spark
Updated Branches:
refs/heads/master aa6564f37 -> 14cf61e90
[SPARK-16331][SQL] Reduce code generation time
## What changes were proposed in this pull request?
During the code generation, a `LocalRelation` often has a huge `Vector` object
as `data`. In the simple example below, a `LocalRelation` has a Vector with
1000000 elements of `UnsafeRow`.
```
val numRows = 1000000
val ds = (1 to numRows).toDS().persist()
benchmark.addCase("filter+reduce") { iter =>
ds.filter(a => (a & 1) == 0).reduce(_ + _)
}
```
At `TreeNode.transformChildren`, all elements of the vector is unnecessarily
iterated to check whether any children exist in the vector since `Vector` is
Traversable. This part significantly increases code generation time.
This patch avoids this overhead by checking the number of children before
iterating all elements; `LocalRelation` does not have children since it extends
`LeafNode`.
The performance of the above example
```
without this patch
Java HotSpot(TM) 64-Bit Server VM 1.8.0_91-b14 on Mac OS X 10.11.5
Intel(R) Core(TM) i5-5257U CPU 2.70GHz
compilationTime: Best/Avg Time(ms) Rate(M/s) Per
Row(ns) Relative
------------------------------------------------------------------------------------------------
filter+reduce 4426 / 4533 0.2
4426.0 1.0X
with this patch
compilationTime: Best/Avg Time(ms) Rate(M/s) Per
Row(ns) Relative
------------------------------------------------------------------------------------------------
filter+reduce 3117 / 3391 0.3
3116.6 1.0X
```
## How was this patch tested?
using existing unit tests
Author: Hiroshi Inoue <[email protected]>
Closes #14000 from inouehrs/compilation-time-reduction.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/14cf61e9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/14cf61e9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/14cf61e9
Branch: refs/heads/master
Commit: 14cf61e909598d9f6b9c3b920de7299e9bc828e0
Parents: aa6564f
Author: Hiroshi Inoue <[email protected]>
Authored: Thu Jun 30 21:47:44 2016 -0700
Committer: Reynold Xin <[email protected]>
Committed: Thu Jun 30 21:47:44 2016 -0700
----------------------------------------------------------------------
.../spark/sql/catalyst/trees/TreeNode.scala | 82 ++++++++++----------
1 file changed, 43 insertions(+), 39 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/14cf61e9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
----------------------------------------------------------------------
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 072445a..8bce404 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -315,25 +315,9 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]]
extends Product {
protected def transformChildren(
rule: PartialFunction[BaseType, BaseType],
nextOperation: (BaseType, PartialFunction[BaseType, BaseType]) =>
BaseType): BaseType = {
- var changed = false
- val newArgs = mapProductIterator {
- case arg: TreeNode[_] if containsChild(arg) =>
- val newChild = nextOperation(arg.asInstanceOf[BaseType], rule)
- if (!(newChild fastEquals arg)) {
- changed = true
- newChild
- } else {
- arg
- }
- case Some(arg: TreeNode[_]) if containsChild(arg) =>
- val newChild = nextOperation(arg.asInstanceOf[BaseType], rule)
- if (!(newChild fastEquals arg)) {
- changed = true
- Some(newChild)
- } else {
- Some(arg)
- }
- case m: Map[_, _] => m.mapValues {
+ if (children.nonEmpty) {
+ var changed = false
+ val newArgs = mapProductIterator {
case arg: TreeNode[_] if containsChild(arg) =>
val newChild = nextOperation(arg.asInstanceOf[BaseType], rule)
if (!(newChild fastEquals arg)) {
@@ -342,33 +326,53 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]]
extends Product {
} else {
arg
}
- case other => other
- }.view.force // `mapValues` is lazy and we need to force it to
materialize
- case d: DataType => d // Avoid unpacking Structs
- case args: Traversable[_] => args.map {
- case arg: TreeNode[_] if containsChild(arg) =>
+ case Some(arg: TreeNode[_]) if containsChild(arg) =>
val newChild = nextOperation(arg.asInstanceOf[BaseType], rule)
if (!(newChild fastEquals arg)) {
changed = true
- newChild
+ Some(newChild)
} else {
- arg
+ Some(arg)
}
- case tuple @ (arg1: TreeNode[_], arg2: TreeNode[_]) =>
- val newChild1 = nextOperation(arg1.asInstanceOf[BaseType], rule)
- val newChild2 = nextOperation(arg2.asInstanceOf[BaseType], rule)
- if (!(newChild1 fastEquals arg1) || !(newChild2 fastEquals arg2)) {
- changed = true
- (newChild1, newChild2)
- } else {
- tuple
- }
- case other => other
+ case m: Map[_, _] => m.mapValues {
+ case arg: TreeNode[_] if containsChild(arg) =>
+ val newChild = nextOperation(arg.asInstanceOf[BaseType], rule)
+ if (!(newChild fastEquals arg)) {
+ changed = true
+ newChild
+ } else {
+ arg
+ }
+ case other => other
+ }.view.force // `mapValues` is lazy and we need to force it to
materialize
+ case d: DataType => d // Avoid unpacking Structs
+ case args: Traversable[_] => args.map {
+ case arg: TreeNode[_] if containsChild(arg) =>
+ val newChild = nextOperation(arg.asInstanceOf[BaseType], rule)
+ if (!(newChild fastEquals arg)) {
+ changed = true
+ newChild
+ } else {
+ arg
+ }
+ case tuple@(arg1: TreeNode[_], arg2: TreeNode[_]) =>
+ val newChild1 = nextOperation(arg1.asInstanceOf[BaseType], rule)
+ val newChild2 = nextOperation(arg2.asInstanceOf[BaseType], rule)
+ if (!(newChild1 fastEquals arg1) || !(newChild2 fastEquals arg2)) {
+ changed = true
+ (newChild1, newChild2)
+ } else {
+ tuple
+ }
+ case other => other
+ }
+ case nonChild: AnyRef => nonChild
+ case null => null
}
- case nonChild: AnyRef => nonChild
- case null => null
+ if (changed) makeCopy(newArgs) else this
+ } else {
+ this
}
- if (changed) makeCopy(newArgs) else this
}
/**
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]