Repository: spark Updated Branches: refs/heads/master 7deef7a49 -> a28146568
[SPARK-25429][SQL] Use Set instead of Array to improve lookup performance ## What changes were proposed in this pull request? Use `Set` instead of `Array` to improve `accumulatorIds.contains(acc.id)` performance. This PR closes https://github.com/apache/spark/pull/22420 ## How was this patch tested? Manual tests. Benchmark code: ```scala def benchmark(func: () => Unit): Long = { val start = System.currentTimeMillis() func() val end = System.currentTimeMillis() end - start } val range = Range(1, 1000000) val set = range.toSet val array = range.toArray for (i <- 0 until 5) { val setExecutionTime = benchmark(() => for (i <- 0 until 500) { set.contains(scala.util.Random.nextInt()) }) val arrayExecutionTime = benchmark(() => for (i <- 0 until 500) { array.contains(scala.util.Random.nextInt()) }) println(s"set execution time: $setExecutionTime, array execution time: $arrayExecutionTime") } ``` Benchmark result: ``` set execution time: 4, array execution time: 2760 set execution time: 1, array execution time: 1911 set execution time: 3, array execution time: 2043 set execution time: 12, array execution time: 2214 set execution time: 6, array execution time: 1770 ``` Closes #22579 from wangyum/SPARK-25429. 
Authored-by: Yuming Wang <yumw...@ebay.com> Signed-off-by: gatorsmile <gatorsm...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2814656 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2814656 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2814656 Branch: refs/heads/master Commit: a281465686e8099bb2c0fa4f2ef4822b6e634269 Parents: 7deef7a Author: Yuming Wang <yumw...@ebay.com> Authored: Fri Sep 28 15:08:15 2018 -0700 Committer: gatorsmile <gatorsm...@gmail.com> Committed: Fri Sep 28 15:08:15 2018 -0700 ---------------------------------------------------------------------- .../apache/spark/sql/execution/ui/SQLAppStatusListener.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/a2814656/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index d254af4..1199eec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -81,9 +81,9 @@ class SQLAppStatusListener( // Record the accumulator IDs for the stages of this job, so that the code that keeps // track of the metrics knows which accumulators to look at. 
- val accumIds = exec.metrics.map(_.accumulatorId).sorted.toList + val accumIds = exec.metrics.map(_.accumulatorId).toSet event.stageIds.foreach { id => - stageMetrics.put(id, new LiveStageMetrics(id, 0, accumIds.toArray, new ConcurrentHashMap())) + stageMetrics.put(id, new LiveStageMetrics(id, 0, accumIds, new ConcurrentHashMap())) } exec.jobs = exec.jobs + (jobId -> JobExecutionStatus.RUNNING) @@ -382,7 +382,7 @@ private class LiveExecutionData(val executionId: Long) extends LiveEntity { private class LiveStageMetrics( val stageId: Int, var attemptId: Int, - val accumulatorIds: Array[Long], + val accumulatorIds: Set[Long], val taskMetrics: ConcurrentHashMap[Long, LiveTaskMetrics]) private class LiveTaskMetrics( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org