This is an automated email from the ASF dual-hosted git repository. yamamuro pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 020f3a3 [SPARK-30564][SQL] Revert Block.length and use comment placeholders in HashAggregateExec 020f3a3 is described below commit 020f3a33dd711d05337bb42d5f65708a4aec2daa Author: Peter Toth <peter.t...@gmail.com> AuthorDate: Thu Apr 16 17:52:22 2020 +0900 [SPARK-30564][SQL] Revert Block.length and use comment placeholders in HashAggregateExec ### What changes were proposed in this pull request? SPARK-21870 (cb0cddf#diff-06dc5de6163687b7810aa76e7e152a76R146-R149) caused significant performance regression in cases where the source code size is fairly large as `HashAggregateExec` uses `Block.length` to decide on splitting the code. The change in `length` makes sense as the comment and extra new lines shouldn't be taken into account when deciding on splitting, but the regular expression based approach is very slow and adds a big relative overhead to cases where the execution is [...] This PR: - restores `Block.length` to its original form - places comments in `HashAggregateExec` with `CodegenContext.registerComment` so as to appear only when comments are enabled (`spark.sql.codegen.comments=true`) Before this PR: ``` deeply nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 250 deep x 400 rows (read in-mem) 1137 1143 8 0.1 11368.3 0.0X ``` After this PR: ``` deeply nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 250 deep x 400 rows (read in-mem) 167 180 7 0.6 1674.3 0.1X ``` ### Why are the changes needed? To fix performance regression. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Existing UTs. 
Closes #28083 from peter-toth/SPARK-30564-use-comment-placeholders. Authored-by: Peter Toth <peter.t...@gmail.com> Signed-off-by: Takeshi Yamamuro <yamam...@apache.org> (cherry picked from commit 7ad6ba36f28b7a5ca548950dec6afcd61e5d68b9) Signed-off-by: Takeshi Yamamuro <yamam...@apache.org> --- .../spark/sql/catalyst/expressions/codegen/javaCode.scala | 8 ++++---- .../spark/sql/execution/aggregate/HashAggregateExec.scala | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala index dff2589..1c59c3c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala @@ -143,10 +143,10 @@ trait Block extends TreeNode[Block] with JavaCode { case _ => code.trim } - def length: Int = { - // Returns a code length without comments - CodeFormatter.stripExtraNewLinesAndComments(toString).length - } + // We could remove comments, extra whitespaces and newlines when calculating length as it is used + // only for codegen method splitting, but SPARK-30564 showed that this is a performance critical + // function so we decided not to do so. 
+ def length: Int = toString.length def isEmpty: Boolean = toString.isEmpty diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 7a26fd7..87a4081 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -367,10 +367,10 @@ case class HashAggregateExec( """.stripMargin } code""" - |// do aggregate for ${aggNames(i)} - |// evaluate aggregate function + |${ctx.registerComment(s"do aggregate for ${aggNames(i)}")} + |${ctx.registerComment("evaluate aggregate function")} |${evaluateVariables(bufferEvalsForOneFunc)} - |// update aggregation buffers + |${ctx.registerComment("update aggregation buffers")} |${updates.mkString("\n").trim} """.stripMargin } @@ -975,9 +975,9 @@ case class HashAggregateExec( CodeGenerator.updateColumn(unsafeRowBuffer, dt, bufferOffset + j, ev, nullable) } code""" - |// evaluate aggregate function for ${aggNames(i)} + |${ctx.registerComment(s"evaluate aggregate function for ${aggNames(i)}")} |${evaluateVariables(rowBufferEvalsForOneFunc)} - |// update unsafe row buffer + |${ctx.registerComment("update unsafe row buffer")} |${updateRowBuffers.mkString("\n").trim} """.stripMargin } @@ -1030,9 +1030,9 @@ case class HashAggregateExec( isVectorized = true) } code""" - |// evaluate aggregate function for ${aggNames(i)} + |${ctx.registerComment(s"evaluate aggregate function for ${aggNames(i)}")} |${evaluateVariables(fastRowEvalsForOneFunc)} - |// update fast row + |${ctx.registerComment("update fast row")} |${updateRowBuffer.mkString("\n").trim} """.stripMargin } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org