spark git commit: [SPARK-25850][SQL] Make the split threshold for the code generated function configurable

wenchen Mon, 05 Nov 2018 04:10:49 -0800

Repository: spark
Updated Branches:
  refs/heads/master 4afb35033 -> e017cb396



[SPARK-25850][SQL] Make the split threshold for the code generated function 
configurable

## What changes were proposed in this pull request?
As per the discussion in 
[#22823](https://github.com/apache/spark/pull/22823/files#r228400706), add a 
new configuration to make the split threshold for the code generated function 
configurable.

When the length (in characters) of a generated Java function's source code 
exceeds `spark.sql.codegen.methodSplitThreshold` (default: 1024), the function 
is split into multiple smaller functions.

## How was this patch tested?
Manual tests only; no automated tests were added in this change.

Closes #22847 from yucai/splitThreshold.

Authored-by: yucai <y...@ebay.com>
Signed-off-by: Wenchen Fan <wenc...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e017cb39
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e017cb39
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e017cb39

Branch: refs/heads/master
Commit: e017cb39642a5039abd8ce8127ad41712901bdbc
Parents: 4afb350
Author: yucai <y...@ebay.com>
Authored: Mon Nov 5 20:09:39 2018 +0800
Committer: Wenchen Fan <wenc...@databricks.com>
Committed: Mon Nov 5 20:09:39 2018 +0800

----------------------------------------------------------------------
 .../spark/sql/catalyst/expressions/Expression.scala   |  4 +++-
 .../catalyst/expressions/codegen/CodeGenerator.scala  |  3 ++-
 .../scala/org/apache/spark/sql/internal/SQLConf.scala | 14 ++++++++++++++
 3 files changed, 19 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e017cb39/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index ccc5b90..141fcff 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -25,6 +25,7 @@ import 
org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.expressions.codegen.Block._
 import org.apache.spark.sql.catalyst.trees.TreeNode
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
@@ -121,7 +122,8 @@ abstract class Expression extends TreeNode[Expression] {
 
   private def reduceCodeSize(ctx: CodegenContext, eval: ExprCode): Unit = {
     // TODO: support whole stage codegen too
-    if (eval.code.length > 1024 && ctx.INPUT_ROW != null && ctx.currentVars == 
null) {
+    val splitThreshold = SQLConf.get.methodSplitThreshold
+    if (eval.code.length > splitThreshold && ctx.INPUT_ROW != null && 
ctx.currentVars == null) {
       val setIsNull = if (!eval.isNull.isInstanceOf[LiteralValue]) {
         val globalIsNull = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, 
"globalIsNull")
         val localIsNull = eval.isNull

http://git-wip-us.apache.org/repos/asf/spark/blob/e017cb39/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index d5857e0..b868a0f 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -910,12 +910,13 @@ class CodegenContext {
     val blocks = new ArrayBuffer[String]()
     val blockBuilder = new StringBuilder()
     var length = 0
+    val splitThreshold = SQLConf.get.methodSplitThreshold
     for (code <- expressions) {
       // We can't know how many bytecode will be generated, so use the length 
of source code
       // as metric. A method should not go beyond 8K, otherwise it will not be 
JITted, should
       // also not be too small, or it will have many function calls (for wide 
table), see the
       // results in BenchmarkWideTable.
-      if (length > 1024) {
+      if (length > splitThreshold) {
         blocks += blockBuilder.toString()
         blockBuilder.clear()
         length = 0

http://git-wip-us.apache.org/repos/asf/spark/blob/e017cb39/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 535ec51..fa59fa5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -818,6 +818,18 @@ object SQLConf {
     .intConf
     .createWithDefault(65535)
 
+  val CODEGEN_METHOD_SPLIT_THRESHOLD = 
buildConf("spark.sql.codegen.methodSplitThreshold")
+    .internal()
+    .doc("The threshold of source-code splitting in the codegen. When the 
number of characters " +
+      "in a single Java function (without comment) exceeds the threshold, the 
function will be " +
+      "automatically split to multiple smaller ones. We cannot know how many 
bytecode will be " +
+      "generated, so use the code length as metric. When running on HotSpot, a 
function's " +
+      "bytecode should not go beyond 8KB, otherwise it will not be JITted; it 
also should not " +
+      "be too small, otherwise there will be many function calls.")
+    .intConf
+    .checkValue(threshold => threshold > 0, "The threshold must be a positive 
integer.")
+    .createWithDefault(1024)
+
   val WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR =
     buildConf("spark.sql.codegen.splitConsumeFuncByOperator")
       .internal()
@@ -1739,6 +1751,8 @@ class SQLConf extends Serializable with Logging {
 
   def hugeMethodLimit: Int = getConf(WHOLESTAGE_HUGE_METHOD_LIMIT)
 
+  def methodSplitThreshold: Int = getConf(CODEGEN_METHOD_SPLIT_THRESHOLD)
+
   def wholeStageSplitConsumeFuncByOperator: Boolean =
     getConf(WHOLESTAGE_SPLIT_CONSUME_FUNC_BY_OPERATOR)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25850][SQL] Make the split threshold for the code generated function configurable

Reply via email to