Repository: spark
Updated Branches:
  refs/heads/master b81e3031f -> 3c614d056


[SPARK-25113][SQL] Add logging to CodeGenerator when any generated method's 
bytecode size goes above HugeMethodLimit

## What changes were proposed in this pull request?

Add logging for all generated methods from the `CodeGenerator` whose bytecode 
size goes above 8000 bytes.
This is to help with gathering stats on how often Spark is generating methods 
too big to be JIT'd. It covers all codegen scenarios, include whole-stage 
codegen and also individual expression codegen, e.g. unsafe projection, mutable 
projection, etc.

## How was this patch tested?

Manually tested that logging did happen when generated method was above 8000 
bytes.
Also added a new unit test case to `CodeGenerationSuite` to verify that the 
logging did happen.

Author: Kris Mok <kris....@databricks.com>

Closes #22103 from rednaxelafx/codegen-8k-logging.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3c614d05
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3c614d05
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3c614d05

Branch: refs/heads/master
Commit: 3c614d0565a9652a12970dcdf8545432a4ac6f68
Parents: b81e303
Author: Kris Mok <kris....@databricks.com>
Authored: Tue Aug 14 16:40:00 2018 -0700
Committer: Xiao Li <gatorsm...@gmail.com>
Committed: Tue Aug 14 16:40:00 2018 -0700

----------------------------------------------------------------------
 .../expressions/codegen/CodeGenerator.scala     | 11 +++-
 .../expressions/CodeGenerationSuite.scala       | 64 ++++++++++++++++++++
 2 files changed, 73 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/3c614d05/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 4b30de5..e2d74da 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -1258,7 +1258,8 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: 
AnyRef] extends Loggin
 
 object CodeGenerator extends Logging {
 
-  // This is the value of HugeMethodLimit in the OpenJDK JVM settings
+  // This is the default value of HugeMethodLimit in the OpenJDK HotSpot JVM,
+  // beyond which methods will be rejected from JIT compilation
   final val DEFAULT_JVM_HUGE_METHOD_LIMIT = 8000
 
   // The max valid length of method parameters in JVM.
@@ -1385,9 +1386,15 @@ object CodeGenerator extends Logging {
       try {
         val cf = new ClassFile(new ByteArrayInputStream(classBytes))
         val stats = cf.methodInfos.asScala.flatMap { method =>
-          method.getAttributes().filter(_.getClass.getName == 
codeAttr.getName).map { a =>
+          method.getAttributes().filter(_.getClass eq codeAttr).map { a =>
             val byteCodeSize = 
codeAttrField.get(a).asInstanceOf[Array[Byte]].length
             
CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.update(byteCodeSize)
+
+            if (byteCodeSize > DEFAULT_JVM_HUGE_METHOD_LIMIT) {
+              logInfo("Generated method too long to be JIT compiled: " +
+                s"${cf.getThisClassName}.${method.getName} is $byteCodeSize 
bytes")
+            }
+
             byteCodeSize
           }
         }

http://git-wip-us.apache.org/repos/asf/spark/blob/3c614d05/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
index 5b71bec..c383eec 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
@@ -19,12 +19,16 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.sql.Timestamp
 
+import org.apache.log4j.{Appender, AppenderSkeleton, Logger}
+import org.apache.log4j.spi.LoggingEvent
+
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.metrics.source.CodegenMetrics
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.catalyst.expressions.codegen.Block._
 import org.apache.spark.sql.catalyst.expressions.objects._
 import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils}
 import org.apache.spark.sql.types._
@@ -499,4 +503,64 @@ class CodeGenerationSuite extends SparkFunSuite with 
ExpressionEvalHelper {
       ctx.freshName("a_1") :: ctx.freshName("a_0") :: Nil
     assert(names2.distinct.length == 4)
   }
+
+  test("SPARK-25113: should log when there exists generated methods above 
HugeMethodLimit") {
+    class MockAppender extends AppenderSkeleton {
+      var seenMessage = false
+
+      override def append(loggingEvent: LoggingEvent): Unit = {
+        if (loggingEvent.getRenderedMessage().contains("Generated method too 
long")) {
+          seenMessage = true
+        }
+      }
+
+      override def close(): Unit = {}
+      override def requiresLayout(): Boolean = false
+    }
+
+    val appender = new MockAppender()
+    withLogAppender(appender) {
+      val x = 42
+      val expr = HugeCodeIntExpression(x)
+      val proj = GenerateUnsafeProjection.generate(Seq(expr))
+      val actual = proj(null)
+      assert(actual.getInt(0) == x)
+    }
+    assert(appender.seenMessage)
+  }
+
+  private def withLogAppender(appender: Appender)(f: => Unit): Unit = {
+    val logger =
+      Logger.getLogger(classOf[CodeGenerator[_, _]].getName)
+    logger.addAppender(appender)
+    try f finally {
+      logger.removeAppender(appender)
+    }
+  }
+}
+
+case class HugeCodeIntExpression(value: Int) extends Expression {
+  override def nullable: Boolean = true
+  override def dataType: DataType = IntegerType
+  override def children: Seq[Expression] = Nil
+  override def eval(input: InternalRow): Any = value
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    // Assuming HugeMethodLimit to be 8000
+    val HugeMethodLimit = CodeGenerator.DEFAULT_JVM_HUGE_METHOD_LIMIT
+    // A single "int dummyN = 0;" will be at least 2 bytes of bytecode:
+    //   0: iconst_0
+    //   1: istore_1
+    // and it'll become bigger as the number of local variables increases.
+    // So 4000 such dummy local variable definitions are sufficient to bump 
the bytecode size
+    // of a generated method to above 8000 bytes.
+    val hugeCode = (0 until (HugeMethodLimit / 2)).map(i => s"int dummy$i = 
0;").mkString("\n")
+    val code =
+      code"""{
+         |  $hugeCode
+         |}
+         |boolean ${ev.isNull} = false;
+         |int ${ev.value} = $value;
+       """.stripMargin
+    ev.copy(code = code)
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to