[GitHub] spark pull request: [WIP][SPARK-2054][SQL] Code Generation for Exp...

rxin Fri, 06 Jun 2014 01:15:08 -0700

Github user rxin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/993#discussion_r13480043
  
    --- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/GeneratedRow.scala
 ---
    @@ -0,0 +1,833 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql
    +package catalyst
    +package expressions
    +
    +import types._
    +
    +object DumpByteCode {
    +  import scala.sys.process._
    +  val dumpDirectory = util.getTempFilePath("sparkSqlByteCode")
    +  dumpDirectory.mkdir()
    +
    +  def apply(obj: Any): Unit = {
    +    val generatedClass = obj.getClass
    +    val classLoader =
    +      generatedClass
    +        .getClassLoader
    +        .asInstanceOf[scala.tools.nsc.interpreter.AbstractFileClassLoader]
    +    val generatedBytes = classLoader.classBytes(generatedClass.getName)
    +
    +    val packageDir = new java.io.File(dumpDirectory, 
generatedClass.getPackage.getName)
    +    if (!packageDir.exists()) { packageDir.mkdir() }
    +
    +    val classFile =
    +      new java.io.File(packageDir, 
generatedClass.getName.split("\\.").last + ".class")
    +
    +    val outfile = new java.io.FileOutputStream(classFile)
    +    outfile.write(generatedBytes)
    +    outfile.close()
    +
    +    println(
    +      s"javap -p -v -classpath ${dumpDirectory.getCanonicalPath} 
${generatedClass.getName}".!!)
    +  }
    +}
    +
    +object CodeGeneration
    +
    +class CodeGenerator extends Logging {
    +  import scala.reflect.runtime.{universe => ru}
    +  import scala.reflect.runtime.universe._
    +
    +  import scala.tools.reflect.ToolBox
    +
    +  val toolBox = runtimeMirror(getClass.getClassLoader).mkToolBox()
    +
    +  // TODO: Use typetags?
    +  val rowType = tq"org.apache.spark.sql.catalyst.expressions.Row"
    +  val mutableRowType = 
tq"org.apache.spark.sql.catalyst.expressions.MutableRow"
    +  val genericRowType = 
tq"org.apache.spark.sql.catalyst.expressions.GenericRow"
    +  val genericMutableRowType = 
tq"org.apache.spark.sql.catalyst.expressions.GenericMutableRow"
    +
    +  val projectionType = 
tq"org.apache.spark.sql.catalyst.expressions.Projection"
    +  val mutableProjectionType = 
tq"org.apache.spark.sql.catalyst.expressions.MutableProjection"
    +
    +  private val curId = new java.util.concurrent.atomic.AtomicInteger()
    +  private val javaSeperator = "$"
    +
    +  /**
    +   * Returns a term name that is unique within this instance of a 
`CodeGenerator`.
    +   *
    +   * (Since we aren't in a macro context we do not seem to have access to 
the built in `freshName`
    +   * function.)
    +   */
    +  protected def freshName(prefix: String): TermName = {
    +    newTermName(s"$prefix$javaSeperator${curId.getAndIncrement}")
    +  }
    +
    +  /**
    +   * Scala ASTs for evaluating an [[Expression]] given a [[Row]] of input.
    +   *
    +   * @param code The sequence of statements required to evaluate the 
expression.
    +   * @param nullTerm A term that holds a boolean value representing 
whether the expression evaluated
    +   *                 to null.
    +   * @param primitiveTerm A term for a possible primitive value of the 
result of the evaluation. Not
    +   *                      valid if `nullTerm` is set to `true`.
    +   * @param objectTerm A possibly boxed version of the result of 
evaluating this expression.
    +   */
    +  protected case class EvaluatedExpression(
    +      code: Seq[Tree],
    +      nullTerm: TermName,
    +      primitiveTerm: TermName,
    +      objectTerm: TermName) {
    +
    +    def withObjectTerm = ???
    +  }
    +
    +  /**
    +   * Given an expression tree returns the code required to determine both 
if the result is NULL
    +   * as well as the code required to compute the value.
    +   */
    +   def expressionEvaluator(e: Expression): EvaluatedExpression = {
    +    val primitiveTerm = freshName("primitiveTerm")
    +    val nullTerm = freshName("nullTerm")
    +    val objectTerm = freshName("objectTerm")
    +
    +    implicit class Evaluate1(e: Expression) {
    +      def castOrNull(f: TermName => Tree, dataType: DataType): Seq[Tree] = 
{
    +        val eval = expressionEvaluator(e)
    +        eval.code ++
    +          q"""
    +          val $nullTerm = ${eval.nullTerm}
    +          val $primitiveTerm =
    +            if($nullTerm)
    +              ${defaultPrimitive(dataType)}
    +            else
    +              ${f(eval.primitiveTerm)}
    +        """.children
    +      }
    +    }
    +
    +    implicit class Evaluate2(expressions: (Expression, Expression)) {
    +
    +      /**
    +       * Short hand for generating binary evaluation code, which depends 
on two sub-evaluations of
    +       * the same type.  If either of the sub-expressions is null, the 
result of this computation
    +       * is assumed to be null.
    +       *
    +       * @param f a function from two primitive term names to a tree that 
evaluates them.
    +       */
    +      def evaluate(f: (TermName, TermName) => Tree): Seq[Tree] =
    +        evaluateAs(expressions._1.dataType)(f)
    +
    +      def evaluateAs(resultType: DataType)(f: (TermName, TermName) => 
Tree): Seq[Tree] = {
    +        require(expressions._1.dataType == expressions._2.dataType,
    +          s"${expressions._1.dataType} != ${expressions._2.dataType}")
    +
    +        val eval1 = expressionEvaluator(expressions._1)
    +        val eval2 = expressionEvaluator(expressions._2)
    +        val resultCode = f(eval1.primitiveTerm, eval2.primitiveTerm)
    +
    +        eval1.code ++ eval2.code ++
    +        q"""
    +          val $nullTerm = ${eval1.nullTerm} || ${eval2.nullTerm}
    +          val $primitiveTerm: ${termForType(resultType)} =
    +            if($nullTerm) {
    +              ${defaultPrimitive(resultType)}
    +            } else {
    +              $resultCode.asInstanceOf[${termForType(resultType)}]
    +            }
    +        """.children : Seq[Tree]
    +      }
    +    }
    +
    +    val inputTuple = newTermName(s"i")
    +
    +    // TODO: Skip generation of null handling code when expressions are not 
nullable.
    +    val primitiveEvaluation: PartialFunction[Expression, Seq[Tree]] = {
    +      case b @ BoundReference(ordinal, _) =>
    +        q"""
    +          val $nullTerm: Boolean = $inputTuple.isNullAt($ordinal)
    +          val $primitiveTerm: ${termForType(b.dataType)} =
    +            if($nullTerm)
    +              ${defaultPrimitive(e.dataType)}
    +            else
    +              ${getColumn(inputTuple, b.dataType, ordinal)}
    +         """.children
    +
    +      case expressions.Literal(null, dataType) =>
    --- End diff --
    
    Should we consider moving code gen for each expression into the expression 
itself, just like what we did with eval?
    
    Just throwing an idea out there. I can see benefits to both approaches.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] spark pull request: [WIP][SPARK-2054][SQL] Code Generation for Exp...

Reply via email to