[GitHub] spark pull request: [SPARK-9023] [SQL] [WIP] Efficiency improvemen...

rxin Thu, 16 Jul 2015 23:37:51 -0700

Github user rxin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/7456#discussion_r34865261
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala ---
    @@ -112,109 +114,71 @@ case class Exchange(newPartitioning: Partitioning, 
child: SparkPlan) extends Una
     
       @transient private lazy val sparkConf = 
child.sqlContext.sparkContext.getConf
     
    -  private def getSerializer(
    -      keySchema: Array[DataType],
    -      valueSchema: Array[DataType],
    -      numPartitions: Int): Serializer = {
    +  private val rowDataTypes = child.output.map(_.dataType).toArray
    +
    +  private val serializer: Serializer = {
         // It is true when there is no field that needs to be write out.
         // For now, we will not use SparkSqlSerializer2 when noField is true.
    -    val noField =
    -      (keySchema == null || keySchema.length == 0) &&
    -      (valueSchema == null || valueSchema.length == 0)
    +    val noField = rowDataTypes == null || rowDataTypes.length == 0
     
         val useSqlSerializer2 =
             child.sqlContext.conf.useSqlSerializer2 &&   // 
SparkSqlSerializer2 is enabled.
    -        SparkSqlSerializer2.support(keySchema) &&    // The schema of key 
is supported.
    -        SparkSqlSerializer2.support(valueSchema) &&  // The schema of 
value is supported.
    +        SparkSqlSerializer2.support(rowDataTypes) &&  // The schema of row 
is supported.
             !noField
     
    -    val serializer = if (useSqlSerializer2) {
    +    if (child.outputsUnsafeRows) {
    +      logInfo("Using UnsafeRowSerializer.")
    +      new UnsafeRowSerializer(child.output.size)
    +    } else if (useSqlSerializer2) {
           logInfo("Using SparkSqlSerializer2.")
    -      new SparkSqlSerializer2(keySchema, valueSchema)
    +      new SparkSqlSerializer2(rowDataTypes)
         } else {
           logInfo("Using SparkSqlSerializer.")
           new SparkSqlSerializer(sparkConf)
         }
    -
    -    serializer
       }
     
       protected override def doExecute(): RDD[InternalRow] = attachTree(this , 
"execute") {
    -    newPartitioning match {
    -      case HashPartitioning(expressions, numPartitions) =>
    -        val keySchema = expressions.map(_.dataType).toArray
    -        val valueSchema = child.output.map(_.dataType).toArray
    -        val serializer = getSerializer(keySchema, valueSchema, 
numPartitions)
    -        val part = new HashPartitioner(numPartitions)
    -
    -        val rdd = if (needToCopyObjectsBeforeShuffle(part, serializer)) {
    -          child.execute().mapPartitions { iter =>
    -            val hashExpressions = newMutableProjection(expressions, 
child.output)()
    -            iter.map(r => (hashExpressions(r).copy(), r.copy()))
    -          }
    -        } else {
    -          child.execute().mapPartitions { iter =>
    -            val hashExpressions = newMutableProjection(expressions, 
child.output)()
    -            val mutablePair = new MutablePair[InternalRow, InternalRow]()
    -            iter.map(r => mutablePair.update(hashExpressions(r), r))
    -          }
    -        }
    -        val shuffled = new ShuffledRDD[InternalRow, InternalRow, 
InternalRow](rdd, part)
    -        shuffled.setSerializer(serializer)
    -        shuffled.map(_._2)
    -
    +    val rdd = child.execute()
    +    val part: Partitioner = newPartitioning match {
    +      case HashPartitioning(expressions, numPartitions) => new 
HashPartitioner(numPartitions)
           case RangePartitioning(sortingExpressions, numPartitions) =>
    -        val keySchema = child.output.map(_.dataType).toArray
    -        val serializer = getSerializer(keySchema, null, numPartitions)
    -
    -        val childRdd = child.execute()
    -        val part: Partitioner = {
    -          // Internally, RangePartitioner runs a job on the RDD that 
samples keys to compute
    -          // partition bounds. To get accurate samples, we need to copy 
the mutable keys.
    -          val rddForSampling = childRdd.mapPartitions { iter =>
    -            val mutablePair = new MutablePair[InternalRow, Null]()
    -            iter.map(row => mutablePair.update(row.copy(), null))
    -          }
    -          // TODO: RangePartitioner should take an Ordering.
    -          implicit val ordering = new RowOrdering(sortingExpressions, 
child.output)
    -          new RangePartitioner(numPartitions, rddForSampling, ascending = 
true)
    -        }
    -
    -        val rdd = if (needToCopyObjectsBeforeShuffle(part, serializer)) {
    -          childRdd.mapPartitions { iter => iter.map(row => (row.copy(), 
null))}
    -        } else {
    -          childRdd.mapPartitions { iter =>
    -            val mutablePair = new MutablePair[InternalRow, Null]()
    -            iter.map(row => mutablePair.update(row, null))
    -          }
    +        // Internally, RangePartitioner runs a job on the RDD that samples 
keys to compute
    --- End diff --
    
    Ouch, total-order sort is super inefficient. We should definitely try to 
make it a lot more efficient in 1.6.




---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-9023] [SQL] [WIP] Efficiency improvemen...

Reply via email to