[GitHub] spark pull request #20935: [SPARK-23819][SQL] Fix InMemoryTableScanExec comp...

pwoody Sun, 01 Apr 2018 08:14:21 -0700

Github user pwoody commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20935#discussion_r178460763
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -322,19 +324,76 @@ private[columnar] final class DecimalColumnStats(precision: Int, scale: Int) ext
         Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] final class ObjectColumnStats(dataType: DataType) extends ColumnStats {
    -  val columnType = ColumnType(dataType)
    +private abstract class OrderableSafeColumnStats[T](dataType: DataType) extends ColumnStats {
    +  protected var upper: T = _
    +  protected var lower: T = _
    +
    +  private val columnType = ColumnType(dataType)
    +  private val ordering = dataType match {
    +    case x if RowOrdering.isOrderable(dataType) =>
    +      Option(TypeUtils.getInterpretedOrdering(x))
    +    case _ => None
    +  }
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
         if (!row.isNullAt(ordinal)) {
    -      val size = columnType.actualSize(row, ordinal)
    -      sizeInBytes += size
    +      sizeInBytes += columnType.actualSize(row, ordinal)
           count += 1
    +      ordering.foreach { order =>
    +        val value = getValue(row, ordinal)
    +        if (upper == null || order.gt(value, upper)) upper = copy(value)
    +        if (lower == null || order.lt(value, lower)) lower = copy(value)
    +      }
         } else {
    -      gatherNullStats
    +      gatherNullStats()
         }
       }
     
    +  def getValue(row: InternalRow, ordinal: Int): T
    +
    +  def copy(value: T): T
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
    +}
    +
    +private[columnar] final class ArrayColumnStats(dataType: DataType)
    +  extends OrderableSafeColumnStats[UnsafeArrayData](dataType) {
    +  override def getValue(row: InternalRow, ordinal: Int): UnsafeArrayData =
    +    row.getArray(ordinal).asInstanceOf[UnsafeArrayData]
    +
    +  override def copy(value: UnsafeArrayData): UnsafeArrayData = value.copy()
    +}
    +
    +private[columnar] final class StructColumnStats(dataType: DataType)
    +  extends OrderableSafeColumnStats[UnsafeRow](dataType) {
    +  private val numFields = dataType.asInstanceOf[StructType].fields.length
    +
    +  override def getValue(row: InternalRow, ordinal: Int): UnsafeRow =
    +    row.getStruct(ordinal, numFields).asInstanceOf[UnsafeRow]
    +
    +  override def copy(value: UnsafeRow): UnsafeRow = value.copy()
    +}
    +
    +private[columnar] final class MapColumnStats(dataType: DataType) extends ColumnStats {
    --- End diff --
    
    Now that you mention it - we can just have MapColumnStats extend
OrderableSafeColumnStats now, since map types will always fall through to the
unorderable case. Everything will just work once maps are made orderable,
without any code change here.



---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #20935: [SPARK-23819][SQL] Fix InMemoryTableScanExec comp...

Reply via email to