Github user pwoody commented on a diff in the pull request: https://github.com/apache/spark/pull/20935#discussion_r178460763 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala --- @@ -322,19 +324,76 @@ private[columnar] final class DecimalColumnStats(precision: Int, scale: Int) ext Array[Any](lower, upper, nullCount, count, sizeInBytes) } -private[columnar] final class ObjectColumnStats(dataType: DataType) extends ColumnStats { - val columnType = ColumnType(dataType) +private abstract class OrderableSafeColumnStats[T](dataType: DataType) extends ColumnStats { + protected var upper: T = _ + protected var lower: T = _ + + private val columnType = ColumnType(dataType) + private val ordering = dataType match { + case x if RowOrdering.isOrderable(dataType) => + Option(TypeUtils.getInterpretedOrdering(x)) + case _ => None + } override def gatherStats(row: InternalRow, ordinal: Int): Unit = { if (!row.isNullAt(ordinal)) { - val size = columnType.actualSize(row, ordinal) - sizeInBytes += size + sizeInBytes += columnType.actualSize(row, ordinal) count += 1 + ordering.foreach { order => + val value = getValue(row, ordinal) + if (upper == null || order.gt(value, upper)) upper = copy(value) + if (lower == null || order.lt(value, lower)) lower = copy(value) + } } else { - gatherNullStats + gatherNullStats() } } + def getValue(row: InternalRow, ordinal: Int): T + + def copy(value: T): T + + override def collectedStatistics: Array[Any] = + Array[Any](lower, upper, nullCount, count, sizeInBytes) +} + +private[columnar] final class ArrayColumnStats(dataType: DataType) + extends OrderableSafeColumnStats[UnsafeArrayData](dataType) { + override def getValue(row: InternalRow, ordinal: Int): UnsafeArrayData = + row.getArray(ordinal).asInstanceOf[UnsafeArrayData] + + override def copy(value: UnsafeArrayData): UnsafeArrayData = value.copy() +} + +private[columnar] final class StructColumnStats(dataType: DataType) + extends 
OrderableSafeColumnStats[UnsafeRow](dataType) { + private val numFields = dataType.asInstanceOf[StructType].fields.length + + override def getValue(row: InternalRow, ordinal: Int): UnsafeRow = + row.getStruct(ordinal, numFields).asInstanceOf[UnsafeRow] + + override def copy(value: UnsafeRow): UnsafeRow = value.copy() +} + +private[columnar] final class MapColumnStats(dataType: DataType) extends ColumnStats { --- End diff -- Now that you mention it - we can just have MapColumnStats use OrderableSafeColumnStats now, since it will always fall through to the unorderable case. Everything will just work when we make maps orderable, without any code change here.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org