Github user kiszk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117177275
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including 
closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed 
lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = 
value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = 
value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    --- End diff --
    
    Do you want to add the new method `STRING.actualSize(s: UTF8String)`? The 
current signature `actualSize(row: InternalRow, ordinal: Int)` cannot be 
changed since it is declared at the super class.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to