[GitHub] spark pull request #15090: [SPARK-17073] [SQL] generate column-level statist...

cloud-fan Thu, 29 Sep 2016 17:00:19 -0700

Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/15090#discussion_r81259064
  
    --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala ---
    @@ -32,19 +38,84 @@ package org.apache.spark.sql.catalyst.plans.logical
      * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it
      *                    defaults to the product of children's `sizeInBytes`.
      * @param rowCount Estimated number of rows.
    + * @param colStats Column-level statistics.
      * @param isBroadcastable If true, output is small enough to be used in a broadcast join.
      */
     case class Statistics(
         sizeInBytes: BigInt,
         rowCount: Option[BigInt] = None,
    +    colStats: Map[String, ColumnStat] = Map.empty,
         isBroadcastable: Boolean = false) {
    +
       override def toString: String = "Statistics(" + simpleString + ")"
     
       /** Readable string representation for the Statistics. */
       def simpleString: String = {
         Seq(s"sizeInBytes=$sizeInBytes",
           if (rowCount.isDefined) s"rowCount=${rowCount.get}" else "",
           s"isBroadcastable=$isBroadcastable"
    -    ).filter(_.nonEmpty).mkString("", ", ", "")
    +    ).filter(_.nonEmpty).mkString(", ")
    +  }
    +}
    +
    +/**
    + * Statistics for a column.
    + */
    +case class ColumnStat(statRow: InternalRow) {
    +
    +  def forNumeric[T <: AtomicType](dataType: T): NumericColumnStat[T] = {
    +    NumericColumnStat(statRow, dataType)
    +  }
    +  def forString: StringColumnStat = StringColumnStat(statRow)
    +  def forBinary: BinaryColumnStat = BinaryColumnStat(statRow)
    +  def forBoolean: BooleanColumnStat = BooleanColumnStat(statRow)
    +
    +  override def toString: String = {
    +    // use Base64 for encoding
    +    Base64.encodeBase64String(statRow.asInstanceOf[UnsafeRow].getBytes)
       }
     }
    +
    +object ColumnStat {
    +  def apply(dataType: DataType, str: String): ColumnStat = {
    +    // use Base64 for decoding
    +    val bytes = Base64.decodeBase64(str)
    +    val numFields = dataType match {
    +      case BinaryType | BooleanType => 3
    +      case _ => 4
    +    }
    +    val unsafeRow = new UnsafeRow(numFields)
    --- End diff --
    
    Ah, I see. Sorry, I misread the code.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #15090: [SPARK-17073] [SQL] generate column-level statist...

Reply via email to