[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-22 Thread asfgit
Github user asfgit closed the pull request at:

https://github.com/apache/spark/pull/18002


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-18 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117299631
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStat

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-18 Thread kiszk
Github user kiszk commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117177275
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStats(va

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-18 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117174531
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStat

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread kiszk
Github user kiszk commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117168094
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStats(va

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread kiszk
Github user kiszk commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117168074
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStats(va

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117167259
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStat

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread kiszk
Github user kiszk commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117167072
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStats(va

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117153570
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStat

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117153480
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStat

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117153431
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  gatherValueStat

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread kiszk
Github user kiszk commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117093744
  
--- Diff: 
sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala
 ---
@@ -33,18 +32,18 @@ class ColumnStatsSuite extends SparkFunSuite {
   testColumnStats(classOf[StringColumnStats], STRING, createRow(null, 
null, 0))
   testDecimalColumnStats(createRow(null, null, 0))
 
-  def createRow(values: Any*): GenericInternalRow = new 
GenericInternalRow(values.toArray)
+  def createRow(values: Any*): Array[Any] = values.toArray
--- End diff --

I see. Eliminated.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread kiszk
Github user kiszk commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117093783
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  super.gatherNullStats
--- End diff --

Good catch. done.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread kiszk
Github user kiszk commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r117093688
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  super.gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  super.gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  super.gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  g

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread viirya
Github user viirya commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r116940295
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  super.gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  super.gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  super.gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread viirya
Github user viirya commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r116938870
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  super.gatherNullStats
+}
+  }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
+  override def collectedStatistics: Array[Any] = Array[Any](null, null, 
nullCount, count, 0L)
 }
 
-private[columnar] class BooleanColumnStats extends ColumnStats {
+private[columnar] final class BooleanColumnStats extends ColumnStats {
   protected var upper = false
   protected var lower = true
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getBoolean(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BOOLEAN.defaultSize
+  gatherValueStats(value)
+} else {
+  super.gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Boolean): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BOOLEAN.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ByteColumnStats extends ColumnStats {
+private[columnar] final class ByteColumnStats extends ColumnStats {
   protected var upper = Byte.MinValue
   protected var lower = Byte.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getByte(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += BYTE.defaultSize
+  gatherValueStats(value)
+} else {
+  super.gatherNullStats
 }
   }
 
-  override def collectedStatistics: GenericInternalRow =
-new GenericInternalRow(Array[Any](lower, upper, nullCount, count, 
sizeInBytes))
+  def gatherValueStats(value: Byte): Unit = {
+if (value > upper) upper = value
+if (value < lower) lower = value
+sizeInBytes += BYTE.defaultSize
+count += 1
+  }
+
+  override def collectedStatistics: Array[Any] =
+Array[Any](lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[columnar] class ShortColumnStats extends ColumnStats {
+private[columnar] final class ShortColumnStats extends ColumnStats {
   protected var upper = Short.MinValue
   protected var lower = Short.MaxValue
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-super.gatherStats(row, ordinal)
 if (!row.isNullAt(ordinal)) {
   val value = row.getShort(ordinal)
-  if (value > upper) upper = value
-  if (value < lower) lower = value
-  sizeInBytes += SHORT.defaultSize
+  

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r116934486
  
--- Diff: 
sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala
 ---
@@ -33,18 +32,18 @@ class ColumnStatsSuite extends SparkFunSuite {
   testColumnStats(classOf[StringColumnStats], STRING, createRow(null, 
null, 0))
   testDecimalColumnStats(createRow(null, null, 0))
 
-  def createRow(values: Any*): GenericInternalRow = new 
GenericInternalRow(values.toArray)
+  def createRow(values: Any*): Array[Any] = values.toArray
--- End diff --

do we still need this method?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-17 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/18002#discussion_r116933991
  
--- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala
 ---
@@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends 
Serializable {
   /**
* Gathers statistics information from `row(ordinal)`.
*/
-  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
-if (row.isNullAt(ordinal)) {
-  nullCount += 1
-  // 4 bytes for null position
-  sizeInBytes += 4
-}
+  def gatherStats(row: InternalRow, ordinal: Int): Unit
+
+  /**
+   * Gathers statistics information on `null`.
+   */
+  def gatherNullStats(): Unit = {
+nullCount += 1
+// 4 bytes for null position
+sizeInBytes += 4
 count += 1
   }
 
   /**
-   * Column statistics represented as a single row, currently including 
closed lower bound, closed
+   * Column statistics represented as an array, currently including closed 
lower bound, closed
* upper bound and null count.
*/
-  def collectedStatistics: GenericInternalRow
+  def collectedStatistics: Array[Any]
 }
 
 /**
  * A no-op ColumnStats only used for testing purposes.
  */
-private[columnar] class NoopColumnStats extends ColumnStats {
-  override def gatherStats(row: InternalRow, ordinal: Int): Unit = 
super.gatherStats(row, ordinal)
+private[columnar] final class NoopColumnStats extends ColumnStats {
+  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
+if (!row.isNullAt(ordinal)) {
+  count += 1
+} else {
+  super.gatherNullStats
--- End diff --

do we need the `super` keyword here?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

2017-05-16 Thread kiszk
GitHub user kiszk opened a pull request:

https://github.com/apache/spark/pull/18002

[SPARK-20770][SQL] Improve ColumnStats

## What changes were proposed in this pull request?

This PR improves the implementation of `ColumnStats` by using the following 
appoaches.

1. Declare subclasses of `ColumnStats` as `final` 
2. Remove unnecessary call of `row.isNullAt(ordinal)` 
3. Remove the dependency on `GenericInternalRow`

For 1., this declaration encourages method inlining and other optimizations 
of JIT compiler
For 2., in `gatherStats()`, while previous code in subclasses of 
`ColumnStats` always calls `row.isNullAt()` twice, the PR just calls 
`row.isNullAt()` only once.
For 3., `collectedStatistics()` returns `Array[Any]` instead of 
`GenericInternalRow`. This removes the dependency of unnecessary package and 
reduces the number of allocations of `GenericInternalRow`.

In addition to that, in the future, `gatherValueStats()`, which is 
specialized for each data type, can be effectively called from the generated 
code without using generic data structure `InternalRow`.

## How was this patch tested?

Tested by existing test suite

You can merge this pull request into a Git repository by running:

$ git pull https://github.com/kiszk/spark SPARK-20770

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/spark/pull/18002.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #18002






---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org