[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-14 Thread asfgit
Github user asfgit closed the pull request at:

https://github.com/apache/spark/pull/15360





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-14 Thread wzhfy
Github user wzhfy commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r83369611
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,50 +358,180 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") {
+    val tableName = "tbl"
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+      val stats1 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats1.sizeInBytes > 0)
+      assert(stats1.rowCount.contains(1))
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      catalog.lookupRelation(tableIndent)
+      val stats2 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats2.sizeInBytes > stats1.sizeInBytes)
+      assert(stats2.rowCount.contains(2))
+    }
+  }
+
+  private def dataAndColStats(): (DataFrame, Seq[(StructField, ColumnStat)]) = {
     import testImplicits._
 
     val intSeq = Seq(1, 2)
     val stringSeq = Seq("a", "bb")
+    val binarySeq = Seq("a", "bb").map(_.getBytes)
     val booleanSeq = Seq(true, false)
-
     val data = intSeq.indices.map { i =>
-      (intSeq(i), stringSeq(i), booleanSeq(i))
+      (intSeq(i), stringSeq(i), binarySeq(i), booleanSeq(i))
     }
-    val tableName = "table"
-    withTable(tableName) {
-      val df = data.toDF("c1", "c2", "c3")
-      df.write.format("parquet").saveAsTable(tableName)
-      val expectedColStatsSeq = df.schema.map { f =>
-        val colStat = f.dataType match {
-          case IntegerType =>
-            ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
-          case StringType =>
-            ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
-              stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
-          case BooleanType =>
-            ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
-              booleanSeq.count(_.equals(false)).toLong))
-        }
-        (f, colStat)
+    val df = data.toDF("c1", "c2", "c3", "c4")
+    val expectedColStatsSeq = df.schema.map { f =>
+      val colStat = f.dataType match {
+        case IntegerType =>
+          ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
+        case StringType =>
+          ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
+            stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
+        case BinaryType =>
+          ColumnStat(InternalRow(0L, binarySeq.map(_.length).sum / binarySeq.length.toDouble,
+            binarySeq.map(_.length).max.toLong))
+        case BooleanType =>
+          ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
+            booleanSeq.count(_.equals(false)).toLong))
       }
+      (f, colStat)
+    }
+    (df, expectedColStatsSeq)
+  }
+
+  private def checkColStats(
+      tableName: String,
+      isDataSourceTable: Boolean,
+      expectedColStatsSeq: Seq[(StructField, ColumnStat)]): Unit = {
+    val readback = spark.table(tableName)
+    val stats = readback.queryExecution.analyzed.collect {
+      case rel: MetastoreRelation =>
+        assert(!isDataSourceTable, "Expected a Hive serde table, but got a data source table")
+        rel.catalogTable.stats.get
+      case rel: LogicalRelation =>
+        assert(isDataSourceTable, "Expected a data source table, but got a Hive serde table")
+        rel.catalogTable.get.stats.get
+    }
+    assert(stats.length == 1)
+    val columnStats = stats.head.colStats
+    assert(columnStats.size == expectedColStatsSeq.length)
+    expectedColStatsSeq.foreach { case (field, expectedColStat) =>
+      StatisticsTest.checkColStat(
+        dataType = field.dataType,

[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-14 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r83368421
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,53 +358,189 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  private def getStatsBeforeAfterUpdate(isAnalyzeColumns: Boolean): (Statistics, Statistics) = {
+    val tableName = "tbl"
+    var statsBeforeUpdate: Statistics = null
+    var statsAfterUpdate: Statistics = null
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      if (isAnalyzeColumns) {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key")
+      } else {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      }
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+      statsBeforeUpdate = catalog.getCachedDataSourceTable(tableIndent)
+        .asInstanceOf[LogicalRelation].catalogTable.get.stats.get
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      if (isAnalyzeColumns) {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key")
+      } else {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      }
+      catalog.lookupRelation(tableIndent)
+      statsAfterUpdate = catalog.getCachedDataSourceTable(tableIndent)
+        .asInstanceOf[LogicalRelation].catalogTable.get.stats.get
+    }
+    (statsBeforeUpdate, statsAfterUpdate)
+  }
+
+  test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") {
+    val (statsBeforeUpdate, statsAfterUpdate) = getStatsBeforeAfterUpdate(isAnalyzeColumns = false)
+
+    assert(statsBeforeUpdate.sizeInBytes > 0)
+    assert(statsBeforeUpdate.rowCount.contains(1))
+
+    assert(statsAfterUpdate.sizeInBytes > statsBeforeUpdate.sizeInBytes)
+    assert(statsAfterUpdate.rowCount.contains(2))
+  }
+
+  test("test refreshing column stats of cached data source table by `ANALYZE TABLE` statement") {
+    val (statsBeforeUpdate, statsAfterUpdate) = getStatsBeforeAfterUpdate(isAnalyzeColumns = true)
+
+    assert(statsBeforeUpdate.sizeInBytes > 0)
+    assert(statsBeforeUpdate.rowCount.contains(1))
+    StatisticsTest.checkColStat(
+      dataType = IntegerType,
+      colStat = statsBeforeUpdate.colStats("key"),
+      expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)),
+      rsd = spark.sessionState.conf.ndvMaxError)
+
+    assert(statsAfterUpdate.sizeInBytes > statsBeforeUpdate.sizeInBytes)
+    assert(statsAfterUpdate.rowCount.contains(2))
+    StatisticsTest.checkColStat(
+      dataType = IntegerType,
+      colStat = statsAfterUpdate.colStats("key"),
+      expectedColStat = ColumnStat(InternalRow(0L, 2, 1, 2L)),
+      rsd = spark.sessionState.conf.ndvMaxError)
+  }
+
+  private def dataAndColStats(): (DataFrame, Seq[(StructField, ColumnStat)]) = {
--- End diff --

Then can we do something like:
```
private lazy val (testDataFrame, expectedStats) = {
  ...
}
```
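
For readers skimming the thread, a minimal standalone sketch of how such a lazy pattern definition behaves (the names and values below are illustrative only, not the PR's code): the right-hand side runs once, on first access, and both extracted names then reuse the cached result.

```scala
object LazyFixtureDemo {
  // Lazy pattern definition: the initializer block runs once, on first access,
  // and both extracted names then share the cached tuple.
  private lazy val (testData, expectedNdv) = {
    println("building shared fixtures once")
    val data = Seq(1, 2, 2)
    (data, data.distinct.length.toLong)
  }

  def main(args: Array[String]): Unit = {
    println(expectedNdv)   // first access triggers the single initialization
    println(testData.sum)  // later accesses reuse the cached tuple
  }
}
```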





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-14 Thread wzhfy
Github user wzhfy commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r83368215
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,53 +358,189 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  private def getStatsBeforeAfterUpdate(isAnalyzeColumns: Boolean): (Statistics, Statistics) = {
+    val tableName = "tbl"
+    var statsBeforeUpdate: Statistics = null
+    var statsAfterUpdate: Statistics = null
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      if (isAnalyzeColumns) {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key")
+      } else {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      }
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+      statsBeforeUpdate = catalog.getCachedDataSourceTable(tableIndent)
+        .asInstanceOf[LogicalRelation].catalogTable.get.stats.get
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      if (isAnalyzeColumns) {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key")
+      } else {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      }
+      catalog.lookupRelation(tableIndent)
+      statsAfterUpdate = catalog.getCachedDataSourceTable(tableIndent)
+        .asInstanceOf[LogicalRelation].catalogTable.get.stats.get
+    }
+    (statsBeforeUpdate, statsAfterUpdate)
+  }
+
+  test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") {
+    val (statsBeforeUpdate, statsAfterUpdate) = getStatsBeforeAfterUpdate(isAnalyzeColumns = false)
+
+    assert(statsBeforeUpdate.sizeInBytes > 0)
+    assert(statsBeforeUpdate.rowCount.contains(1))
+
+    assert(statsAfterUpdate.sizeInBytes > statsBeforeUpdate.sizeInBytes)
+    assert(statsAfterUpdate.rowCount.contains(2))
+  }
+
+  test("test refreshing column stats of cached data source table by `ANALYZE TABLE` statement") {
+    val (statsBeforeUpdate, statsAfterUpdate) = getStatsBeforeAfterUpdate(isAnalyzeColumns = true)
+
+    assert(statsBeforeUpdate.sizeInBytes > 0)
+    assert(statsBeforeUpdate.rowCount.contains(1))
+    StatisticsTest.checkColStat(
+      dataType = IntegerType,
+      colStat = statsBeforeUpdate.colStats("key"),
+      expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)),
+      rsd = spark.sessionState.conf.ndvMaxError)
+
+    assert(statsAfterUpdate.sizeInBytes > statsBeforeUpdate.sizeInBytes)
+    assert(statsAfterUpdate.rowCount.contains(2))
+    StatisticsTest.checkColStat(
+      dataType = IntegerType,
+      colStat = statsAfterUpdate.colStats("key"),
+      expectedColStat = ColumnStat(InternalRow(0L, 2, 1, 2L)),
+      rsd = spark.sessionState.conf.ndvMaxError)
+  }
+
+  private def dataAndColStats(): (DataFrame, Seq[(StructField, ColumnStat)]) = {
--- End diff --

They share some common values, e.g. intSeq, stringSeq..., so I put them in a single method.





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-13 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r83366673
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,50 +358,180 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") {
+    val tableName = "tbl"
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+      val stats1 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats1.sizeInBytes > 0)
+      assert(stats1.rowCount.contains(1))
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      catalog.lookupRelation(tableIndent)
+      val stats2 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats2.sizeInBytes > stats1.sizeInBytes)
+      assert(stats2.rowCount.contains(2))
+    }
+  }
+
+  private def dataAndColStats(): (DataFrame, Seq[(StructField, ColumnStat)]) = {
     import testImplicits._
 
     val intSeq = Seq(1, 2)
     val stringSeq = Seq("a", "bb")
+    val binarySeq = Seq("a", "bb").map(_.getBytes)
     val booleanSeq = Seq(true, false)
-
     val data = intSeq.indices.map { i =>
-      (intSeq(i), stringSeq(i), booleanSeq(i))
+      (intSeq(i), stringSeq(i), binarySeq(i), booleanSeq(i))
     }
-    val tableName = "table"
-    withTable(tableName) {
-      val df = data.toDF("c1", "c2", "c3")
-      df.write.format("parquet").saveAsTable(tableName)
-      val expectedColStatsSeq = df.schema.map { f =>
-        val colStat = f.dataType match {
-          case IntegerType =>
-            ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
-          case StringType =>
-            ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
-              stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
-          case BooleanType =>
-            ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
-              booleanSeq.count(_.equals(false)).toLong))
-        }
-        (f, colStat)
+    val df = data.toDF("c1", "c2", "c3", "c4")
+    val expectedColStatsSeq = df.schema.map { f =>
+      val colStat = f.dataType match {
+        case IntegerType =>
+          ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
+        case StringType =>
+          ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
+            stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
+        case BinaryType =>
+          ColumnStat(InternalRow(0L, binarySeq.map(_.length).sum / binarySeq.length.toDouble,
+            binarySeq.map(_.length).max.toLong))
+        case BooleanType =>
+          ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
+            booleanSeq.count(_.equals(false)).toLong))
      }
+      (f, colStat)
+    }
+    (df, expectedColStatsSeq)
+  }
+
+  private def checkColStats(
+      tableName: String,
+      isDataSourceTable: Boolean,
+      expectedColStatsSeq: Seq[(StructField, ColumnStat)]): Unit = {
+    val readback = spark.table(tableName)
+    val stats = readback.queryExecution.analyzed.collect {
+      case rel: MetastoreRelation =>
+        assert(!isDataSourceTable, "Expected a Hive serde table, but got a data source table")
+        rel.catalogTable.stats.get
+      case rel: LogicalRelation =>
+        assert(isDataSourceTable, "Expected a data source table, but got a Hive serde table")
+        rel.catalogTable.get.stats.get
+    }
+    assert(stats.length == 1)
+    val columnStats = stats.head.colStats
+    assert(columnStats.size == expectedColStatsSeq.length)
+    expectedColStatsSeq.foreach { case (field, expectedColStat) =>
+      StatisticsTest.checkColStat(
+        dataType = field.dataType,

[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-13 Thread cloud-fan
Github user cloud-fan commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r83365814
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,53 +358,189 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  private def getStatsBeforeAfterUpdate(isAnalyzeColumns: Boolean): (Statistics, Statistics) = {
+    val tableName = "tbl"
+    var statsBeforeUpdate: Statistics = null
+    var statsAfterUpdate: Statistics = null
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      if (isAnalyzeColumns) {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key")
+      } else {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      }
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+      statsBeforeUpdate = catalog.getCachedDataSourceTable(tableIndent)
+        .asInstanceOf[LogicalRelation].catalogTable.get.stats.get
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      if (isAnalyzeColumns) {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key")
+      } else {
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      }
+      catalog.lookupRelation(tableIndent)
+      statsAfterUpdate = catalog.getCachedDataSourceTable(tableIndent)
+        .asInstanceOf[LogicalRelation].catalogTable.get.stats.get
+    }
+    (statsBeforeUpdate, statsAfterUpdate)
+  }
+
+  test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") {
+    val (statsBeforeUpdate, statsAfterUpdate) = getStatsBeforeAfterUpdate(isAnalyzeColumns = false)
+
+    assert(statsBeforeUpdate.sizeInBytes > 0)
+    assert(statsBeforeUpdate.rowCount.contains(1))
--- End diff --

nit: we should not use `Option` as a collection; compare it more explicitly, e.g. `statsBeforeUpdate.rowCount == Some(1)`. BTW `Option.contains` is not in Scala 2.10.
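
A tiny standalone illustration of the two styles (the values are made up, not taken from the test):

```scala
object OptionCheckDemo {
  def main(args: Array[String]): Unit = {
    val rowCount: Option[Int] = Some(1)

    // Option treated like a collection; Option.contains only exists since Scala 2.11.
    println(rowCount.contains(1))  // true

    // The more explicit comparison suggested above; also compiles on Scala 2.10.
    println(rowCount == Some(1))   // true
  }
}
```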





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-12 Thread wzhfy
Github user wzhfy commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82956403
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,53 +358,189 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  private def statsBeforeAfterUpdate(isAnalyzeTable: Boolean): (Statistics, Statistics) = {
--- End diff --

@gatorsmile OK





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-12 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82947471
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,53 +358,189 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  private def statsBeforeAfterUpdate(isAnalyzeTable: Boolean): (Statistics, Statistics) = {
--- End diff --

`Analyze Table COMPUTE STATISTICS FOR COLUMNS` is also `Analyze Table`. Thus, the input param name is confusing. How about renaming `isAnalyzeTable` -> `isAnalyzeColumns`?






[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-11 Thread wzhfy
Github user wzhfy commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82923593
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,50 +358,180 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") {
+    val tableName = "tbl"
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+      val stats1 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats1.sizeInBytes > 0)
+      assert(stats1.rowCount.contains(1))
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      catalog.lookupRelation(tableIndent)
+      val stats2 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats2.sizeInBytes > stats1.sizeInBytes)
+      assert(stats2.rowCount.contains(2))
+    }
+  }
+
+  private def dataAndColStats(): (DataFrame, Seq[(StructField, ColumnStat)]) = {
     import testImplicits._
 
     val intSeq = Seq(1, 2)
     val stringSeq = Seq("a", "bb")
+    val binarySeq = Seq("a", "bb").map(_.getBytes)
     val booleanSeq = Seq(true, false)
-
     val data = intSeq.indices.map { i =>
-      (intSeq(i), stringSeq(i), booleanSeq(i))
+      (intSeq(i), stringSeq(i), binarySeq(i), booleanSeq(i))
     }
-    val tableName = "table"
-    withTable(tableName) {
-      val df = data.toDF("c1", "c2", "c3")
-      df.write.format("parquet").saveAsTable(tableName)
-      val expectedColStatsSeq = df.schema.map { f =>
-        val colStat = f.dataType match {
-          case IntegerType =>
-            ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
-          case StringType =>
-            ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
-              stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
-          case BooleanType =>
-            ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
-              booleanSeq.count(_.equals(false)).toLong))
-        }
-        (f, colStat)
+    val df = data.toDF("c1", "c2", "c3", "c4")
+    val expectedColStatsSeq = df.schema.map { f =>
+      val colStat = f.dataType match {
+        case IntegerType =>
+          ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
+        case StringType =>
+          ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
+            stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
+        case BinaryType =>
+          ColumnStat(InternalRow(0L, binarySeq.map(_.length).sum / binarySeq.length.toDouble,
+            binarySeq.map(_.length).max.toLong))
+        case BooleanType =>
+          ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
+            booleanSeq.count(_.equals(false)).toLong))
      }
+      (f, colStat)
+    }
+    (df, expectedColStatsSeq)
+  }
+
+  private def checkColStats(
+      tableName: String,
+      isDataSourceTable: Boolean,
+      expectedColStatsSeq: Seq[(StructField, ColumnStat)]): Unit = {
+    val readback = spark.table(tableName)
+    val stats = readback.queryExecution.analyzed.collect {
+      case rel: MetastoreRelation =>
+        assert(!isDataSourceTable, "Expected a Hive serde table, but got a data source table")
+        rel.catalogTable.stats.get
+      case rel: LogicalRelation =>
+        assert(isDataSourceTable, "Expected a data source table, but got a Hive serde table")
+        rel.catalogTable.get.stats.get
+    }
+    assert(stats.length == 1)
+    val columnStats = stats.head.colStats
+    assert(columnStats.size == expectedColStatsSeq.length)
+    expectedColStatsSeq.foreach { case (field, expectedColStat) =>
+      StatisticsTest.checkColStat(
+        dataType = field.dataType,

[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-10 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82731979
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,50 +358,180 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") {
--- End diff --

Could you deduplicate the two test cases `refreshing table stats` and `refreshing column stats` by calling the same common function?





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-10 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82731661
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala ---
@@ -62,7 +62,7 @@ case class AnalyzeColumnCommand(
       val statistics = Statistics(
         sizeInBytes = newTotalSize,
         rowCount = Some(rowCount),
-        colStats = columnStats ++ catalogTable.stats.map(_.colStats).getOrElse(Map()))
+        colStats = catalogTable.stats.map(_.colStats).getOrElse(Map()) ++ columnStats)
--- End diff --

Could you leave a code comment here to emphasize it? I am just afraid this might be modified without notice. Newly computed stats should override the existing stats.
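
For context, a small standalone sketch of why the operand order matters (stand-in values only, not the real stats types): with `++` on immutable Scala maps, entries from the right-hand operand win on duplicate keys, so putting the freshly computed `columnStats` last is what lets them override the stats already stored in the catalog.

```scala
object StatsMergeDemo {
  def main(args: Array[String]): Unit = {
    // Stand-ins for the catalog's existing column stats and the newly computed ones.
    val existingColStats = Map("key" -> "ndv=1")
    val newColStats = Map("key" -> "ndv=2", "value" -> "ndv=5")

    // The right operand of ++ wins on key collisions, so the new stats override the old.
    println(existingColStats ++ newColStats)  // Map(key -> ndv=2, value -> ndv=5)

    // The previous operand order would have kept the stale entry for "key".
    println(newColStats ++ existingColStats)  // Map(key -> ndv=1, value -> ndv=5)
  }
}
```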





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-10 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82730881
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,50 +358,180 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") {
+    val tableName = "tbl"
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+      val stats1 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats1.sizeInBytes > 0)
+      assert(stats1.rowCount.contains(1))
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      catalog.lookupRelation(tableIndent)
+      val stats2 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats2.sizeInBytes > stats1.sizeInBytes)
+      assert(stats2.rowCount.contains(2))
+    }
+  }
+
+  private def dataAndColStats(): (DataFrame, Seq[(StructField, ColumnStat)]) = {
     import testImplicits._
 
     val intSeq = Seq(1, 2)
     val stringSeq = Seq("a", "bb")
+    val binarySeq = Seq("a", "bb").map(_.getBytes)
     val booleanSeq = Seq(true, false)
-
     val data = intSeq.indices.map { i =>
-      (intSeq(i), stringSeq(i), booleanSeq(i))
+      (intSeq(i), stringSeq(i), binarySeq(i), booleanSeq(i))
     }
-    val tableName = "table"
-    withTable(tableName) {
-      val df = data.toDF("c1", "c2", "c3")
-      df.write.format("parquet").saveAsTable(tableName)
-      val expectedColStatsSeq = df.schema.map { f =>
-        val colStat = f.dataType match {
-          case IntegerType =>
-            ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
-          case StringType =>
-            ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
-              stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
-          case BooleanType =>
-            ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
-              booleanSeq.count(_.equals(false)).toLong))
-        }
-        (f, colStat)
+    val df = data.toDF("c1", "c2", "c3", "c4")
+    val expectedColStatsSeq = df.schema.map { f =>
+      val colStat = f.dataType match {
+        case IntegerType =>
+          ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
+        case StringType =>
+          ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
+            stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
+        case BinaryType =>
+          ColumnStat(InternalRow(0L, binarySeq.map(_.length).sum / binarySeq.length.toDouble,
+            binarySeq.map(_.length).max.toLong))
+        case BooleanType =>
+          ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
+            booleanSeq.count(_.equals(false)).toLong))
      }
+      (f, colStat)
+    }
+    (df, expectedColStatsSeq)
+  }
+
+  private def checkColStats(
+      tableName: String,
+      isDataSourceTable: Boolean,
+      expectedColStatsSeq: Seq[(StructField, ColumnStat)]): Unit = {
+    val readback = spark.table(tableName)
+    val stats = readback.queryExecution.analyzed.collect {
+      case rel: MetastoreRelation =>
+        assert(!isDataSourceTable, "Expected a Hive serde table, but got a data source table")
+        rel.catalogTable.stats.get
+      case rel: LogicalRelation =>
+        assert(isDataSourceTable, "Expected a data source table, but got a Hive serde table")
+        rel.catalogTable.get.stats.get
+    }
+    assert(stats.length == 1)
+    val columnStats = stats.head.colStats
+    assert(columnStats.size == expectedColStatsSeq.length)
+    expectedColStatsSeq.foreach { case (field, expectedColStat) =>
+      StatisticsTest.checkColStat(
+        dataType = field.dataType,

[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-10 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82730527
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -358,50 +358,180 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
-  test("generate column-level statistics and load them from hive metastore") {
+  test("test refreshing table stats of cached data source table by `ANALYZE TABLE` statement") {
+    val tableName = "tbl"
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+      val stats1 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats1.sizeInBytes > 0)
+      assert(stats1.rowCount.contains(1))
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      catalog.lookupRelation(tableIndent)
+      val stats2 = catalog.getCachedDataSourceTable(tableIndent).asInstanceOf[LogicalRelation]
+        .catalogTable.get.stats.get
+      assert(stats2.sizeInBytes > stats1.sizeInBytes)
+      assert(stats2.rowCount.contains(2))
+    }
+  }
+
+  private def dataAndColStats(): (DataFrame, Seq[(StructField, ColumnStat)]) = {
     import testImplicits._
 
     val intSeq = Seq(1, 2)
     val stringSeq = Seq("a", "bb")
+    val binarySeq = Seq("a", "bb").map(_.getBytes)
     val booleanSeq = Seq(true, false)
-
     val data = intSeq.indices.map { i =>
-      (intSeq(i), stringSeq(i), booleanSeq(i))
+      (intSeq(i), stringSeq(i), binarySeq(i), booleanSeq(i))
     }
-    val tableName = "table"
-    withTable(tableName) {
-      val df = data.toDF("c1", "c2", "c3")
-      df.write.format("parquet").saveAsTable(tableName)
-      val expectedColStatsSeq = df.schema.map { f =>
-        val colStat = f.dataType match {
-          case IntegerType =>
-            ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
-          case StringType =>
-            ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
-              stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
-          case BooleanType =>
-            ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
-              booleanSeq.count(_.equals(false)).toLong))
-        }
-        (f, colStat)
+    val df = data.toDF("c1", "c2", "c3", "c4")
+    val expectedColStatsSeq = df.schema.map { f =>
+      val colStat = f.dataType match {
+        case IntegerType =>
+          ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong))
+        case StringType =>
+          ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble,
+            stringSeq.map(_.length).max.toLong, stringSeq.distinct.length.toLong))
+        case BinaryType =>
+          ColumnStat(InternalRow(0L, binarySeq.map(_.length).sum / binarySeq.length.toDouble,
+            binarySeq.map(_.length).max.toLong))
+        case BooleanType =>
+          ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong,
+            booleanSeq.count(_.equals(false)).toLong))
      }
+      (f, colStat)
+    }
+    (df, expectedColStatsSeq)
+  }
+
+  private def checkColStats(
+      tableName: String,
+      isDataSourceTable: Boolean,
+      expectedColStatsSeq: Seq[(StructField, ColumnStat)]): Unit = {
+    val readback = spark.table(tableName)
+    val stats = readback.queryExecution.analyzed.collect {
+      case rel: MetastoreRelation =>
+        assert(!isDataSourceTable, "Expected a Hive serde table, but got a data source table")
+        rel.catalogTable.stats.get
+      case rel: LogicalRelation =>
+        assert(isDataSourceTable, "Expected a data source table, but got a Hive serde table")
+        rel.catalogTable.get.stats.get
+    }
+    assert(stats.length == 1)
+    val columnStats = stats.head.colStats
+    assert(columnStats.size == expectedColStatsSeq.length)
+    expectedColStatsSeq.foreach { case (field, expectedColStat) =>
+      StatisticsTest.checkColStat(
+        dataType = field.dataType,

[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-07 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82431001
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala ---
@@ -62,7 +62,7 @@ case class AnalyzeColumnCommand(
       val statistics = Statistics(
         sizeInBytes = newTotalSize,
         rowCount = Some(rowCount),
-        colStats = columnStats ++ catalogTable.stats.map(_.colStats).getOrElse(Map()))
+        colStats = catalogTable.stats.map(_.colStats).getOrElse(Map()) ++ columnStats)
--- End diff --

:) Improving the test case coverage is important. 





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-07 Thread wzhfy
Github user wzhfy commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82397073
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala ---
@@ -62,7 +62,7 @@ case class AnalyzeColumnCommand(
       val statistics = Statistics(
         sizeInBytes = newTotalSize,
         rowCount = Some(rowCount),
-        colStats = columnStats ++ catalogTable.stats.map(_.colStats).getOrElse(Map()))
+        colStats = catalogTable.stats.map(_.colStats).getOrElse(Map()) ++ columnStats)
--- End diff --

yes





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-06 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82297698
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -405,6 +405,78 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
+  test("check column statistics for case sensitive columns") {
+    val tableName = "tbl"
+    // scalastyle:off
+    // non ascii characters are not allowed in the source code, so we disable the scalastyle.
+    val columnGroups: Seq[(String, String)] = Seq(("c1", "C1"), ("列c", "列C"))
+    // scalastyle:on
+    columnGroups.foreach { case (column1, column2) =>
+      withTable(tableName) {
+        withSQLConf("spark.sql.caseSensitive" -> "true") {
+          sql(s"CREATE TABLE $tableName (`$column1` int, `$column2` double) USING PARQUET")
+          sql(s"INSERT INTO $tableName SELECT 1, 3.0")
+          sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS `$column1`, `$column2`")
+          val readback = spark.table(tableName)
+          val relations = readback.queryExecution.analyzed.collect { case rel: LogicalRelation =>
+            val columnStats = rel.catalogTable.get.stats.get.colStats
+            assert(columnStats.size == 2)
+            StatisticsTest.checkColStat(
+              dataType = IntegerType,
+              colStat = columnStats(column1),
+              expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)),
+              rsd = spark.sessionState.conf.ndvMaxError)
+            StatisticsTest.checkColStat(
+              dataType = DoubleType,
+              colStat = columnStats(column2),
+              expectedColStat = ColumnStat(InternalRow(0L, 3.0d, 3.0d, 1L)),
+              rsd = spark.sessionState.conf.ndvMaxError)
+            rel
+          }
+          assert(relations.size == 1)
+        }
+      }
+    }
+  }
+
+  test("test refreshing statistics of cached data source table") {
+    val tableName = "tbl"
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key")
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+
+      val cachedTable1 = catalog.getCachedDataSourceTable(tableIndent)
+      assert(cachedTable1.statistics.sizeInBytes > 0)
+      assert(cachedTable1.statistics.rowCount.contains(1))
+      StatisticsTest.checkColStat(
+        dataType = IntegerType,
+        colStat = cachedTable1.statistics.colStats("key"),
+        expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)),
+        rsd = spark.sessionState.conf.ndvMaxError)
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key")
--- End diff --

Both of the above DDL statements will call `refreshTable` with the same table name, right? So if the source code dropped either `refreshTable` call, the test case would still pass, right?





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-06 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82297526
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -405,6 +405,78 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
+  test("check column statistics for case sensitive columns") {
+    val tableName = "tbl"
+    // scalastyle:off
+    // non ascii characters are not allowed in the source code, so we disable the scalastyle.
+    val columnGroups: Seq[(String, String)] = Seq(("c1", "C1"), ("列c", "列C"))
+    // scalastyle:on
+    columnGroups.foreach { case (column1, column2) =>
+      withTable(tableName) {
+        withSQLConf("spark.sql.caseSensitive" -> "true") {
+          sql(s"CREATE TABLE $tableName (`$column1` int, `$column2` double) USING PARQUET")
+          sql(s"INSERT INTO $tableName SELECT 1, 3.0")
+          sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS `$column1`, `$column2`")
+          val readback = spark.table(tableName)
+          val relations = readback.queryExecution.analyzed.collect { case rel: LogicalRelation =>
+            val columnStats = rel.catalogTable.get.stats.get.colStats
+            assert(columnStats.size == 2)
+            StatisticsTest.checkColStat(
+              dataType = IntegerType,
+              colStat = columnStats(column1),
+              expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)),
+              rsd = spark.sessionState.conf.ndvMaxError)
+            StatisticsTest.checkColStat(
+              dataType = DoubleType,
+              colStat = columnStats(column2),
+              expectedColStat = ColumnStat(InternalRow(0L, 3.0d, 3.0d, 1L)),
+              rsd = spark.sessionState.conf.ndvMaxError)
+            rel
+          }
+          assert(relations.size == 1)
+        }
+      }
+    }
+  }
+
+  test("test refreshing statistics of cached data source table") {
+    val tableName = "tbl"
+    withTable(tableName) {
+      val tableIndent = TableIdentifier(tableName, Some("default"))
+      val catalog = spark.sessionState.catalog.asInstanceOf[HiveSessionCatalog]
+      sql(s"CREATE TABLE $tableName (key int) USING PARQUET")
+      sql(s"INSERT INTO $tableName SELECT 1")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key")
+      // Table lookup will make the table cached.
+      catalog.lookupRelation(tableIndent)
+
+      val cachedTable1 = catalog.getCachedDataSourceTable(tableIndent)
+      assert(cachedTable1.statistics.sizeInBytes > 0)
+      assert(cachedTable1.statistics.rowCount.contains(1))
+      StatisticsTest.checkColStat(
+        dataType = IntegerType,
+        colStat = cachedTable1.statistics.colStats("key"),
+        expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)),
+        rsd = spark.sessionState.conf.ndvMaxError)
+
+      sql(s"INSERT INTO $tableName SELECT 2")
+      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
--- End diff --

What is the purpose of this DDL?





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-06 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82293578
  
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala ---
@@ -62,7 +62,7 @@ case class AnalyzeColumnCommand(
       val statistics = Statistics(
         sizeInBytes = newTotalSize,
         rowCount = Some(rowCount),
-        colStats = columnStats ++ catalogTable.stats.map(_.colStats).getOrElse(Map()))
+        colStats = catalogTable.stats.map(_.colStats).getOrElse(Map()) ++ columnStats)
--- End diff --

Is this a bug exposed by the newly added test case?





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-06 Thread gatorsmile
Github user gatorsmile commented on a diff in the pull request:

https://github.com/apache/spark/pull/15360#discussion_r82279052
  
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -405,6 +405,78 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
     }
   }
 
+  test("check column statistics for case sensitive columns") {
+    val tableName = "tbl"
+    // scalastyle:off
+    // non ascii characters are not allowed in the source code, so we disable the scalastyle.
+    val columnGroups: Seq[(String, String)] = Seq(("c1", "C1"), ("列c", "列C"))
+    // scalastyle:on
+    columnGroups.foreach { case (column1, column2) =>
--- End diff --

Could you create a separate function for the following checking logic? Then you can have two test cases without duplicated code.
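
A rough sketch of the kind of extraction being requested, assuming it lives inside StatisticsSuite next to the quoted code; the helper name is made up and the read-back assertions are elided, so this is an illustration, not the final PR code:

```scala
// Hypothetical helper: the per-column-pair checking logic from the quoted diff moves
// here, parameterized by the two column names.
private def checkCaseSensitiveColStats(column1: String, column2: String): Unit = {
  val tableName = "tbl"
  withTable(tableName) {
    withSQLConf("spark.sql.caseSensitive" -> "true") {
      sql(s"CREATE TABLE $tableName (`$column1` int, `$column2` double) USING PARQUET")
      sql(s"INSERT INTO $tableName SELECT 1, 3.0")
      sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS `$column1`, `$column2`")
      // ... the same readback and StatisticsTest.checkColStat assertions as in the diff ...
    }
  }
}

// Each column pair then becomes its own test case without duplicating the body.
test("column statistics for case sensitive ASCII columns") {
  checkCaseSensitiveColStats("c1", "C1")
}
```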





[GitHub] spark pull request #15360: [SPARK-17073] [SQL] [FOLLOWUP] generate column-le...

2016-10-04 Thread wzhfy
GitHub user wzhfy opened a pull request:

https://github.com/apache/spark/pull/15360

[SPARK-17073] [SQL] [FOLLOWUP] generate column-level statistics

## What changes were proposed in this pull request?
This PR adds some test cases for statistics: case-sensitive column names, non-ASCII column names, and refreshing table stats. It also improves some documentation.

## How was this patch tested?
Added test cases.


You can merge this pull request into a Git repository by running:

$ git pull https://github.com/wzhfy/spark colStats2

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/spark/pull/15360.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #15360


commit 0ad7c8837d0ef860e398349652f7589870358c14
Author: Zhenhua Wang 
Date:   2016-10-05T06:06:56Z

add test cases and improve documentation



