git commit: SPARK-2180: support HAVING clauses in Hive queries

rxin Fri, 20 Jun 2014 13:42:06 -0700

Repository: spark
Updated Branches:
  refs/heads/master 6a224c31e -> 171ebb3a8



SPARK-2180:  support HAVING clauses in Hive queries

This PR extends Spark's HiveQL support to handle HAVING clauses in 
aggregations.  The HAVING test from the Hive compatibility suite doesn't appear 
to be runnable from within Spark, so I added a simple comparable test to 
`HiveQuerySuite`.

Author: William Benton <[email protected]>

Closes #1136 from willb/SPARK-2180 and squashes the following commits:

3bbaf26 [William Benton] Added casts to HAVING expressions
83f1340 [William Benton] scalastyle fixes
18387f1 [William Benton] Add test for HAVING without GROUP BY
b880bef [William Benton] Added semantic error for HAVING without GROUP BY
942428e [William Benton] Added test coverage for SPARK-2180.
56084cc [William Benton] Add support for HAVING clauses in Hive queries.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/171ebb3a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/171ebb3a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/171ebb3a

Branch: refs/heads/master
Commit: 171ebb3a824a577d69443ec68a3543b27914cf6d
Parents: 6a224c3
Author: William Benton <[email protected]>
Authored: Fri Jun 20 13:41:38 2014 -0700
Committer: Reynold Xin <[email protected]>
Committed: Fri Jun 20 13:41:38 2014 -0700

----------------------------------------------------------------------
 .../org/apache/spark/sql/hive/HiveQl.scala      | 30 ++++++++++++++++----
 .../sql/hive/execution/HiveQuerySuite.scala     | 29 +++++++++++++++++++
 2 files changed, 53 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/171ebb3a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index ec653ef..c69e3db 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -204,6 +204,9 @@ private[hive] object HiveQl {
   class ParseException(sql: String, cause: Throwable)
     extends Exception(s"Failed to parse: $sql", cause)
 
+  class SemanticException(msg: String)
+    extends Exception(s"Error in semantic analysis: $msg")
+
   /**
    * Returns the AST for the given SQL string.
    */
@@ -480,6 +483,7 @@ private[hive] object HiveQl {
             whereClause ::
             groupByClause ::
             orderByClause ::
+            havingClause ::
             sortByClause ::
             clusterByClause ::
             distributeByClause ::
@@ -494,6 +498,7 @@ private[hive] object HiveQl {
               "TOK_WHERE",
               "TOK_GROUPBY",
               "TOK_ORDERBY",
+              "TOK_HAVING",
               "TOK_SORTBY",
               "TOK_CLUSTERBY",
               "TOK_DISTRIBUTEBY",
@@ -576,21 +581,34 @@ private[hive] object HiveQl {
         val withDistinct =
           if (selectDistinctClause.isDefined) Distinct(withProject) else 
withProject
 
+        val withHaving = havingClause.map { h => 
+
+          if (groupByClause == None) {
+            throw new SemanticException("HAVING specified without GROUP BY")
+          }
+
+          val havingExpr = h.getChildren.toSeq match {
+            case Seq(hexpr) => nodeToExpr(hexpr)
+          }
+          
+          Filter(Cast(havingExpr, BooleanType), withDistinct)
+        }.getOrElse(withDistinct)
+
         val withSort =
           (orderByClause, sortByClause, distributeByClause, clusterByClause) 
match {
             case (Some(totalOrdering), None, None, None) =>
-              Sort(totalOrdering.getChildren.map(nodeToSortOrder), 
withDistinct)
+              Sort(totalOrdering.getChildren.map(nodeToSortOrder), withHaving)
             case (None, Some(perPartitionOrdering), None, None) =>
-              
SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder), 
withDistinct)
+              
SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder), 
withHaving)
             case (None, None, Some(partitionExprs), None) =>
-              Repartition(partitionExprs.getChildren.map(nodeToExpr), 
withDistinct)
+              Repartition(partitionExprs.getChildren.map(nodeToExpr), 
withHaving)
             case (None, Some(perPartitionOrdering), Some(partitionExprs), 
None) =>
               
SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder),
-                Repartition(partitionExprs.getChildren.map(nodeToExpr), 
withDistinct))
+                Repartition(partitionExprs.getChildren.map(nodeToExpr), 
withHaving))
             case (None, None, None, Some(clusterExprs)) =>
               
SortPartitions(clusterExprs.getChildren.map(nodeToExpr).map(SortOrder(_, 
Ascending)),
-                Repartition(clusterExprs.getChildren.map(nodeToExpr), 
withDistinct))
-            case (None, None, None, None) => withDistinct
+                Repartition(clusterExprs.getChildren.map(nodeToExpr), 
withHaving))
+            case (None, None, None, None) => withHaving
             case _ => sys.error("Unsupported set of ordering / distribution 
clauses.")
           }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/171ebb3a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 9f5cf28..8018509 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -224,6 +224,32 @@ class HiveQuerySuite extends HiveComparisonTest {
     TestHive.reset()
   }
 
+  test("SPARK-2180: HAVING support in GROUP BY clauses (positive)") {
+    val fixture = List(("foo", 2), ("bar", 1), ("foo", 4), ("bar", 3))
+      .zipWithIndex.map {case Pair(Pair(value, attr), key) => HavingRow(key, 
value, attr)}
+    
+    TestHive.sparkContext.parallelize(fixture).registerAsTable("having_test")
+    
+    val results = 
+      hql("SELECT value, max(attr) AS attr FROM having_test GROUP BY value 
HAVING attr > 3")
+      .collect()
+      .map(x => Pair(x.getString(0), x.getInt(1)))
+    
+    assert(results === Array(Pair("foo", 4)))
+    
+    TestHive.reset()
+  }
+
+  test("SPARK-2180:  HAVING without GROUP BY raises exception") {
+    intercept[Exception] {
+      hql("SELECT value, attr FROM having_test HAVING attr > 3")
+    }
+  }
+  
+  test("SPARK-2180:  HAVING with non-boolean clause raises no exceptions") {
+    val results = hql("select key, count(*) c from src group by key having 
c").collect()
+  }
+
   test("Query Hive native command execution result") {
     val tableName = "test_native_commands"
 
@@ -441,3 +467,6 @@ class HiveQuerySuite extends HiveComparisonTest {
   // since they modify /clear stuff.
 
 }
+
+// for SPARK-2180 test
+case class HavingRow(key: Int, value: String, attr: Int)

git commit: SPARK-2180: support HAVING clauses in Hive queries

Reply via email to