Repository: spark Updated Branches: refs/heads/branch-1.0 291567d60 -> aeffc2004
SPARK-1822: Some minor cleanup work on SchemaRDD.count() Minor cleanup following #841. Author: Reynold Xin <[email protected]> Closes #868 from rxin/schema-count and squashes the following commits: 5442651 [Reynold Xin] SPARK-1822: Some minor cleanup work on SchemaRDD.count() (cherry picked from commit d66642e3978a76977414c2fdaedebaad35662667) Signed-off-by: Reynold Xin <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aeffc200 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aeffc200 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aeffc200 Branch: refs/heads/branch-1.0 Commit: aeffc2004e2ee4b8f060c44575a90e1dbcf901e3 Parents: 291567d Author: Reynold Xin <[email protected]> Authored: Sun May 25 01:44:49 2014 -0700 Committer: Reynold Xin <[email protected]> Committed: Sun May 25 01:45:01 2014 -0700 ---------------------------------------------------------------------- python/pyspark/sql.py | 5 ++++- sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala | 8 ++++---- .../src/test/scala/org/apache/spark/sql/DslQuerySuite.scala | 2 +- sql/core/src/test/scala/org/apache/spark/sql/TestData.scala | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/aeffc200/python/pyspark/sql.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index f2001af..fa4b9c7 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -323,7 +323,10 @@ class SchemaRDD(RDD): def count(self): """ - Return the number of elements in this RDD. + Return the number of elements in this RDD. Unlike the base RDD + implementation of count, this implementation leverages the query + optimizer to compute the count on the SchemaRDD, which supports + features such as filter pushdown. >>> srdd = sqlCtx.inferSchema(rdd) >>> srdd.count() http://git-wip-us.apache.org/repos/asf/spark/blob/aeffc200/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index 452da3d..9883ebc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -276,12 +276,12 @@ class SchemaRDD( /** * :: Experimental :: - * Overriding base RDD implementation to leverage query optimizer + * Return the number of elements in the RDD. Unlike the base RDD implementation of count, this + * implementation leverages the query optimizer to compute the count on the SchemaRDD, which + * supports features such as filter pushdown. */ @Experimental - override def count(): Long = { - groupBy()(Count(Literal(1))).collect().head.getLong(0) - } + override def count(): Long = groupBy()(Count(Literal(1))).collect().head.getLong(0) /** * :: Experimental :: http://git-wip-us.apache.org/repos/asf/spark/blob/aeffc200/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala index 233132a..94ba13b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala @@ -124,7 +124,7 @@ class DslQuerySuite extends QueryTest { } test("zero count") { - assert(testData4.count() === 0) + assert(emptyTableData.count() === 0) } test("inner join where, one match per row") { http://git-wip-us.apache.org/repos/asf/spark/blob/aeffc200/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index b1eecb4..944f520 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -47,7 +47,7 @@ object TestData { (1, null) :: (2, 2) :: Nil) - val testData4 = logical.LocalRelation('a.int, 'b.int) + val emptyTableData = logical.LocalRelation('a.int, 'b.int) case class UpperCaseData(N: Int, L: String) val upperCaseData =
