Repository: spark
Updated Branches:
  refs/heads/master d68ea24d6 -> 17791a581
[SPARK-7783] [SQL] [PySpark] add DataFrame.rollup/cube in Python

Author: Davies Liu <[email protected]>

Closes #6311 from davies/rollup and squashes the following commits:

0261db1 [Davies Liu] use @since
a51ca6b [Davies Liu] Merge branch 'master' of github.com:apache/spark into rollup
8ad5af4 [Davies Liu] Update dataframe.py
ade3841 [Davies Liu] add DataFrame.rollup/cube in Python

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/17791a58
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/17791a58
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/17791a58

Branch: refs/heads/master
Commit: 17791a58159b3e4619d0367f54a4c5332342658b
Parents: d68ea24
Author: Davies Liu <[email protected]>
Authored: Thu May 21 17:43:08 2015 -0700
Committer: Reynold Xin <[email protected]>
Committed: Thu May 21 17:43:08 2015 -0700

----------------------------------------------------------------------
 python/pyspark/sql/dataframe.py | 48 ++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/17791a58/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 3fc7d00..132db90 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -801,9 +801,53 @@ class DataFrame(object):
         >>> df.groupBy(['name', df.age]).count().collect()
         [Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)]
         """
-        jdf = self._jdf.groupBy(self._jcols(*cols))
+        jgd = self._jdf.groupBy(self._jcols(*cols))
         from pyspark.sql.group import GroupedData
-        return GroupedData(jdf, self.sql_ctx)
+        return GroupedData(jgd, self.sql_ctx)
+
+    @since(1.4)
+    def rollup(self, *cols):
+        """
+        Create a multi-dimensional rollup for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.rollup('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+        """
+        jgd = self._jdf.rollup(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jgd, self.sql_ctx)
+
+    @since(1.4)
+    def cube(self, *cols):
+        """
+        Create a multi-dimensional cube for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.cube('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        | null|   2|    1|
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|   5|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+        """
+        jgd = self._jdf.cube(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jgd, self.sql_ctx)
 
     @since(1.3)
     def agg(self, *exprs):


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
