Repository: spark
Updated Branches:
  refs/heads/branch-1.4 0dbfe1681 -> eaa611620
[SPARK-7462][SQL] Update documentation for retaining grouping columns in DataFrames.

Author: Reynold Xin <r...@databricks.com>

Closes #6062 from rxin/agg-retain-doc and squashes the following commits:

43e511e [Reynold Xin] [SPARK-7462][SQL] Update documentation for retaining grouping columns in DataFrames.

(cherry picked from commit 3a9b6997df3fef1052d8c410f32319018c52acff)
Signed-off-by: Reynold Xin <r...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eaa61162
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eaa61162
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eaa61162

Branch: refs/heads/branch-1.4
Commit: eaa61162003fdaf345e8ba04987e0edcee03ca8e
Parents: 0dbfe16
Author: Reynold Xin <r...@databricks.com>
Authored: Mon May 11 18:07:12 2015 -0700
Committer: Reynold Xin <r...@databricks.com>
Committed: Mon May 11 18:07:19 2015 -0700

----------------------------------------------------------------------
 docs/sql-programming-guide.md              | 60 +++++++++++++++++++-
 python/pyspark/sql/_types.py               |  2 +
 .../org/apache/spark/sql/GroupedData.scala | 14 ++++-
 3 files changed, 73 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/eaa61162/docs/sql-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 6af1043..6b7b867 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1594,6 +1594,64 @@ options.
 
 # Migration Guide
 
+## Upgrading from Spark SQL 1.3 to 1.4
+
+Based on user feedback, we changed the default behavior of `DataFrame.groupBy().agg()` to retain the grouping columns in the resulting `DataFrame`. To keep the behavior in 1.3, set `spark.sql.retainGroupColumns` to `false`.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+
+// In 1.3.x, in order for the grouping column "department" to show up,
+// it must be included explicitly as part of the agg function call.
+df.groupBy("department").agg($"department", max("age"), sum("expense"))
+
+// In 1.4+, grouping column "department" is included automatically.
+df.groupBy("department").agg(max("age"), sum("expense"))
+
+// Revert to 1.3 behavior (not retaining grouping column) by:
+sqlContext.setConf("spark.sql.retainGroupColumns", "false")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+
+// In 1.3.x, in order for the grouping column "department" to show up,
+// it must be included explicitly as part of the agg function call.
+df.groupBy("department").agg(col("department"), max("age"), sum("expense"));
+
+// In 1.4+, grouping column "department" is included automatically.
+df.groupBy("department").agg(max("age"), sum("expense"));
+
+// Revert to 1.3 behavior (not retaining grouping column) by:
+sqlContext.setConf("spark.sql.retainGroupColumns", "false");
+
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+
+import pyspark.sql.functions as func
+
+# In 1.3.x, in order for the grouping column "department" to show up,
+# it must be included explicitly as part of the agg function call.
+df.groupBy("department").agg(df["department"], func.max("age"), func.sum("expense"))
+
+# In 1.4+, grouping column "department" is included automatically.
+df.groupBy("department").agg(func.max("age"), func.sum("expense"))
+
+# Revert to 1.3.x behavior (not retaining grouping column) by:
+sqlContext.setConf("spark.sql.retainGroupColumns", "false")
+
+{% endhighlight %}
+</div>
+
+</div>
+
+
 ## Upgrading from Spark SQL 1.0-1.2 to 1.3
 
 In Spark 1.3 we removed the "Alpha" label from Spark SQL and as part of this did a cleanup of the
@@ -1651,7 +1709,7 @@ moved into the udf object in `SQLContext`.
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
-{% highlight java %}
+{% highlight scala %}
 
 sqlContext.udf.register("strLen", (s: String) => s.length())
 
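For anyone trying the migration note above end to end, here is a minimal sketch of the new 1.4 default and the revert switch. It assumes a running SQLContext bound to `sqlContext`; the DataFrame contents below are invented for illustration:

import pyspark.sql.functions as func

# Hypothetical sample data with the three columns used in the guide.
df = sqlContext.createDataFrame(
    [("sales", 30, 100.0), ("sales", 40, 250.0), ("hr", 35, 80.0)],
    ["department", "age", "expense"])

# 1.4 default: "department" shows up in the result even though it is
# not listed inside agg().
df.groupBy("department").agg(func.max("age"), func.sum("expense")).show()

# After flipping the config, the same query returns only the aggregates,
# matching the 1.3 behavior.
sqlContext.setConf("spark.sql.retainGroupColumns", "false")
df.groupBy("department").agg(func.max("age"), func.sum("expense")).show()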
http://git-wip-us.apache.org/repos/asf/spark/blob/eaa61162/python/pyspark/sql/_types.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/_types.py b/python/pyspark/sql/_types.py
index fd98e11..b96851a 100644
--- a/python/pyspark/sql/_types.py
+++ b/python/pyspark/sql/_types.py
@@ -1228,12 +1228,14 @@ class Row(tuple):
             raise AttributeError(item)
 
     def __reduce__(self):
+        """Returns a tuple so Python knows how to pickle Row."""
         if hasattr(self, "__fields__"):
             return (_create_row, (self.__fields__, tuple(self)))
         else:
             return tuple.__reduce__(self)
 
     def __repr__(self):
+        """Printable representation of Row used in Python REPL."""
         if hasattr(self, "__fields__"):
             return "Row(%s)" % ", ".join("%s=%r" % (k, v)
                                          for k, v in zip(self.__fields__, tuple(self)))


http://git-wip-us.apache.org/repos/asf/spark/blob/eaa61162/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
index 003a620..543320e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala
@@ -146,11 +146,21 @@ class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression])
    *
    *   // Scala:
    *   import org.apache.spark.sql.functions._
-   *   df.groupBy("department").agg($"department", max($"age"), sum($"expense"))
+   *   df.groupBy("department").agg(max("age"), sum("expense"))
    *
    *   // Java:
    *   import static org.apache.spark.sql.functions.*;
-   *   df.groupBy("department").agg(col("department"), max(col("age")), sum(col("expense")));
+   *   df.groupBy("department").agg(max("age"), sum("expense"));
+   * }}}
+   *
+   * Note that before Spark 1.4, the default behavior is to NOT retain grouping columns. To change
+   * to that behavior, set config variable `spark.sql.retainGroupColumns` to `false`.
+   * {{{
+   *   // Scala, 1.3.x:
+   *   df.groupBy("department").agg($"department", max("age"), sum("expense"))
+   *
+   *   // Java, 1.3.x:
+   *   df.groupBy("department").agg(col("department"), max("age"), sum("expense"));
+   * }}}
    */
  @scala.annotation.varargs
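The two docstrings added in _types.py document behavior that is easy to verify in isolation: Row pickles through __reduce__ and prints through __repr__, with no SparkContext required. A small sketch (the field names are invented):

import pickle
from pyspark.sql import Row

row = Row(name="Alice", age=5)              # keyword args populate __fields__
restored = pickle.loads(pickle.dumps(row))  # pickle round-trips via __reduce__

assert restored == row                      # tuple contents survive
assert restored.name == "Alice"             # field names survive too
print(repr(restored))                       # e.g. Row(age=5, name='Alice')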