Repository: spark Updated Branches: refs/heads/branch-1.5 829c33a4b -> afaed7ef4
[SPARK-10073] [SQL] Python withColumn should replace the old column DataFrame.withColumn in Python should be consistent with the Scala one (replacing the existing column that has the same name). cc marmbrus Author: Davies Liu <[email protected]> Closes #8300 from davies/with_column. (cherry picked from commit 08887369c890e0dd87eb8b34e8c32bb03307bf24) Signed-off-by: Michael Armbrust <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/afaed7ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/afaed7ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/afaed7ef Branch: refs/heads/branch-1.5 Commit: afaed7ef49751e2002b84da25abe08fb8987372c Parents: 829c33a Author: Davies Liu <[email protected]> Authored: Wed Aug 19 13:56:40 2015 -0700 Committer: Michael Armbrust <[email protected]> Committed: Wed Aug 19 13:56:54 2015 -0700 ---------------------------------------------------------------------- python/pyspark/sql/dataframe.py | 12 ++++++------ python/pyspark/sql/tests.py | 4 ++++ .../src/main/scala/org/apache/spark/sql/DataFrame.scala | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/afaed7ef/python/pyspark/sql/dataframe.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index da742d7..025811f 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1202,7 +1202,9 @@ class DataFrame(object): @ignore_unicode_prefix @since(1.3) def withColumn(self, colName, col): - """Returns a new :class:`DataFrame` by adding a column. + """ + Returns a new :class:`DataFrame` by adding a column or replacing the + existing column that has the same name. :param colName: string, name of the new column. :param col: a :class:`Column` expression for the new column. @@ -1210,7 +1212,8 @@ class DataFrame(object): >>> df.withColumn('age2', df.age + 2).collect() [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] """ - return self.select('*', col.alias(colName)) + assert isinstance(col, Column), "col should be Column" + return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) @ignore_unicode_prefix @since(1.3) @@ -1223,10 +1226,7 @@ class DataFrame(object): >>> df.withColumnRenamed('age', 'age2').collect() [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')] """ - cols = [Column(_to_java_column(c)).alias(new) - if c == existing else c - for c in self.columns] - return self.select(*cols) + return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx) @since(1.4) @ignore_unicode_prefix http://git-wip-us.apache.org/repos/asf/spark/blob/afaed7ef/python/pyspark/sql/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 13cf647..aacfb34 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1035,6 +1035,10 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values", lambda: df.select(sha2(df.a, 1024)).collect()) + def test_with_column_with_existing_name(self): + keys = self.df.withColumn("key", self.df.key).select("key").collect() + self.assertEqual([r.key for r in keys], list(range(100))) + class HiveContextSQLTests(ReusedPySparkTestCase): http://git-wip-us.apache.org/repos/asf/spark/blob/afaed7ef/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index ec5084a..5bed299 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1133,7 +1133,8 @@ class DataFrame private[sql]( ///////////////////////////////////////////////////////////////////////////// /** - * Returns a new [[DataFrame]] by adding a column. + * Returns a new [[DataFrame]] by adding a column or replacing the existing column that has + * the same name. * @group dfops * @since 1.3.0 */ --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
