This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new dfede11 [SPARK-29664][PYTHON][SQL][FOLLOW-UP] Add deprecation warnings for getItem instead dfede11 is described below commit dfede11d121519a4d2bfcf79639d5bbdbd80c0f0 Author: HyukjinKwon <gurwls...@apache.org> AuthorDate: Mon Apr 27 14:49:22 2020 +0900 [SPARK-29664][PYTHON][SQL][FOLLOW-UP] Add deprecation warnings for getItem instead ### What changes were proposed in this pull request? This PR proposes to use a different approach instead of breaking it per Michael's rubric added at https://spark.apache.org/versioning-policy.html. It deprecates the behaviour for now. It will be gradually removed in the future releases. After this change, ```python import warnings warnings.simplefilter("always") from pyspark.sql.functions import * df = spark.range(2) map_col = create_map(lit(0), lit(100), lit(1), lit(200)) df.withColumn("mapped", map_col.getItem(col('id'))).show() ``` ``` /.../python/pyspark/sql/column.py:311: DeprecationWarning: A column as 'key' in getItem is deprecated as of Spark 3.0, and will not be supported in the future release. Use `column[key]` or `column.key` syntax instead. DeprecationWarning) ... ``` ```python import warnings warnings.simplefilter("always") from pyspark.sql.functions import * df = spark.range(2) struct_col = struct(lit(0), lit(100), lit(1), lit(200)) df.withColumn("struct", struct_col.getField(lit("col1"))).show() ``` ``` /.../spark/python/pyspark/sql/column.py:336: DeprecationWarning: A column as 'name' in getField is deprecated as of Spark 3.0, and will not be supported in the future release. Use `column[name]` or `column.name` syntax instead. DeprecationWarning) ``` ### Why are the changes needed? To prevent the radical behaviour change after the amended versioning policy. ### Does this PR introduce any user-facing change? Yes, it will show the deprecated warning message. ### How was this patch tested? Manually tested. 
Closes #28327 from HyukjinKwon/SPARK-29664. Authored-by: HyukjinKwon <gurwls...@apache.org> Signed-off-by: HyukjinKwon <gurwls...@apache.org> (cherry picked from commit 5dd581c88ab111377175b673994153072fe9ec77) Signed-off-by: HyukjinKwon <gurwls...@apache.org> --- docs/pyspark-migration-guide.md | 2 -- python/pyspark/sql/column.py | 23 +++++++++++++++++------ python/pyspark/sql/tests/test_column.py | 14 +------------- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/docs/pyspark-migration-guide.md b/docs/pyspark-migration-guide.md index 92388ff..6f0fbbf 100644 --- a/docs/pyspark-migration-guide.md +++ b/docs/pyspark-migration-guide.md @@ -43,8 +43,6 @@ Please refer [Migration Guide: SQL, Datasets and DataFrame](sql-migration-guide. - In Spark 3.0, `createDataFrame(..., verifySchema=True)` validates `LongType` as well in PySpark. Previously, `LongType` was not verified and resulted in `None` in case the value overflows. To restore this behavior, `verifySchema` can be set to `False` to disable the validation. -- In Spark 3.0, `Column.getItem` is fixed such that it does not call `Column.apply`. Consequently, if `Column` is used as an argument to `getItem`, the indexing operator should be used. For example, `map_col.getItem(col('id'))` should be replaced with `map_col[col('id')]`. - - As of Spark 3.0, `Row` field names are no longer sorted alphabetically when constructing with named arguments for Python versions 3.6 and above, and the order of fields will match that as entered. To enable sorted fields by default, as in Spark 2.4, set the environment variable `PYSPARK_ROW_FIELD_SORTING_ENABLED` to `true` for both executors and driver - this environment variable must be consistent on all executors and driver; otherwise, it may cause failures or incorrect answers. For [...] 
## Upgrading from PySpark 2.3 to 2.4 diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 9b728b3..ef4944c 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -17,11 +17,14 @@ import sys import json +import warnings if sys.version >= '3': basestring = str long = int +from py4j.java_gateway import is_instance_of + from pyspark import copy_func, since from pyspark.context import SparkContext from pyspark.rdd import ignore_unicode_prefix @@ -296,12 +299,14 @@ class Column(object): +----+------+ | 1| value| +----+------+ - - .. versionchanged:: 3.0 - If `key` is a `Column` object, the indexing operator should be used instead. - For example, `map_col.getItem(col('id'))` should be replaced with `map_col[col('id')]`. """ - return _bin_op("getItem")(self, key) + if isinstance(key, Column): + warnings.warn( + "A column as 'key' in getItem is deprecated as of Spark 3.0, and will not " + "be supported in the future release. Use `column[key]` or `column.key` syntax " + "instead.", + DeprecationWarning) + return self[key] @since(1.3) def getField(self, name): @@ -323,12 +328,18 @@ class Column(object): | 1| +---+ """ + if isinstance(name, Column): + warnings.warn( + "A column as 'name' in getField is deprecated as of Spark 3.0, and will not " + "be supported in the future release. 
Use `column[name]` or `column.name` syntax " + "instead.", + DeprecationWarning) return self[name] def __getattr__(self, item): if item.startswith("__"): raise AttributeError(item) - return self.getField(item) + return self[item] def __getitem__(self, k): if isinstance(k, slice): diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py index d9d9331..58bf896 100644 --- a/python/pyspark/sql/tests/test_column.py +++ b/python/pyspark/sql/tests/test_column.py @@ -18,8 +18,6 @@ import sys -from py4j.protocol import Py4JJavaError - from pyspark.sql import Column, Row from pyspark.sql.types import * from pyspark.sql.utils import AnalysisException @@ -87,7 +85,7 @@ class ColumnTests(ReusedSQLTestCase): "Cannot apply 'in' operator against a column", lambda: 1 in cs) - def test_column_apply(self): + def test_column_accessor(self): from pyspark.sql.functions import col self.assertIsInstance(col("foo")[1:3], Column) @@ -95,16 +93,6 @@ class ColumnTests(ReusedSQLTestCase): self.assertIsInstance(col("foo")["bar"], Column) self.assertRaises(ValueError, lambda: col("foo")[0:10:2]) - def test_column_getitem(self): - from pyspark.sql.functions import col, create_map, lit - - map_col = create_map(lit(0), lit(100), lit(1), lit(200)) - self.assertRaisesRegexp( - Py4JJavaError, - "Unsupported literal type class org.apache.spark.sql.Column id", - lambda: map_col.getItem(col('id')) - ) - def test_column_select(self): df = self.df self.assertEqual(self.testData, df.select("*").collect()) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org