This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new dfede11  [SPARK-29664][PYTHON][SQL][FOLLOW-UP] Add deprecation warnings for getItem instead
dfede11 is described below

commit dfede11d121519a4d2bfcf79639d5bbdbd80c0f0
Author: HyukjinKwon <gurwls...@apache.org>
AuthorDate: Mon Apr 27 14:49:22 2020 +0900

    [SPARK-29664][PYTHON][SQL][FOLLOW-UP] Add deprecation warnings for getItem instead
    
    ### What changes were proposed in this pull request?
    
    This PR proposes a different approach instead of a breaking change, per Michael's rubric added at https://spark.apache.org/versioning-policy.html. It deprecates the behaviour for now; the behaviour will be removed gradually in future releases.
    
    After this change,
    
    ```python
    import warnings
    warnings.simplefilter("always")
    from pyspark.sql.functions import *
    df = spark.range(2)
    map_col = create_map(lit(0), lit(100), lit(1), lit(200))
    df.withColumn("mapped", map_col.getItem(col('id'))).show()
    ```
    
    ```
    /.../python/pyspark/sql/column.py:311: DeprecationWarning: A column as 'key' in getItem is
    deprecated as of Spark 3.0, and will not be supported in the future release. Use `column[key]`
    or `column.key` syntax instead.
      DeprecationWarning)
    ...
    ```
    
    ```python
    import warnings
    warnings.simplefilter("always")
    from pyspark.sql.functions import *
    df = spark.range(2)
    struct_col = struct(lit(0), lit(100), lit(1), lit(200))
    df.withColumn("struct", struct_col.getField(lit("col1"))).show()
    ```
    
    ```
    /.../spark/python/pyspark/sql/column.py:336: DeprecationWarning: A column as 'name'
    in getField is deprecated as of Spark 3.0, and will not be supported in the future release. Use
    `column[name]` or `column.name` syntax instead.
      DeprecationWarning)
    ```
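
    For reference, a minimal sketch of the non-deprecated syntax the warnings point to (assuming a running `spark` session; illustrative only):

    ```python
    from pyspark.sql.functions import col, create_map, lit, struct

    df = spark.range(2)
    map_col = create_map(lit(0), lit(100), lit(1), lit(200))
    struct_col = struct(lit(0), lit(100), lit(1), lit(200))

    # Index a map column by another column with the indexing operator.
    df.withColumn("mapped", map_col[col('id')]).show()

    # Access a struct field with the indexing operator or attribute syntax.
    df.withColumn("struct", struct_col['col1']).show()
    df.withColumn("struct", struct_col.col1).show()
    ```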
    
    ### Why are the changes needed?
    
    To avoid an abrupt behaviour change, in line with the amended versioning policy.
    
    ### Does this PR introduce any user-facing change?
    
    Yes, it now shows a deprecation warning message.
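
    Users who prefer not to see the warning can suppress it with the standard `warnings` machinery (illustrative snippet, not part of this patch):

    ```python
    import warnings

    # Silence only this deprecation warning; the message argument is a regex
    # matched against the beginning of the warning text.
    warnings.filterwarnings(
        "ignore",
        message="A column as 'key' in getItem",
        category=DeprecationWarning)
    ```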
    
    ### How was this patch tested?
    
    Manually tested.
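
    A minimal sketch of how the warning could also be checked programmatically (illustrative only, assuming a running `spark` session; not part of this patch):

    ```python
    import warnings
    from pyspark.sql.functions import col, create_map, lit

    map_col = create_map(lit(0), lit(100), lit(1), lit(200))
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        spark.range(2).withColumn("mapped", map_col.getItem(col('id'))).collect()

    # The deprecated call still works but should emit a DeprecationWarning.
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    ```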
    
    Closes #28327 from HyukjinKwon/SPARK-29664.
    
    Authored-by: HyukjinKwon <gurwls...@apache.org>
    Signed-off-by: HyukjinKwon <gurwls...@apache.org>
    (cherry picked from commit 5dd581c88ab111377175b673994153072fe9ec77)
    Signed-off-by: HyukjinKwon <gurwls...@apache.org>
---
 docs/pyspark-migration-guide.md         |  2 --
 python/pyspark/sql/column.py            | 23 +++++++++++++++++------
 python/pyspark/sql/tests/test_column.py | 14 +-------------
 3 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/docs/pyspark-migration-guide.md b/docs/pyspark-migration-guide.md
index 92388ff..6f0fbbf 100644
--- a/docs/pyspark-migration-guide.md
+++ b/docs/pyspark-migration-guide.md
@@ -43,8 +43,6 @@ Please refer [Migration Guide: SQL, Datasets and DataFrame](sql-migration-guide.
    
 - In Spark 3.0, `createDataFrame(..., verifySchema=True)` validates `LongType` as well in PySpark. Previously, `LongType` was not verified and resulted in `None` in case the value overflows. To restore this behavior, `verifySchema` can be set to `False` to disable the validation.
 
-- In Spark 3.0, `Column.getItem` is fixed such that it does not call `Column.apply`. Consequently, if `Column` is used as an argument to `getItem`, the indexing operator should be used. For example, `map_col.getItem(col('id'))` should be replaced with `map_col[col('id')]`.
-
 - As of Spark 3.0, `Row` field names are no longer sorted alphabetically when constructing with named arguments for Python versions 3.6 and above, and the order of fields will match that as entered. To enable sorted fields by default, as in Spark 2.4, set the environment variable `PYSPARK_ROW_FIELD_SORTING_ENABLED` to `true` for both executors and driver - this environment variable must be consistent on all executors and driver; otherwise, it may cause failures or incorrect answers. For [...]
 
 ## Upgrading from PySpark 2.3 to 2.4
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 9b728b3..ef4944c 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -17,11 +17,14 @@
 
 import sys
 import json
+import warnings
 
 if sys.version >= '3':
     basestring = str
     long = int
 
+from py4j.java_gateway import is_instance_of
+
 from pyspark import copy_func, since
 from pyspark.context import SparkContext
 from pyspark.rdd import ignore_unicode_prefix
@@ -296,12 +299,14 @@ class Column(object):
         +----+------+
         |   1| value|
         +----+------+
-
-        .. versionchanged:: 3.0
-           If `key` is a `Column` object, the indexing operator should be used instead.
-           For example, `map_col.getItem(col('id'))` should be replaced with `map_col[col('id')]`.
         """
-        return _bin_op("getItem")(self, key)
+        if isinstance(key, Column):
+            warnings.warn(
+                "A column as 'key' in getItem is deprecated as of Spark 3.0, 
and will not "
+                "be supported in the future release. Use `column[key]` or 
`column.key` syntax "
+                "instead.",
+                DeprecationWarning)
+        return self[key]
 
     @since(1.3)
     def getField(self, name):
@@ -323,12 +328,18 @@ class Column(object):
         |  1|
         +---+
         """
+        if isinstance(name, Column):
+            warnings.warn(
+                "A column as 'name' in getField is deprecated as of Spark 3.0, 
and will not "
+                "be supported in the future release. Use `column[name]` or 
`column.name` syntax "
+                "instead.",
+                DeprecationWarning)
         return self[name]
 
     def __getattr__(self, item):
         if item.startswith("__"):
             raise AttributeError(item)
-        return self.getField(item)
+        return self[item]
 
     def __getitem__(self, k):
         if isinstance(k, slice):
diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py
index d9d9331..58bf896 100644
--- a/python/pyspark/sql/tests/test_column.py
+++ b/python/pyspark/sql/tests/test_column.py
@@ -18,8 +18,6 @@
 
 import sys
 
-from py4j.protocol import Py4JJavaError
-
 from pyspark.sql import Column, Row
 from pyspark.sql.types import *
 from pyspark.sql.utils import AnalysisException
@@ -87,7 +85,7 @@ class ColumnTests(ReusedSQLTestCase):
                                 "Cannot apply 'in' operator against a column",
                                 lambda: 1 in cs)
 
-    def test_column_apply(self):
+    def test_column_accessor(self):
         from pyspark.sql.functions import col
 
         self.assertIsInstance(col("foo")[1:3], Column)
@@ -95,16 +93,6 @@ class ColumnTests(ReusedSQLTestCase):
         self.assertIsInstance(col("foo")["bar"], Column)
         self.assertRaises(ValueError, lambda: col("foo")[0:10:2])
 
-    def test_column_getitem(self):
-        from pyspark.sql.functions import col, create_map, lit
-
-        map_col = create_map(lit(0), lit(100), lit(1), lit(200))
-        self.assertRaisesRegexp(
-            Py4JJavaError,
-            "Unsupported literal type class org.apache.spark.sql.Column id",
-            lambda: map_col.getItem(col('id'))
-        )
-
     def test_column_select(self):
         df = self.df
         self.assertEqual(self.testData, df.select("*").collect())


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
