This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new dfede11 [SPARK-29664][PYTHON][SQL][FOLLOW-UP] Add deprecation
warnings for getItem instead
dfede11 is described below
commit dfede11d121519a4d2bfcf79639d5bbdbd80c0f0
Author: HyukjinKwon <[email protected]>
AuthorDate: Mon Apr 27 14:49:22 2020 +0900
[SPARK-29664][PYTHON][SQL][FOLLOW-UP] Add deprecation warnings for getItem
instead
### What changes were proposed in this pull request?
Instead of breaking the existing behaviour of `Column.getItem`, this PR
deprecates it, following Michael's rubric added at
https://spark.apache.org/versioning-policy.html. The deprecated behaviour
still works for now and will be gradually removed in future releases.
After this change,
```python
import warnings
warnings.simplefilter("always")
from pyspark.sql.functions import *
df = spark.range(2)
map_col = create_map(lit(0), lit(100), lit(1), lit(200))
df.withColumn("mapped", map_col.getItem(col('id'))).show()
```
```
/.../python/pyspark/sql/column.py:311: DeprecationWarning: A column as
'key' in getItem is
deprecated as of Spark 3.0, and will not be supported in the future
release. Use `column[key]`
or `column.key` syntax instead.
DeprecationWarning)
...
```
```python
import warnings
warnings.simplefilter("always")
from pyspark.sql.functions import *
df = spark.range(2)
struct_col = struct(lit(0), lit(100), lit(1), lit(200))
df.withColumn("struct", struct_col.getField(lit("col1"))).show()
```
```
/.../spark/python/pyspark/sql/column.py:336: DeprecationWarning: A column
as 'name'
in getField is deprecated as of Spark 3.0, and will not be supported in the
future release. Use
`column[name]` or `column.name` syntax instead.
DeprecationWarning)
```
### Why are the changes needed?
To avoid a radical behaviour change, in line with the amended versioning policy.
### Does this PR introduce any user-facing change?
Yes, it will show a deprecation warning message.
### How was this patch tested?
Manually tested.
Closes #28327 from HyukjinKwon/SPARK-29664.
Authored-by: HyukjinKwon <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
(cherry picked from commit 5dd581c88ab111377175b673994153072fe9ec77)
Signed-off-by: HyukjinKwon <[email protected]>
---
docs/pyspark-migration-guide.md | 2 --
python/pyspark/sql/column.py | 23 +++++++++++++++++------
python/pyspark/sql/tests/test_column.py | 14 +-------------
3 files changed, 18 insertions(+), 21 deletions(-)
diff --git a/docs/pyspark-migration-guide.md b/docs/pyspark-migration-guide.md
index 92388ff..6f0fbbf 100644
--- a/docs/pyspark-migration-guide.md
+++ b/docs/pyspark-migration-guide.md
@@ -43,8 +43,6 @@ Please refer [Migration Guide: SQL, Datasets and
DataFrame](sql-migration-guide.
- In Spark 3.0, `createDataFrame(..., verifySchema=True)` validates `LongType`
as well in PySpark. Previously, `LongType` was not verified and resulted in
`None` in case the value overflows. To restore this behavior, `verifySchema`
can be set to `False` to disable the validation.
-- In Spark 3.0, `Column.getItem` is fixed such that it does not call
`Column.apply`. Consequently, if `Column` is used as an argument to `getItem`,
the indexing operator should be used. For example, `map_col.getItem(col('id'))`
should be replaced with `map_col[col('id')]`.
-
- As of Spark 3.0, `Row` field names are no longer sorted alphabetically when
constructing with named arguments for Python versions 3.6 and above, and the
order of fields will match that as entered. To enable sorted fields by default,
as in Spark 2.4, set the environment variable
`PYSPARK_ROW_FIELD_SORTING_ENABLED` to `true` for both executors and driver -
this environment variable must be consistent on all executors and driver;
otherwise, it may cause failures or incorrect answers. For [...]
## Upgrading from PySpark 2.3 to 2.4
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 9b728b3..ef4944c 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -17,11 +17,14 @@
import sys
import json
+import warnings
if sys.version >= '3':
basestring = str
long = int
+from py4j.java_gateway import is_instance_of
+
from pyspark import copy_func, since
from pyspark.context import SparkContext
from pyspark.rdd import ignore_unicode_prefix
@@ -296,12 +299,14 @@ class Column(object):
+----+------+
| 1| value|
+----+------+
-
- .. versionchanged:: 3.0
- If `key` is a `Column` object, the indexing operator should be used
instead.
- For example, `map_col.getItem(col('id'))` should be replaced with
`map_col[col('id')]`.
"""
- return _bin_op("getItem")(self, key)
+ if isinstance(key, Column):
+ warnings.warn(
+ "A column as 'key' in getItem is deprecated as of Spark 3.0,
and will not "
+ "be supported in the future release. Use `column[key]` or
`column.key` syntax "
+ "instead.",
+ DeprecationWarning)
+ return self[key]
@since(1.3)
def getField(self, name):
@@ -323,12 +328,18 @@ class Column(object):
| 1|
+---+
"""
+ if isinstance(name, Column):
+ warnings.warn(
+ "A column as 'name' in getField is deprecated as of Spark 3.0,
and will not "
+ "be supported in the future release. Use `column[name]` or
`column.name` syntax "
+ "instead.",
+ DeprecationWarning)
return self[name]
def __getattr__(self, item):
if item.startswith("__"):
raise AttributeError(item)
- return self.getField(item)
+ return self[item]
def __getitem__(self, k):
if isinstance(k, slice):
diff --git a/python/pyspark/sql/tests/test_column.py
b/python/pyspark/sql/tests/test_column.py
index d9d9331..58bf896 100644
--- a/python/pyspark/sql/tests/test_column.py
+++ b/python/pyspark/sql/tests/test_column.py
@@ -18,8 +18,6 @@
import sys
-from py4j.protocol import Py4JJavaError
-
from pyspark.sql import Column, Row
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
@@ -87,7 +85,7 @@ class ColumnTests(ReusedSQLTestCase):
"Cannot apply 'in' operator against a column",
lambda: 1 in cs)
- def test_column_apply(self):
+ def test_column_accessor(self):
from pyspark.sql.functions import col
self.assertIsInstance(col("foo")[1:3], Column)
@@ -95,16 +93,6 @@ class ColumnTests(ReusedSQLTestCase):
self.assertIsInstance(col("foo")["bar"], Column)
self.assertRaises(ValueError, lambda: col("foo")[0:10:2])
- def test_column_getitem(self):
- from pyspark.sql.functions import col, create_map, lit
-
- map_col = create_map(lit(0), lit(100), lit(1), lit(200))
- self.assertRaisesRegexp(
- Py4JJavaError,
- "Unsupported literal type class org.apache.spark.sql.Column id",
- lambda: map_col.getItem(col('id'))
- )
-
def test_column_select(self):
df = self.df
self.assertEqual(self.testData, df.select("*").collect())
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]