This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 6729f497bc1 [SPARK-43476][SPARK-43477][SPARK-43478][PS] Support `StringMethods` for pandas 2.0.0 and above
6729f497bc1 is described below
commit 6729f497bc191f827c743b043b2aa889a707c295
Author: itholic <[email protected]>
AuthorDate: Fri Aug 11 17:46:00 2023 +0800
[SPARK-43476][SPARK-43477][SPARK-43478][PS] Support `StringMethods` for pandas 2.0.0 and above
### What changes were proposed in this pull request?
This PR proposes to support `StringMethods` for pandas 2.0.0 and above.
### Why are the changes needed?
Support the latest pandas for pandas API on Spark.
### Does this PR introduce _any_ user-facing change?
`StringMethods.split`, `StringMethods.rsplit`, and `StringMethods.replace` are available with the latest pandas.
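For example, a minimal sketch of the new `regex` default (illustrative data, assuming a running pandas-on-Spark session; not taken from this patch):

```python
import pyspark.pandas as ps

s = ps.Series(["a.b", "acb"])

# regex now defaults to False, so the pattern is a plain string literal:
s.str.replace("a.", "xx")               # only "a.b" becomes "xxb"

# Pass regex=True explicitly to keep the old regex semantics:
s.str.replace("a.", "xx", regex=True)   # both rows become "xxb"
```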
### How was this patch tested?
Enabling the existing UTs.
Closes #42312 from itholic/pandas_str_split.
Lead-authored-by: itholic <[email protected]>
Co-authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../docs/source/migration_guide/pyspark_upgrade.rst | 1 +
python/pyspark/pandas/strings.py | 21 ++++++++-------------
python/pyspark/pandas/tests/test_series_string.py | 18 +++---------------
3 files changed, 12 insertions(+), 28 deletions(-)
diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst
index 36d073d4a70..d7cd3110281 100644
--- a/python/docs/source/migration_guide/pyspark_upgrade.rst
+++ b/python/docs/source/migration_guide/pyspark_upgrade.rst
@@ -36,6 +36,7 @@ Upgrading from PySpark 3.5 to 4.0
* In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``Series.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead.
* In Spark 4.0, the various datetime attributes of ``DatetimeIndex`` (``day``, ``month``, ``year`` etc.) are now ``int32`` instead of ``int64`` from pandas API on Spark.
* In Spark 4.0, ``sort_columns`` parameter from ``DataFrame.plot`` and ``Series.plot`` has been removed from pandas API on Spark.
+* In Spark 4.0, the default value of ``regex`` parameter for ``Series.str.replace`` has been changed from ``True`` to ``False`` from pandas API on Spark. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal.
Upgrading from PySpark 3.3 to 3.4
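As a hedged illustration of the single-character ``pat`` note above (the data is made up for this sketch):

```python
import pyspark.pandas as ps

s = ps.Series(["a.b"])

# With pandas 2.x semantics, a single-character pat with regex=True is a
# real regular expression, so "." matches every character:
s.str.replace(".", "-", regex=True)   # -> "---"

# With the new default regex=False, "." is a literal dot:
s.str.replace(".", "-")               # -> "a-b"
```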
diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
index 7c489dea2e3..cd47de55108 100644
--- a/python/pyspark/pandas/strings.py
+++ b/python/pyspark/pandas/strings.py
@@ -18,7 +18,6 @@
"""
String functions on pandas-on-Spark Series
"""
-import warnings
from typing import (
Any,
Callable,
@@ -1516,7 +1515,7 @@ class StringMethods:
n: int = -1,
case: Optional[bool] = None,
flags: int = 0,
- regex: bool = True,
+ regex: bool = False,
) -> "ps.Series":
"""
Replace occurrences of pattern/regex in the Series with some other
@@ -1580,7 +1579,7 @@ class StringMethods:
Reverse every lowercase alphabetic word:
>>> repl = lambda m: m.group(0)[::-1]
- >>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
+ >>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace('[a-z]+', repl, regex=True)
0 oof 123
1 rab zab
2 None
@@ -1588,9 +1587,9 @@ class StringMethods:
Using regex groups (extract second group and swap case):
- >>> pat = r"(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
+ >>> pat = "(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
>>> repl = lambda m: m.group('two').swapcase()
- >>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
+ >>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl, regex=True)
0 tWO
1 bAR
dtype: object
@@ -1598,17 +1597,13 @@ class StringMethods:
Using a compiled regex with flags:
>>> import re
- >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
- >>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
+ >>> regex_pat = re.compile('FUZ', flags=re.IGNORECASE)
+ >>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
0 foo
1 bar
2 None
dtype: object
"""
- warnings.warn(
- "Default value of `regex` will be changed to `False` instead of `True` in 4.0.0.",
- FutureWarning,
- )
def pandas_replace(s) -> ps.Series[str]:  # type: ignore[no-untyped-def]
return s.str.replace(pat, repl, n=n, case=case, flags=flags, regex=regex)
@@ -2027,7 +2022,7 @@ class StringMethods:
@pandas_udf(returnType=return_type) # type: ignore[call-overload]
def pudf(s: pd.Series) -> pd.Series:
- return s.str.split(pat, n)
+ return s.str.split(pat, n=n)
psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
@@ -2174,7 +2169,7 @@ class StringMethods:
@pandas_udf(returnType=return_type) # type: ignore[call-overload]
def pudf(s: pd.Series) -> pd.Series:
- return s.str.rsplit(pat, n)
+ return s.str.rsplit(pat, n=n)
psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
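The `n=n` changes above follow pandas 2.0, where the arguments of `str.split`/`str.rsplit` after `pat` became keyword-only, so the old positional call fails inside the UDF. A minimal sketch of the failure mode (plain pandas, illustrative data):

```python
import pandas as pd

s = pd.Series(["a,b,c"])

# Works on all pandas versions: n passed by keyword.
s.str.split(",", n=1)    # -> [['a', 'b,c']]

# On pandas 2.0+ this raises a TypeError, because n is keyword-only:
# s.str.split(",", 1)
```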
diff --git a/python/pyspark/pandas/tests/test_series_string.py b/python/pyspark/pandas/tests/test_series_string.py
index 3c2bd58da1a..93c6473f7d3 100644
--- a/python/pyspark/pandas/tests/test_series_string.py
+++ b/python/pyspark/pandas/tests/test_series_string.py
@@ -246,10 +246,6 @@ class SeriesStringTestsMixin:
with self.assertRaises(TypeError):
self.check_func(lambda x: x.str.repeat(repeats=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43476): Enable SeriesStringTests.test_string_replace for pandas 2.0.0.",
- )
def test_string_replace(self):
self.check_func(lambda x: x.str.replace("a.", "xx", regex=True))
self.check_func(lambda x: x.str.replace("a.", "xx", regex=False))
@@ -259,10 +255,10 @@ class SeriesStringTestsMixin:
def repl(m):
return m.group(0)[::-1]
- self.check_func(lambda x: x.str.replace(r"[a-z]+", repl))
+ self.check_func(lambda x: x.str.replace("[a-z]+", repl, regex=True))
# compiled regex with flags
- regex_pat = re.compile(r"WHITESPACE", flags=re.IGNORECASE)
- self.check_func(lambda x: x.str.replace(regex_pat, "---"))
+ regex_pat = re.compile("WHITESPACE", flags=re.IGNORECASE)
+ self.check_func(lambda x: x.str.replace(regex_pat, "---", regex=True))
def test_string_rfind(self):
self.check_func(lambda x: x.str.rfind("a"))
@@ -297,10 +293,6 @@ class SeriesStringTestsMixin:
self.check_func(lambda x: x.str.slice_replace(stop=2, repl="X"))
self.check_func(lambda x: x.str.slice_replace(start=1, stop=3, repl="X"))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43478): Enable SeriesStringTests.test_string_split for pandas 2.0.0.",
- )
def test_string_split(self):
self.check_func_on_series(lambda x: repr(x.str.split()), self.pser[:-1])
self.check_func_on_series(lambda x: repr(x.str.split(r"p*")), self.pser[:-1])
@@ -311,10 +303,6 @@ class SeriesStringTestsMixin:
with self.assertRaises(NotImplementedError):
self.check_func(lambda x: x.str.split(expand=True))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43477): Enable SeriesStringTests.test_string_rsplit for pandas 2.0.0.",
- )
def test_string_rsplit(self):
self.check_func_on_series(lambda x: repr(x.str.rsplit()), self.pser[:-1])
self.check_func_on_series(lambda x: repr(x.str.rsplit(r"p*")), self.pser[:-1])
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]