This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 6729f497bc1 [SPARK-43476][SPARK-43477][SPARK-43478][PS] Support `StringMethods` for pandas 2.0.0 and above
6729f497bc1 is described below
commit 6729f497bc191f827c743b043b2aa889a707c295
Author: itholic <[email protected]>
AuthorDate: Fri Aug 11 17:46:00 2023 +0800
[SPARK-43476][SPARK-43477][SPARK-43478][PS] Support `StringMethods` for pandas 2.0.0 and above
### What changes were proposed in this pull request?
This PR proposes to support `StringMethods` for pandas 2.0.0 and above.
### Why are the changes needed?
Support the latest pandas for pandas API on Spark.
### Does this PR introduce _any_ user-facing change?
`StringMethods.split`, `StringMethods.rsplit`, and `StringMethods.replace` are available with the latest pandas.
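For example, a minimal sketch of the new `regex` default (illustrative data, assuming a running pandas-on-Spark session; not taken from this patch):

```python
import pyspark.pandas as ps

s = ps.Series(["a.b", "acb"])

# regex now defaults to False, so the pattern is a plain string literal:
s.str.replace("a.", "xx")               # only "a.b" becomes "xxb"

# Pass regex=True explicitly to keep the old regex semantics:
s.str.replace("a.", "xx", regex=True)   # both rows become "xxb"
```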
### How was this patch tested?
Enabling the existing UTs.
Closes #42312 from itholic/pandas_str_split.
Lead-authored-by: itholic <[email protected]>
Co-authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../docs/source/migration_guide/pyspark_upgrade.rst | 1 +
python/pyspark/pandas/strings.py | 21 ++++++++-------------
python/pyspark/pandas/tests/test_series_string.py | 18 +++---------------
3 files changed, 12 insertions(+), 28 deletions(-)
diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst
index 36d073d4a70..d7cd3110281 100644
--- a/python/docs/source/migration_guide/pyspark_upgrade.rst
+++ b/python/docs/source/migration_guide/pyspark_upgrade.rst
@@ -36,6 +36,7 @@ Upgrading from PySpark 3.5 to 4.0
* In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``Series.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead.
* In Spark 4.0, the various datetime attributes of ``DatetimeIndex`` (``day``, ``month``, ``year`` etc.) are now ``int32`` instead of ``int64`` from pandas API on Spark.
* In Spark 4.0, ``sort_columns`` parameter from ``DataFrame.plot`` and ``Series.plot`` has been removed from pandas API on Spark.
+* In Spark 4.0, the default value of ``regex`` parameter for ``Series.str.replace`` has been changed from ``True`` to ``False`` from pandas API on Spark. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal.
Upgrading from PySpark 3.3 to 3.4
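As a hedged illustration of the single-character ``pat`` note above (the data is made up for this sketch):

```python
import pyspark.pandas as ps

s = ps.Series(["a.b"])

# With pandas 2.x semantics, a single-character pat with regex=True is a
# real regular expression, so "." matches every character:
s.str.replace(".", "-", regex=True)   # -> "---"

# With the new default regex=False, "." is a literal dot:
s.str.replace(".", "-")               # -> "a-b"
```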
diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
index 7c489dea2e3..cd47de55108 100644
--- a/python/pyspark/pandas/strings.py
+++ b/python/pyspark/pandas/strings.py
@@ -18,7 +18,6 @@
"""
String functions on pandas-on-Spark Series
"""
-import warnings
from typing import (
Any,
Callable,
@@ -1516,7 +1515,7 @@ class StringMethods:
n: int = -1,
case: Optional[bool] = None,
flags: int = 0,
- regex: bool = True,
+ regex: bool = False,
) -> "ps.Series":
"""
Replace occurrences of pattern/regex in the Series with some other
@@ -1580,7 +1579,7 @@ class StringMethods:
Reverse every lowercase alphabetic word:
>>> repl = lambda m: m.group(0)[::-1]
- >>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
+ >>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace('[a-z]+', repl, regex=True)
0 oof 123
1 rab zab
2 None
@@ -1588,9 +1587,9 @@ class StringMethods:
Using regex groups (extract second group and swap case):
- >>> pat = r"(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
+ >>> pat = "(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
>>> repl = lambda m: m.group('two').swapcase()
- >>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
+ >>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl, regex=True)
0 tWO
1 bAR
dtype: object
@@ -1598,17 +1597,13 @@ class StringMethods:
Using a compiled regex with flags:
>>> import re
- >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
- >>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
+ >>> regex_pat = re.compile('FUZ', flags=re.IGNORECASE)
+ >>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
0 foo
1 bar
2 None
dtype: object
"""
- warnings.warn(
- "Default value of `regex` will be changed to `False` instead of `True` in 4.0.0.",
- FutureWarning,
- )
def pandas_replace(s) -> ps.Series[str]:  # type: ignore[no-untyped-def]
return s.str.replace(pat, repl, n=n, case=case, flags=flags, regex=regex)
@@ -2027,7 +2022,7 @@ class StringMethods:
@pandas_udf(returnType=return_type) # type: ignore[call-overload]
def pudf(s: pd.Series) -> pd.Series:
- return s.str.split(pat, n)
+ return s.str.split(pat, n=n)
psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
@@ -2174,7 +2169,7 @@ class StringMethods:
@pandas_udf(returnType=return_type) # type: ignore[call-overload]
def pudf(s: pd.Series) -> pd.Series:
- return s.str.rsplit(pat, n)
+ return s.str.rsplit(pat, n=n)
psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
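The `n=n` changes above follow pandas 2.0, where the arguments of `str.split`/`str.rsplit` after `pat` became keyword-only, so the old positional call fails inside the UDF. A minimal sketch of the failure mode (plain pandas, illustrative data):

```python
import pandas as pd

s = pd.Series(["a,b,c"])

# Works on all pandas versions: n passed by keyword.
s.str.split(",", n=1)    # -> [['a', 'b,c']]

# On pandas 2.0+ this raises a TypeError, because n is keyword-only:
# s.str.split(",", 1)
```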
diff --git a/python/pyspark/pandas/tests/test_series_string.py b/python/pyspark/pandas/tests/test_series_string.py
index 3c2bd58da1a..93c6473f7d3 100644
--- a/python/pyspark/pandas/tests/test_series_string.py
+++ b/python/pyspark/pandas/tests/test_series_string.py
@@ -246,10 +246,6 @@ class SeriesStringTestsMixin:
with self.assertRaises(TypeError):
self.check_func(lambda x: x.str.repeat(repeats=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43476): Enable SeriesStringTests.test_string_replace for pandas 2.0.0.",
- )
def test_string_replace(self):
self.check_func(lambda x: x.str.replace("a.", "xx", regex=True))
self.check_func(lambda x: x.str.replace("a.", "xx", regex=False))
@@ -259,10 +255,10 @@ class SeriesStringTestsMixin:
def repl(m):
return m.group(0)[::-1]
- self.check_func(lambda x: x.str.replace(r"[a-z]+", repl))
+ self.check_func(lambda x: x.str.replace("[a-z]+", repl, regex=True))
# compiled regex with flags
- regex_pat = re.compile(r"WHITESPACE", flags=re.IGNORECASE)
- self.check_func(lambda x: x.str.replace(regex_pat, "---"))
+ regex_pat = re.compile("WHITESPACE", flags=re.IGNORECASE)
+ self.check_func(lambda x: x.str.replace(regex_pat, "---", regex=True))
def test_string_rfind(self):
self.check_func(lambda x: x.str.rfind("a"))
@@ -297,10 +293,6 @@ class SeriesStringTestsMixin:
self.check_func(lambda x: x.str.slice_replace(stop=2, repl="X"))
self.check_func(lambda x: x.str.slice_replace(start=1, stop=3, repl="X"))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43478): Enable SeriesStringTests.test_string_split for pandas 2.0.0.",
- )
def test_string_split(self):
self.check_func_on_series(lambda x: repr(x.str.split()), self.pser[:-1])
self.check_func_on_series(lambda x: repr(x.str.split(r"p*")), self.pser[:-1])
@@ -311,10 +303,6 @@ class SeriesStringTestsMixin:
with self.assertRaises(NotImplementedError):
self.check_func(lambda x: x.str.split(expand=True))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43477): Enable SeriesStringTests.test_string_rsplit for pandas 2.0.0.",
- )
def test_string_rsplit(self):
self.check_func_on_series(lambda x: repr(x.str.rsplit()), self.pser[:-1])
self.check_func_on_series(lambda x: repr(x.str.rsplit(r"p*")), self.pser[:-1])
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]