This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 1c3f618ee38 [SPARK-43872][PS] Support `(DataFrame|Series).plot` with
pandas 2.0.0 and above
1c3f618ee38 is described below
commit 1c3f618ee388e0830c74117b872144303f40cebf
Author: itholic <[email protected]>
AuthorDate: Fri Aug 11 11:10:14 2023 +0900
[SPARK-43872][PS] Support `(DataFrame|Series).plot` with pandas 2.0.0 and
above
### What changes were proposed in this pull request?
This PR proposes to remove parameter `sort_columns` from
`(DataFrame|Series).plot` to support pandas 2.0.0.
It also enables the following plot tests:
- test_area_plot
- test_area_plot_stacked_false
- test_area_plot_y
- test_bar_plot
- test_bar_with_x_y
- test_barh_plot_with_x_y
- test_barh_plot
- test_line_plot
- test_pie_plot
- test_scatter_plot
- test_hist_plot
- test_kde_plot
### Why are the changes needed?
To support pandas 2.0.0 and match its behavior.
### Does this PR introduce _any_ user-facing change?
`sort_columns` will no longer be available.
### How was this patch tested?
Closes #42390 from itholic/remove_sort_columns.
Lead-authored-by: itholic <[email protected]>
Co-authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../source/migration_guide/pyspark_upgrade.rst | 1 +
python/pyspark/pandas/plot/matplotlib.py | 13 -----
.../tests/plot/test_frame_plot_matplotlib.py | 56 ----------------------
3 files changed, 1 insertion(+), 69 deletions(-)
diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst
b/python/docs/source/migration_guide/pyspark_upgrade.rst
index 98630133e0c..36d073d4a70 100644
--- a/python/docs/source/migration_guide/pyspark_upgrade.rst
+++ b/python/docs/source/migration_guide/pyspark_upgrade.rst
@@ -35,6 +35,7 @@ Upgrading from PySpark 3.5 to 4.0
* In Spark 4.0, ``include_start`` and ``include_end`` parameters from
``DataFrame.between_time`` have been removed from pandas API on Spark, use
``inclusive`` instead.
* In Spark 4.0, ``include_start`` and ``include_end`` parameters from
``Series.between_time`` have been removed from pandas API on Spark, use
``inclusive`` instead.
* In Spark 4.0, the various datetime attributes of ``DatetimeIndex`` (``day``,
``month``, ``year`` etc.) are now ``int32`` instead of ``int64`` from pandas
API on Spark.
+* In Spark 4.0, ``sort_columns`` parameter from ``DataFrame.plot`` and
``Series.plot`` has been removed from pandas API on Spark.
Upgrading from PySpark 3.3 to 3.4
diff --git a/python/pyspark/pandas/plot/matplotlib.py
b/python/pyspark/pandas/plot/matplotlib.py
index 39e862bbae8..36cfc759f83 100644
--- a/python/pyspark/pandas/plot/matplotlib.py
+++ b/python/pyspark/pandas/plot/matplotlib.py
@@ -15,7 +15,6 @@
# limitations under the License.
#
-import warnings
from distutils.version import LooseVersion
import matplotlib as mat
@@ -750,7 +749,6 @@ def plot_frame(
yerr=None,
xerr=None,
secondary_y=False,
- sort_columns=False,
**kwds,
):
"""
@@ -836,11 +834,6 @@ def plot_frame(
mark_right : boolean, default True
When using a secondary_y axis, automatically mark the column
labels with "(right)" in the legend
- sort_columns: bool, default is False
- When True, will sort values on plots.
-
- .. deprecated:: 3.4.0
-
**kwds : keywords
Options to pass to matplotlib plotting method
@@ -856,11 +849,6 @@ def plot_frame(
for bar plot layout by `position` keyword.
From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center)
"""
- warnings.warn(
- "Argument `sort_columns` will be removed in 4.0.0.",
- FutureWarning,
- )
-
return _plot(
data,
kind=kind,
@@ -891,7 +879,6 @@ def plot_frame(
sharey=sharey,
secondary_y=secondary_y,
layout=layout,
- sort_columns=sort_columns,
**kwds,
)
diff --git a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
index a47968597b4..365d34b1f55 100644
--- a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
+++ b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py
@@ -18,7 +18,6 @@
import base64
from io import BytesIO
import unittest
-from distutils.version import LooseVersion
import pandas as pd
import numpy as np
@@ -79,11 +78,6 @@ class DataFramePlotMatplotlibTestsMixin:
plt.close(ax.figure)
return b64_data
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43641): Enable DataFramePlotMatplotlibTests.test_line_plot
"
- "for pandas 2.0.0.",
- )
def test_line_plot(self):
def check_line_plot(pdf, psdf):
ax1 = pdf.plot(kind="line", colormap="Paired")
@@ -108,10 +102,6 @@ class DataFramePlotMatplotlibTestsMixin:
psdf1.columns = columns
check_line_plot(pdf1, psdf1)
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43634): Enable DataFramePlotMatplotlibTests.test_area_plot
for pandas 2.0.0.",
- )
def test_area_plot(self):
def check_area_plot(pdf, psdf):
ax1 = pdf.plot(kind="area", colormap="Paired")
@@ -136,11 +126,6 @@ class DataFramePlotMatplotlibTestsMixin:
psdf.columns = columns
check_area_plot(pdf, psdf)
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43635): Enable
DataFramePlotMatplotlibTests.test_area_plot_stacked_false "
- "for pandas 2.0.0.",
- )
def test_area_plot_stacked_false(self):
def check_area_plot_stacked_false(pdf, psdf):
ax1 = pdf.plot.area(stacked=False)
@@ -168,11 +153,6 @@ class DataFramePlotMatplotlibTestsMixin:
psdf.columns = columns
check_area_plot_stacked_false(pdf, psdf)
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43636): Enable
DataFramePlotMatplotlibTests.test_area_plot_y "
- "for pandas 2.0.0.",
- )
def test_area_plot_y(self):
def check_area_plot_y(pdf, psdf, y):
ax1 = pdf.plot.area(y=y)
@@ -199,11 +179,6 @@ class DataFramePlotMatplotlibTestsMixin:
psdf.columns = columns
check_area_plot_y(pdf, psdf, y=("x", "sales"))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43639): Enable
DataFramePlotMatplotlibTests.test_barh_plot_with_x_y "
- "for pandas 2.0.0.",
- )
def test_barh_plot_with_x_y(self):
def check_barh_plot_with_x_y(pdf, psdf, x, y):
ax1 = pdf.plot(kind="barh", x=x, y=y, colormap="Paired")
@@ -229,11 +204,6 @@ class DataFramePlotMatplotlibTestsMixin:
psdf1.columns = columns
check_barh_plot_with_x_y(pdf1, psdf1, x=("x", "lab"), y=("y", "val"))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43640): Enable DataFramePlotMatplotlibTests.test_barh_plot
"
- "for pandas 2.0.0.",
- )
def test_barh_plot(self):
def check_barh_plot(pdf, psdf):
ax1 = pdf.plot(kind="barh", colormap="Paired")
@@ -259,10 +229,6 @@ class DataFramePlotMatplotlibTestsMixin:
psdf1.columns = columns
check_barh_plot(pdf1, psdf1)
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43637): Enable DataFramePlotMatplotlibTests.test_bar_plot
" "for pandas 2.0.0.",
- )
def test_bar_plot(self):
def check_bar_plot(pdf, psdf):
ax1 = pdf.plot(kind="bar", colormap="Paired")
@@ -287,11 +253,6 @@ class DataFramePlotMatplotlibTestsMixin:
psdf1.columns = columns
check_bar_plot(pdf1, psdf1)
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43638): Enable
DataFramePlotMatplotlibTests.test_bar_with_x_y "
- "for pandas 2.0.0.",
- )
def test_bar_with_x_y(self):
# this is testing plot with specified x and y
pdf = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]})
@@ -326,10 +287,6 @@ class DataFramePlotMatplotlibTestsMixin:
bin8 = self.plot_to_base64(ax8)
self.assertEqual(bin7, bin8)
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43642): Enable DataFramePlotMatplotlibTests.test_pie_plot
" "for pandas 2.0.0.",
- )
def test_pie_plot(self):
def check_pie_plot(pdf, psdf, y):
ax1 = pdf.plot.pie(y=y, figsize=(5, 5), colormap="Paired")
@@ -391,11 +348,6 @@ class DataFramePlotMatplotlibTestsMixin:
error_message = "pie requires either y column or 'subplots=True'"
self.assertTrue(error_message in str(context.exception))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43643): Enable
DataFramePlotMatplotlibTests.test_scatter_plot "
- "for pandas 2.0.0.",
- )
def test_scatter_plot(self):
def check_scatter_plot(pdf, psdf, x, y, c):
ax1 = pdf.plot.scatter(x=x, y=y)
@@ -428,10 +380,6 @@ class DataFramePlotMatplotlibTestsMixin:
psdf1.columns = columns
check_scatter_plot(pdf1, psdf1, x=("x", "a"), y=("x", "b"), c=("y",
"c"))
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43720): Enable DataFramePlotMatplotlibTests.test_hist_plot
for pandas 2.0.0.",
- )
def test_hist_plot(self):
def check_hist_plot(pdf, psdf):
_, ax1 = plt.subplots(1, 1)
@@ -483,10 +431,6 @@ class DataFramePlotMatplotlibTestsMixin:
psdf1.columns = columns
check_hist_plot(pdf1, psdf1)
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-43722): Enable DataFramePlotMatplotlibTests.test_kde_plot
for pandas 2.0.0.",
- )
def test_kde_plot(self):
def moving_average(a, n=10):
ret = np.cumsum(a, dtype=float)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]