This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c41a5ff60a71 [SPARK-46439][PS][TESTS] Move IO-related tests to `pyspark.pandas.tests.io.*`
c41a5ff60a71 is described below
commit c41a5ff60a71256256387bed5fc4f9537a6d93cb
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Dec 18 12:06:17 2023 -0800
[SPARK-46439][PS][TESTS] Move IO-related tests to `pyspark.pandas.tests.io.*`
### What changes were proposed in this pull request?
Move IO-related tests to `pyspark.pandas.tests.io.*`
### Why are the changes needed?
Test code cleanup: group the IO-related pandas-on-Spark tests under the `pyspark.pandas.tests.io` package.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44395 from zhengruifeng/ps_test_mv_df_conv.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
dev/sparktestsupport/modules.py | 19 ++--
.../tests/connect/{ => io}/test_parity_csv.py | 11 +-
.../{ => io}/test_parity_dataframe_conversion.py | 14 +--
.../{ => io}/test_parity_dataframe_spark_io.py | 9 +-
.../pandas/tests/connect/io/test_parity_io.py | 11 +-
.../{ => io}/test_parity_series_conversion.py | 8 +-
python/pyspark/pandas/tests/{ => io}/test_csv.py | 116 ++++++++++-----------
.../tests/{ => io}/test_dataframe_conversion.py | 69 ++++++------
.../tests/{ => io}/test_dataframe_spark_io.py | 8 +-
python/pyspark/pandas/tests/io/test_io.py | 8 +-
.../tests/{ => io}/test_series_conversion.py | 8 +-
11 files changed, 152 insertions(+), 129 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 2db4974181ee..9877835fce00 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -723,9 +723,6 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.plot.test_series_plot_plotly",
"pyspark.pandas.tests.test_categorical",
"pyspark.pandas.tests.test_config",
- "pyspark.pandas.tests.test_csv",
- "pyspark.pandas.tests.test_dataframe_conversion",
- "pyspark.pandas.tests.test_dataframe_spark_io",
"pyspark.pandas.tests.test_default_index",
"pyspark.pandas.tests.window.test_expanding",
"pyspark.pandas.tests.window.test_expanding_adv",
@@ -769,7 +766,6 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.window.test_groupby_rolling_adv",
"pyspark.pandas.tests.window.test_groupby_rolling_count",
"pyspark.pandas.tests.test_scalars",
- "pyspark.pandas.tests.test_series_conversion",
"pyspark.pandas.tests.test_series_datetime",
"pyspark.pandas.tests.series.test_string_ops_adv",
"pyspark.pandas.tests.series.test_string_ops_basic",
@@ -836,6 +832,10 @@ pyspark_pandas_slow = Module(
"pyspark.pandas.tests.frame.test_time_series",
"pyspark.pandas.tests.frame.test_truncate",
"pyspark.pandas.tests.io.test_io",
+ "pyspark.pandas.tests.io.test_csv",
+ "pyspark.pandas.tests.io.test_dataframe_conversion",
+ "pyspark.pandas.tests.io.test_dataframe_spark_io",
+ "pyspark.pandas.tests.io.test_series_conversion",
"pyspark.pandas.tests.groupby.test_aggregate",
"pyspark.pandas.tests.groupby.test_apply_func",
"pyspark.pandas.tests.groupby.test_cumulative",
@@ -1017,9 +1017,6 @@ pyspark_pandas_connect_part0 = Module(
"pyspark.pandas.tests.connect.plot.test_parity_series_plot_plotly",
"pyspark.pandas.tests.connect.test_parity_categorical",
"pyspark.pandas.tests.connect.test_parity_config",
- "pyspark.pandas.tests.connect.test_parity_csv",
- "pyspark.pandas.tests.connect.test_parity_dataframe_conversion",
- "pyspark.pandas.tests.connect.test_parity_dataframe_spark_io",
"pyspark.pandas.tests.connect.test_parity_default_index",
"pyspark.pandas.tests.connect.test_parity_extension",
"pyspark.pandas.tests.connect.test_parity_frame_spark",
@@ -1034,7 +1031,6 @@ pyspark_pandas_connect_part0 = Module(
"pyspark.pandas.tests.connect.resample.test_parity_on",
"pyspark.pandas.tests.connect.resample.test_parity_timezone",
"pyspark.pandas.tests.connect.test_parity_scalars",
- "pyspark.pandas.tests.connect.test_parity_series_conversion",
"pyspark.pandas.tests.connect.test_parity_series_datetime",
"pyspark.pandas.tests.connect.series.test_parity_string_ops_adv",
"pyspark.pandas.tests.connect.series.test_parity_string_ops_basic",
@@ -1073,7 +1069,6 @@ pyspark_pandas_connect_part0 = Module(
],
)
-
pyspark_pandas_connect_part1 = Module(
name="pyspark-pandas-connect-part1",
dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
@@ -1090,7 +1085,6 @@ pyspark_pandas_connect_part1 = Module(
"pyspark.pandas.tests.connect.frame.test_parity_take",
"pyspark.pandas.tests.connect.frame.test_parity_time_series",
"pyspark.pandas.tests.connect.frame.test_parity_truncate",
- "pyspark.pandas.tests.connect.io.test_parity_io",
"pyspark.pandas.tests.connect.groupby.test_parity_aggregate",
"pyspark.pandas.tests.connect.groupby.test_parity_apply_func",
"pyspark.pandas.tests.connect.groupby.test_parity_cumulative",
@@ -1194,6 +1188,11 @@ pyspark_pandas_connect_part3 = Module(
],
python_test_goals=[
# pandas-on-Spark unittests
+ "pyspark.pandas.tests.connect.io.test_parity_io",
+ "pyspark.pandas.tests.connect.io.test_parity_csv",
+ "pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion",
+ "pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io",
+ "pyspark.pandas.tests.connect.io.test_parity_series_conversion",
"pyspark.pandas.tests.connect.indexes.test_parity_datetime",
"pyspark.pandas.tests.connect.indexes.test_parity_datetime_at",
"pyspark.pandas.tests.connect.indexes.test_parity_datetime_between",
diff --git a/python/pyspark/pandas/tests/connect/test_parity_csv.py b/python/pyspark/pandas/tests/connect/io/test_parity_csv.py
similarity index 82%
rename from python/pyspark/pandas/tests/connect/test_parity_csv.py
rename to python/pyspark/pandas/tests/connect/io/test_parity_csv.py
index 2b0c0af43e02..69c9ded9e00b 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_csv.py
+++ b/python/pyspark/pandas/tests/connect/io/test_parity_csv.py
@@ -16,17 +16,22 @@
#
import unittest
-from pyspark.pandas.tests.test_csv import CsvTestsMixin
+from pyspark.pandas.tests.io.test_csv import CsvTestsMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
-class CsvParityTests(CsvTestsMixin, PandasOnSparkTestUtils, TestUtils,
ReusedConnectTestCase):
+class CsvParityTests(
+ CsvTestsMixin,
+ PandasOnSparkTestUtils,
+ TestUtils,
+ ReusedConnectTestCase,
+):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_csv import * # noqa: F401
+ from pyspark.pandas.tests.connect.io.test_parity_csv import * # noqa: F401
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py b/python/pyspark/pandas/tests/connect/io/test_parity_dataframe_conversion.py
similarity index 76%
rename from python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py
rename to python/pyspark/pandas/tests/connect/io/test_parity_dataframe_conversion.py
index c5a26a002f91..6be31da35f29 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_dataframe_conversion.py
+++ b/python/pyspark/pandas/tests/connect/io/test_parity_dataframe_conversion.py
@@ -16,22 +16,22 @@
#
import unittest
-from pyspark import pandas as ps
-from pyspark.pandas.tests.test_dataframe_conversion import
DataFrameConversionTestsMixin
+from pyspark.pandas.tests.io.test_dataframe_conversion import
DataFrameConversionMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
class DataFrameConversionParityTests(
- DataFrameConversionTestsMixin, PandasOnSparkTestUtils,
ReusedConnectTestCase, TestUtils
+ DataFrameConversionMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+ TestUtils,
):
- @property
- def psdf(self):
- return ps.from_pandas(self.pdf)
+ pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_dataframe_conversion import
* # noqa: F401
+ from pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion
import * # noqa
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py b/python/pyspark/pandas/tests/connect/io/test_parity_dataframe_spark_io.py
similarity index 81%
rename from python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py
rename to python/pyspark/pandas/tests/connect/io/test_parity_dataframe_spark_io.py
index 3b700dd32af5..f9378f546c61 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_dataframe_spark_io.py
+++ b/python/pyspark/pandas/tests/connect/io/test_parity_dataframe_spark_io.py
@@ -16,19 +16,22 @@
#
import unittest
-from pyspark.pandas.tests.test_dataframe_spark_io import
DataFrameSparkIOTestsMixin
+from pyspark.pandas.tests.io.test_dataframe_spark_io import
DataFrameSparkIOTestsMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
class DataFrameSparkIOParityTests(
- DataFrameSparkIOTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase,
TestUtils
+ DataFrameSparkIOTestsMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+ TestUtils,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_dataframe_spark_io import *
# noqa: F401
+ from pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io import
* # noqa: F401
try:
import xmlrunner
diff --git a/python/pyspark/pandas/tests/connect/io/test_parity_io.py b/python/pyspark/pandas/tests/connect/io/test_parity_io.py
index 9aab9923c227..dfc955de6a2a 100644
--- a/python/pyspark/pandas/tests/connect/io/test_parity_io.py
+++ b/python/pyspark/pandas/tests/connect/io/test_parity_io.py
@@ -16,16 +16,17 @@
#
import unittest
-from pyspark import pandas as ps
from pyspark.pandas.tests.io.test_io import FrameIOMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class FrameParityIOTests(FrameIOMixin, PandasOnSparkTestUtils,
ReusedConnectTestCase):
- @property
- def psdf(self):
- return ps.from_pandas(self.pdf)
+class FrameParityIOTests(
+ FrameIOMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
if __name__ == "__main__":
diff --git a/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py b/python/pyspark/pandas/tests/connect/io/test_parity_series_conversion.py
similarity index 82%
rename from python/pyspark/pandas/tests/connect/test_parity_series_conversion.py
rename to python/pyspark/pandas/tests/connect/io/test_parity_series_conversion.py
index 6545b9627c33..78c8f5c9d6dc 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_series_conversion.py
+++ b/python/pyspark/pandas/tests/connect/io/test_parity_series_conversion.py
@@ -16,19 +16,21 @@
#
import unittest
-from pyspark.pandas.tests.test_series_conversion import
SeriesConversionTestsMixin
+from pyspark.pandas.tests.io.test_series_conversion import
SeriesConversionTestsMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
class SeriesConversionParityTests(
- SeriesConversionTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
+ SeriesConversionTestsMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_series_conversion import *
# noqa: F401
+ from pyspark.pandas.tests.connect.io.test_parity_series_conversion import
* # noqa: F401
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/test_csv.py b/python/pyspark/pandas/tests/io/test_csv.py
similarity index 81%
rename from python/pyspark/pandas/tests/test_csv.py
rename to python/pyspark/pandas/tests/io/test_csv.py
index e35b49315712..a0e24571bf8c 100644
--- a/python/pyspark/pandas/tests/test_csv.py
+++ b/python/pyspark/pandas/tests/io/test_csv.py
@@ -16,9 +16,6 @@
#
import os
-import shutil
-import tempfile
-import unittest
from contextlib import contextmanager
import pandas as pd
@@ -33,12 +30,6 @@ def normalize_text(s):
class CsvTestsMixin:
- def setUp(self):
- self.tmp_dir = tempfile.mkdtemp(prefix=CsvTests.__name__)
-
- def tearDown(self):
- shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
@property
def csv_text(self):
return normalize_text(
@@ -331,94 +322,103 @@ class CsvTestsMixin:
pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
psdf = ps.DataFrame(pdf)
- tmp_dir = "{}/tmp1".format(self.tmp_dir)
+ with self.temp_dir() as dirpath:
+ tmp_dir = "{}/tmp1".format(dirpath)
- psdf.to_csv(tmp_dir, num_files=1)
- self._check_output(tmp_dir, pdf.to_csv(index=False))
+ psdf.to_csv(tmp_dir, num_files=1)
+ self._check_output(tmp_dir, pdf.to_csv(index=False))
- tmp_dir = "{}/tmp2".format(self.tmp_dir)
+ tmp_dir = "{}/tmp2".format(dirpath)
- self.assertRaises(KeyError, lambda: psdf.to_csv(tmp_dir,
columns=["c"], num_files=1))
+ self.assertRaises(KeyError, lambda: psdf.to_csv(tmp_dir,
columns=["c"], num_files=1))
- # non-string names
- pdf = pd.DataFrame({10: [1, 2, 3], 20: ["a", "b", "c"]})
- psdf = ps.DataFrame(pdf)
+ # non-string names
+ pdf = pd.DataFrame({10: [1, 2, 3], 20: ["a", "b", "c"]})
+ psdf = ps.DataFrame(pdf)
- tmp_dir = "{}/tmp3".format(self.tmp_dir)
+ tmp_dir = "{}/tmp3".format(dirpath)
- psdf.to_csv(tmp_dir, num_files=1)
- self._check_output(tmp_dir, pdf.to_csv(index=False))
+ psdf.to_csv(tmp_dir, num_files=1)
+ self._check_output(tmp_dir, pdf.to_csv(index=False))
- tmp_dir = "{}/tmp4".format(self.tmp_dir)
+ tmp_dir = "{}/tmp4".format(dirpath)
- psdf.to_csv(tmp_dir, columns=[10], num_files=1)
- self._check_output(tmp_dir, pdf.to_csv(columns=[10], index=False))
+ psdf.to_csv(tmp_dir, columns=[10], num_files=1)
+ self._check_output(tmp_dir, pdf.to_csv(columns=[10], index=False))
- tmp_dir = "{}/tmp5".format(self.tmp_dir)
+ tmp_dir = "{}/tmp5".format(dirpath)
- self.assertRaises(TypeError, lambda: psdf.to_csv(tmp_dir, columns=10,
num_files=1))
+ self.assertRaises(TypeError, lambda: psdf.to_csv(tmp_dir,
columns=10, num_files=1))
def test_to_csv_with_path_and_basic_options(self):
pdf = pd.DataFrame({"aa": [1, 2, 3], "bb": ["a", "b", "c"]})
psdf = ps.DataFrame(pdf)
- psdf.to_csv(self.tmp_dir, num_files=1, sep="|", header=False,
columns=["aa"])
- expected = pdf.to_csv(index=False, sep="|", header=False,
columns=["aa"])
+ with self.temp_dir() as dirpath:
+ psdf.to_csv(dirpath, num_files=1, sep="|", header=False,
columns=["aa"])
+ expected = pdf.to_csv(index=False, sep="|", header=False,
columns=["aa"])
- self._check_output(self.tmp_dir, expected)
+ self._check_output(dirpath, expected)
def test_to_csv_with_path_and_basic_options_multiindex_columns(self):
pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): ["a", "b",
"c"]})
psdf = ps.DataFrame(pdf)
- with self.assertRaises(ValueError):
- psdf.to_csv(self.tmp_dir, num_files=1, sep="|", columns=[("x",
"a")])
+ with self.temp_dir() as dirpath:
+ with self.assertRaises(ValueError):
+ psdf.to_csv(dirpath, num_files=1, sep="|", columns=[("x",
"a")])
- psdf.to_csv(self.tmp_dir, num_files=1, sep="|", header=["a"],
columns=[("x", "a")])
- pdf.columns = ["a", "b"]
- expected = pdf.to_csv(index=False, sep="|", columns=["a"])
+ psdf.to_csv(dirpath, num_files=1, sep="|", header=["a"],
columns=[("x", "a")])
+ pdf.columns = ["a", "b"]
+ expected = pdf.to_csv(index=False, sep="|", columns=["a"])
- self._check_output(self.tmp_dir, expected)
+ self._check_output(dirpath, expected)
def test_to_csv_with_path_and_pyspark_options(self):
pdf = pd.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", "c", None]})
psdf = ps.DataFrame(pdf)
- psdf.to_csv(self.tmp_dir, nullValue="null", num_files=1)
- expected = pdf.to_csv(index=False, na_rep="null")
+ with self.temp_dir() as dirpath:
+ psdf.to_csv(dirpath, nullValue="null", num_files=1)
+ expected = pdf.to_csv(index=False, na_rep="null")
- self._check_output(self.tmp_dir, expected)
+ self._check_output(dirpath, expected)
def test_to_csv_with_partition_cols(self):
pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
psdf = ps.DataFrame(pdf)
- psdf.to_csv(self.tmp_dir, partition_cols="b", num_files=1)
-
- partition_paths = [path for path in os.listdir(self.tmp_dir) if
path.startswith("b=")]
- assert len(partition_paths) > 0
- for partition_path in partition_paths:
- column, value = partition_path.split("=")
- expected = pdf[pdf[column] == value].drop("b",
axis=1).to_csv(index=False)
-
- output_paths = [
- path
- for path in os.listdir("%s/%s" % (self.tmp_dir,
partition_path))
- if path.startswith("part-")
- ]
- assert len(output_paths) > 0
- output_path = "%s/%s/%s" % (self.tmp_dir, partition_path,
output_paths[0])
- with open(output_path) as f:
- self.assertEqual(f.read(), expected)
-
-
-class CsvTests(CsvTestsMixin, PandasOnSparkTestCase, TestUtils):
+ with self.temp_dir() as dirpath:
+ psdf.to_csv(dirpath, partition_cols="b", num_files=1)
+
+ partition_paths = [path for path in os.listdir(dirpath) if
path.startswith("b=")]
+ assert len(partition_paths) > 0
+ for partition_path in partition_paths:
+ column, value = partition_path.split("=")
+ expected = pdf[pdf[column] == value].drop("b",
axis=1).to_csv(index=False)
+
+ output_paths = [
+ path
+ for path in os.listdir("%s/%s" % (dirpath, partition_path))
+ if path.startswith("part-")
+ ]
+ assert len(output_paths) > 0
+ output_path = "%s/%s/%s" % (dirpath, partition_path,
output_paths[0])
+ with open(output_path) as f:
+ self.assertEqual(f.read(), expected)
+
+
+class CsvTests(
+ CsvTestsMixin,
+ PandasOnSparkTestCase,
+ TestUtils,
+):
pass
if __name__ == "__main__":
import unittest
- from pyspark.pandas.tests.test_csv import * # noqa: F401
+ from pyspark.pandas.tests.io.test_csv import * # noqa: F401
try:
import xmlrunner
diff --git a/python/pyspark/pandas/tests/test_dataframe_conversion.py b/python/pyspark/pandas/tests/io/test_dataframe_conversion.py
similarity index 86%
rename from python/pyspark/pandas/tests/test_dataframe_conversion.py
rename to python/pyspark/pandas/tests/io/test_dataframe_conversion.py
index 134cf8bd1c10..d4b03a855d38 100644
--- a/python/pyspark/pandas/tests/test_dataframe_conversion.py
+++ b/python/pyspark/pandas/tests/io/test_dataframe_conversion.py
@@ -16,9 +16,7 @@
#
import os
-import shutil
import string
-import tempfile
import unittest
import sys
@@ -26,23 +24,21 @@ import numpy as np
import pandas as pd
from pyspark import pandas as ps
-from pyspark.testing.pandasutils import ComparisonTestBase, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
from pyspark.testing.sqlutils import SQLTestUtils
-class DataFrameConversionTestsMixin:
+class DataFrameConversionMixin:
"""Test cases for "small data" conversion and I/O."""
- def setUp(self):
- self.tmp_dir =
tempfile.mkdtemp(prefix=DataFrameConversionTests.__name__)
-
- def tearDown(self):
- shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
@property
def pdf(self):
return pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3])
+ @property
+ def psdf(self):
+ return ps.from_pandas(self.pdf)
+
@staticmethod
def strip_all_whitespace(str):
"""A helper function to remove all whitespace from a string."""
@@ -158,34 +154,36 @@ class DataFrameConversionTestsMixin:
pdf = pd.DataFrame({"a": [1], "b": ["a"]})
psdf = ps.DataFrame(pdf)
- psdf.to_json(self.tmp_dir, num_files=1)
- expected = pdf.to_json(orient="records")
+ with self.temp_dir() as dirpath:
+ psdf.to_json(dirpath, num_files=1)
+ expected = pdf.to_json(orient="records")
- output_paths = [path for path in os.listdir(self.tmp_dir) if
path.startswith("part-")]
- assert len(output_paths) > 0
- output_path = "%s/%s" % (self.tmp_dir, output_paths[0])
- self.assertEqual("[%s]" % open(output_path).read().strip(), expected)
+ output_paths = [path for path in os.listdir(dirpath) if
path.startswith("part-")]
+ assert len(output_paths) > 0
+ output_path = "%s/%s" % (dirpath, output_paths[0])
+ self.assertEqual("[%s]" % open(output_path).read().strip(),
expected)
def test_to_json_with_partition_cols(self):
pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
psdf = ps.DataFrame(pdf)
- psdf.to_json(self.tmp_dir, partition_cols="b", num_files=1)
-
- partition_paths = [path for path in os.listdir(self.tmp_dir) if
path.startswith("b=")]
- assert len(partition_paths) > 0
- for partition_path in partition_paths:
- column, value = partition_path.split("=")
- expected = pdf[pdf[column] == value].drop("b",
axis=1).to_json(orient="records")
-
- output_paths = [
- path
- for path in os.listdir("%s/%s" % (self.tmp_dir,
partition_path))
- if path.startswith("part-")
- ]
- assert len(output_paths) > 0
- output_path = "%s/%s/%s" % (self.tmp_dir, partition_path,
output_paths[0])
- self.assertEqual("[%s]" % open(output_path).read().strip(),
expected)
+ with self.temp_dir() as dirpath:
+ psdf.to_json(dirpath, partition_cols="b", num_files=1)
+
+ partition_paths = [path for path in os.listdir(dirpath) if
path.startswith("b=")]
+ assert len(partition_paths) > 0
+ for partition_path in partition_paths:
+ column, value = partition_path.split("=")
+ expected = pdf[pdf[column] == value].drop("b",
axis=1).to_json(orient="records")
+
+ output_paths = [
+ path
+ for path in os.listdir("%s/%s" % (dirpath, partition_path))
+ if path.startswith("part-")
+ ]
+ assert len(output_paths) > 0
+ output_path = "%s/%s/%s" % (dirpath, partition_path,
output_paths[0])
+ self.assertEqual("[%s]" % open(output_path).read().strip(),
expected)
@unittest.skipIf(
sys.platform == "linux" or sys.platform == "linux2",
@@ -258,13 +256,16 @@ class DataFrameConversionTestsMixin:
class DataFrameConversionTests(
- DataFrameConversionTestsMixin, ComparisonTestBase, SQLTestUtils, TestUtils
+ DataFrameConversionMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
+ TestUtils,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.test_dataframe_conversion import * # noqa: F401
+ from pyspark.pandas.tests.io.test_dataframe_conversion import * # noqa:
F401
try:
import xmlrunner
diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/io/test_dataframe_spark_io.py
similarity index 99%
rename from python/pyspark/pandas/tests/test_dataframe_spark_io.py
rename to python/pyspark/pandas/tests/io/test_dataframe_spark_io.py
index 020cded5a871..b8225b10f1c7 100644
--- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py
+++ b/python/pyspark/pandas/tests/io/test_dataframe_spark_io.py
@@ -468,12 +468,16 @@ class DataFrameSparkIOTestsMixin:
)
-class DataFrameSparkIOTests(DataFrameSparkIOTestsMixin, PandasOnSparkTestCase,
TestUtils):
+class DataFrameSparkIOTests(
+ DataFrameSparkIOTestsMixin,
+ PandasOnSparkTestCase,
+ TestUtils,
+):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.test_dataframe_spark_io import * # noqa: F401
+ from pyspark.pandas.tests.io.test_dataframe_spark_io import * # noqa: F401
try:
import xmlrunner
diff --git a/python/pyspark/pandas/tests/io/test_io.py b/python/pyspark/pandas/tests/io/test_io.py
index bb075ba4410a..d4e61319f229 100644
--- a/python/pyspark/pandas/tests/io/test_io.py
+++ b/python/pyspark/pandas/tests/io/test_io.py
@@ -24,7 +24,7 @@ import pandas as pd
from pyspark import pandas as ps
from pyspark.testing.pandasutils import (
have_tabulate,
- ComparisonTestBase,
+ PandasOnSparkTestCase,
tabulate_requirement_message,
)
from pyspark.testing.sqlutils import SQLTestUtils
@@ -145,7 +145,11 @@ class FrameIOMixin:
self.assert_eq(pdf_io.getvalue().split("\n")[1:],
psdf_io.getvalue().split("\n")[1:])
-class FrameIOTests(FrameIOMixin, ComparisonTestBase, SQLTestUtils):
+class FrameIOTests(
+ FrameIOMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
+):
pass
diff --git a/python/pyspark/pandas/tests/test_series_conversion.py b/python/pyspark/pandas/tests/io/test_series_conversion.py
similarity index 93%
rename from python/pyspark/pandas/tests/test_series_conversion.py
rename to python/pyspark/pandas/tests/io/test_series_conversion.py
index 926c641ebc9c..2ae40e92b489 100644
--- a/python/pyspark/pandas/tests/test_series_conversion.py
+++ b/python/pyspark/pandas/tests/io/test_series_conversion.py
@@ -63,12 +63,16 @@ class SeriesConversionTestsMixin:
self.assert_eq(psser.to_latex(decimal=","), pser.to_latex(decimal=","))
-class SeriesConversionTests(SeriesConversionTestsMixin, PandasOnSparkTestCase,
SQLTestUtils):
+class SeriesConversionTests(
+ SeriesConversionTestsMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
+):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.test_series_conversion import * # noqa: F401
+ from pyspark.pandas.tests.io.test_series_conversion import * # noqa: F401
try:
import xmlrunner
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]