This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new b804b53e4a5 [SPARK-41659][CONNECT] Enable doctests in pyspark.sql.connect.readwriter
b804b53e4a5 is described below

commit b804b53e4a5b971bf2db676f4d552fff383ab586
Author: Sandeep Singh <sand...@techaddict.me>
AuthorDate: Tue Jan 3 09:41:57 2023 +0900

    [SPARK-41659][CONNECT] Enable doctests in pyspark.sql.connect.readwriter

    ### What changes were proposed in this pull request?
    This PR proposes to enable doctests in pyspark.sql.connect.readwriter, which is virtually the same as pyspark.sql.readwriter.

    ### Why are the changes needed?
    To ensure PySpark compatibility and test coverage.

    ### Does this PR introduce any user-facing change?
    No, doctests only.

    ### How was this patch tested?
    New doctests added.

    Closes #39331 from techaddict/SPARK-41659-pyspark.sql.connect.readwriter.

    Authored-by: Sandeep Singh <sand...@techaddict.me>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 dev/sparktestsupport/modules.py          |  1 +
 python/pyspark/sql/connect/plan.py       |  2 +-
 python/pyspark/sql/connect/readwriter.py | 60 ++++++++++++++++++++++++++++++++
 python/pyspark/sql/readwriter.py         | 18 +++++-----
 4 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 99f1cc6894f..0eeb3dd9218 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -509,6 +509,7 @@ pyspark_connect = Module(
         "pyspark.sql.connect.session",
         "pyspark.sql.connect.window",
         "pyspark.sql.connect.column",
+        "pyspark.sql.connect.readwriter",
         # unittests
         "pyspark.sql.tests.connect.test_connect_plan",
         "pyspark.sql.tests.connect.test_connect_basic",
diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index f10687cc82e..48a8fa598e7 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -1264,7 +1264,7 @@ class WriteOperation(LogicalPlan):

         for k in self.options:
             if self.options[k] is None:
-                del plan.write_operation.options[k]
+                plan.write_operation.options.pop(k, None)
             else:
                 plan.write_operation.options[k] = cast(str, self.options[k])

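A note on the plan.py hunk above: "del" raises KeyError when the key is absent, while pop(k, None) is a no-op in that case; protobuf map fields behave like a MutableMapping, so the same distinction applies to write_operation.options. A minimal sketch of the pattern, using plain dicts in place of the actual plan objects (illustration only, not Spark code):

    # options carries user-supplied values; None means "unset this option".
    options = {"path": "/tmp/out", "compression": None}
    plan_options = {}

    for k, v in options.items():
        if v is None:
            # del plan_options[k] would raise KeyError for a key that was
            # never set; pop(k, None) removes it only if present.
            plan_options.pop(k, None)
        else:
            plan_options[k] = str(v)

    print(plan_options)  # {'path': '/tmp/out'}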
diff --git a/python/pyspark/sql/connect/readwriter.py b/python/pyspark/sql/connect/readwriter.py
index c3af39f864c..207ad8df74f 100644
--- a/python/pyspark/sql/connect/readwriter.py
+++ b/python/pyspark/sql/connect/readwriter.py
@@ -20,6 +20,7 @@ from typing import Dict
 from typing import Optional, Union, List, overload, Tuple, cast, Any
 from typing import TYPE_CHECKING

+from pyspark import SparkContext, SparkConf
 from pyspark.sql.connect.plan import Read, DataSource, LogicalPlan, WriteOperation
 from pyspark.sql.types import StructType
 from pyspark.sql.utils import to_str
@@ -458,3 +459,62 @@ class DataFrameWriter(OptionUtils):

     def jdbc(self, *args: Any, **kwargs: Any) -> None:
         raise NotImplementedError("jdbc() not supported for DataFrameWriter")
+
+
+def _test() -> None:
+    import os
+    import sys
+    import doctest
+    from pyspark.sql import SparkSession as PySparkSession
+    from pyspark.testing.connectutils import should_test_connect, connect_requirement_message
+
+    os.chdir(os.environ["SPARK_HOME"])
+
+    if should_test_connect:
+        import pyspark.sql.connect.readwriter
+
+        globs = pyspark.sql.connect.readwriter.__dict__.copy()
+        # Workaround to create a regular Spark session
+        sc = SparkContext("local[4]", "sql.connect.readwriter tests", conf=SparkConf())
+        globs["_spark"] = PySparkSession(
+            sc, options={"spark.app.name": "sql.connect.readwriter tests"}
+        )
+
+        # TODO(SPARK-41817): Support reading with schema
+        del pyspark.sql.connect.readwriter.DataFrameReader.load.__doc__
+        del pyspark.sql.connect.readwriter.DataFrameReader.option.__doc__
+        del pyspark.sql.connect.readwriter.DataFrameWriter.csv.__doc__
+        del pyspark.sql.connect.readwriter.DataFrameWriter.option.__doc__
+        del pyspark.sql.connect.readwriter.DataFrameWriter.text.__doc__
+        del pyspark.sql.connect.readwriter.DataFrameWriter.bucketBy.__doc__
+        del pyspark.sql.connect.readwriter.DataFrameWriter.sortBy.__doc__
+
+        # TODO(SPARK-41818): Support saveAsTable
+        del pyspark.sql.connect.readwriter.DataFrameWriter.insertInto.__doc__
+        del pyspark.sql.connect.readwriter.DataFrameWriter.saveAsTable.__doc__
+
+        # Creates a remote Spark session.
+        os.environ["SPARK_REMOTE"] = "sc://localhost"
+        globs["spark"] = PySparkSession.builder.remote("sc://localhost").getOrCreate()
+
+        (failure_count, test_count) = doctest.testmod(
+            pyspark.sql.connect.readwriter,
+            globs=globs,
+            optionflags=doctest.ELLIPSIS
+            | doctest.NORMALIZE_WHITESPACE
+            | doctest.IGNORE_EXCEPTION_DETAIL,
+        )
+
+        globs["spark"].stop()
+        globs["_spark"].stop()
+        if failure_count:
+            sys.exit(-1)
+    else:
+        print(
+            f"Skipping pyspark.sql.connect.readwriter doctests: {connect_requirement_message}",
+            file=sys.stderr,
+        )
+
+
+if __name__ == "__main__":
+    _test()
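One mechanism in the _test() harness above is worth spelling out: doctest.testmod only collects examples from docstrings, so deleting a method's __doc__ is enough to skip its doctests until the corresponding feature lands (here, SPARK-41817 and SPARK-41818). A minimal sketch of that trick with a hypothetical class, unrelated to Spark:

    import doctest

    class Greeter:
        def hello(self) -> str:
            """
            >>> Greeter().hello()
            'hello'
            """
            return "hello"

    # Pretend hello() is not supported yet: dropping the docstring removes
    # its examples from doctest collection entirely.
    del Greeter.hello.__doc__

    results = doctest.testmod(verbose=False)
    print(results.attempted)  # 0 -- nothing was collected, nothing failed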
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index cc50d7cb0a5..8ecb0cd6dcd 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -90,7 +90,7 @@ class DataFrameReader(OptionUtils):
         Examples
         --------
         >>> spark.read.format('json')
-        <pyspark.sql.readwriter.DataFrameReader object ...>
+        <...readwriter.DataFrameReader object ...>

         Write a DataFrame into a JSON file and read it back.

@@ -133,7 +133,7 @@ class DataFrameReader(OptionUtils):
         Examples
         --------
         >>> spark.read.schema("col0 INT, col1 DOUBLE")
-        <pyspark.sql.readwriter.DataFrameReader object ...>
+        <...readwriter.DataFrameReader object ...>

         Specify the schema with reading a CSV file.

@@ -177,7 +177,7 @@
         >>> from pyspark.sql import SparkSession
         >>> spark = SparkSession.builder.master("local").getOrCreate()
         >>> spark.read.option("key", "value")
-        <pyspark.sql.readwriter.DataFrameReader object ...>
+        <...readwriter.DataFrameReader object ...>

         Specify the option 'nullValue' with reading a CSV file.

@@ -216,7 +216,7 @@
         Examples
         --------
         >>> spark.read.option("key", "value")
-        <pyspark.sql.readwriter.DataFrameReader object ...>
+        <...readwriter.DataFrameReader object ...>

         Specify the option 'nullValue' and 'header' with reading a CSV file.

@@ -962,10 +962,10 @@ class DataFrameWriter(OptionUtils):
         >>> with tempfile.TemporaryDirectory() as d:
         ...     spark.createDataFrame(
         ...         [{"age": 80, "name": "Xinrong Meng"}]
-        ...     ).write.mode("error").format("parquet").save(d)
+        ...     ).write.mode("error").format("parquet").save(d) # doctest: +SKIP
         Traceback (most recent call last):
             ...
-        pyspark.sql.utils.AnalysisException: ...
+        ...AnalysisException: ...

         Write a Parquet file back with various options, and read it back.

@@ -1016,7 +1016,7 @@ class DataFrameWriter(OptionUtils):
         Examples
         --------
         >>> spark.range(1).write.format('parquet')
-        <pyspark.sql.readwriter.DataFrameWriter object ...>
+        <...readwriter.DataFrameWriter object ...>

         Write a DataFrame into a Parquet file and read it back.

@@ -1057,7 +1057,7 @@ class DataFrameWriter(OptionUtils):
         Examples
         --------
         >>> spark.range(1).write.option("key", "value")
-        <pyspark.sql.readwriter.DataFrameWriter object ...>
+        <...readwriter.DataFrameWriter object ...>

         Specify the option 'nullValue' with writing a CSV file.

@@ -1096,7 +1096,7 @@ class DataFrameWriter(OptionUtils):
         Examples
         --------
         >>> spark.range(1).write.option("key", "value")
-        <pyspark.sql.readwriter.DataFrameWriter object ...>
+        <...readwriter.DataFrameWriter object ...>

         Specify the option 'nullValue' and 'header' with writing a CSV file.
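Finally, the readwriter.py changes above relax the expected reprs from <pyspark.sql.readwriter.DataFrameReader object ...> to <...readwriter.DataFrameReader object ...> because the doctests run with doctest.ELLIPSIS, under which "..." in expected output matches any text; the same docstrings can therefore pass against both the classic classes and their pyspark.sql.connect counterparts. A small sketch of that matching behavior (the repr strings below are made up for illustration):

    import doctest

    checker = doctest.OutputChecker()
    want = "<...readwriter.DataFrameReader object ...>\n"
    for got in (
        "<pyspark.sql.readwriter.DataFrameReader object at 0x7fd1c0e7a2c0>\n",
        "<pyspark.sql.connect.readwriter.DataFrameReader object at 0x10f3b2e50>\n",
    ):
        # check_output(want, got, optionflags) applies ELLIPSIS matching.
        print(checker.check_output(want, got, doctest.ELLIPSIS))  # True, True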