This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 840620fbda1 [SPARK-41656][CONNECT][TESTS] Enable doctests in pyspark.sql.connect.dataframe
840620fbda1 is described below
commit 840620fbda1fa7b01c8bea8d327a8b5d96f9f9ad
Author: Sandeep Singh <[email protected]>
AuthorDate: Tue Jan 3 11:10:11 2023 +0900
[SPARK-41656][CONNECT][TESTS] Enable doctests in pyspark.sql.connect.dataframe
### What changes were proposed in this pull request?
This PR proposes to enable doctests in pyspark.sql.connect.dataframe, which is virtually the same as pyspark.sql.dataframe.
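For context, a doctest is simply an interactive example embedded in a docstring; the test runner executes it and compares the output against the expected result shown inline. A minimal sketch of the kind of example that now runs against Spark Connect (illustrative only, not copied verbatim from the module):

    >>> df = spark.range(1)
    >>> df.count()
    1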
### Why are the changes needed?
To ensure PySpark compatibility and improve test coverage.
### Does this PR introduce any user-facing change?
No, doctests only.
### How was this patch tested?
New doctests added.
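For reference, the doctests can be exercised locally either by running the module directly (it calls _test() when executed as a script, which assumes SPARK_HOME is set and a Spark Connect server is reachable at sc://localhost) or through the PySpark test runner with the module name registered in dev/sparktestsupport/modules.py; the exact invocation may vary by environment:

    python python/pyspark/sql/connect/dataframe.py
    python/run-tests --testnames 'pyspark.sql.connect.dataframe'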
Closes #39346 from techaddict/SPARK-41656-pyspark.sql.connect.dataframe.
Authored-by: Sandeep Singh <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
dev/sparktestsupport/modules.py | 1 +
python/pyspark/sql/connect/dataframe.py | 99 ++++++++++++++++++++++++++++++++-
python/pyspark/sql/dataframe.py | 10 ++--
3 files changed, 104 insertions(+), 6 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 0eeb3dd9218..2c399174c13 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -510,6 +510,7 @@ pyspark_connect = Module(
         "pyspark.sql.connect.window",
         "pyspark.sql.connect.column",
         "pyspark.sql.connect.readwriter",
+        "pyspark.sql.connect.dataframe",
         # unittests
         "pyspark.sql.tests.connect.test_connect_plan",
         "pyspark.sql.tests.connect.test_connect_basic",
diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py
index 95582e86390..0a69b6317f8 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -35,7 +35,7 @@ import pandas
import warnings
from collections.abc import Iterable
-from pyspark import _NoValue
+from pyspark import _NoValue, SparkContext, SparkConf
from pyspark._globals import _NoValueType
from pyspark.sql.types import DataType, StructType, Row
@@ -1373,3 +1373,100 @@ class DataFrameStatFunctions:
DataFrameStatFunctions.__doc__ = PySparkDataFrameStatFunctions.__doc__
+
+
+def _test() -> None:
+    import os
+    import sys
+    import doctest
+    from pyspark.sql import SparkSession as PySparkSession
+    from pyspark.testing.connectutils import should_test_connect, connect_requirement_message
+
+    os.chdir(os.environ["SPARK_HOME"])
+
+    if should_test_connect:
+        import pyspark.sql.connect.dataframe
+
+        globs = pyspark.sql.connect.dataframe.__dict__.copy()
+        # Works around to create a regular Spark session
+        sc = SparkContext("local[4]", "sql.connect.dataframe tests", conf=SparkConf())
+        globs["_spark"] = PySparkSession(
+            sc, options={"spark.app.name": "sql.connect.dataframe tests"}
+        )
+
+        # TODO(SPARK-41819): Implement RDD.getNumPartitions
+        del pyspark.sql.connect.dataframe.DataFrame.coalesce.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.repartition.__doc__
+
+        # TODO(SPARK-41820): Fix SparkConnectException: requirement failed
+        del pyspark.sql.connect.dataframe.DataFrame.createOrReplaceGlobalTempView.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.createOrReplaceTempView.__doc__
+
+        # TODO(SPARK-41821): Fix DataFrame.describe
+        del pyspark.sql.connect.dataframe.DataFrame.describe.__doc__
+
+        # TODO(SPARK-41823): ambiguous column names
+        del pyspark.sql.connect.dataframe.DataFrame.drop.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.join.__doc__
+
+        # TODO(SPARK-41824): DataFrame.explain format is different
+        del pyspark.sql.connect.dataframe.DataFrame.explain.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.hint.__doc__
+
+        # TODO(SPARK-41825): Dataframe.show formatting int as double
+        del pyspark.sql.connect.dataframe.DataFrame.fillna.__doc__
+        del pyspark.sql.connect.dataframe.DataFrameNaFunctions.replace.__doc__
+        del pyspark.sql.connect.dataframe.DataFrameNaFunctions.fill.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.replace.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.intersect.__doc__
+
+        # TODO(SPARK-41826): Implement Dataframe.readStream
+        del pyspark.sql.connect.dataframe.DataFrame.isStreaming.__doc__
+
+        # TODO(SPARK-41827): groupBy requires all cols be Column or str
+        del pyspark.sql.connect.dataframe.DataFrame.groupBy.__doc__
+
+        # TODO(SPARK-41828): Implement creating empty DataFrame
+        del pyspark.sql.connect.dataframe.DataFrame.isEmpty.__doc__
+
+        # TODO(SPARK-41829): Add Dataframe sort ordering
+        del pyspark.sql.connect.dataframe.DataFrame.sort.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.sortWithinPartitions.__doc__
+
+        # TODO(SPARK-41830): fix sample parameters
+        del pyspark.sql.connect.dataframe.DataFrame.sample.__doc__
+
+        # TODO(SPARK-41831): fix transform to accept ColumnReference
+        del pyspark.sql.connect.dataframe.DataFrame.transform.__doc__
+
+        # TODO(SPARK-41832): fix unionByName
+        del pyspark.sql.connect.dataframe.DataFrame.unionByName.__doc__
+
+        # TODO(SPARK-41818): Support saveAsTable
+        del pyspark.sql.connect.dataframe.DataFrame.write.__doc__
+
+        # Creates a remote Spark session.
+        os.environ["SPARK_REMOTE"] = "sc://localhost"
+        globs["spark"] = PySparkSession.builder.remote("sc://localhost").getOrCreate()
+
+        (failure_count, test_count) = doctest.testmod(
+            pyspark.sql.connect.dataframe,
+            globs=globs,
+            optionflags=doctest.ELLIPSIS
+            | doctest.NORMALIZE_WHITESPACE
+            | doctest.IGNORE_EXCEPTION_DETAIL,
+        )
+
+        globs["spark"].stop()
+        globs["_spark"].stop()
+        if failure_count:
+            sys.exit(-1)
+    else:
+        print(
+            f"Skipping pyspark.sql.connect.dataframe doctests: {connect_requirement_message}",
+            file=sys.stderr,
+        )
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 7f05d49bded..e3646cd7d95 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -189,7 +189,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
--------
>>> df = spark.range(1)
>>> type(df.sparkSession)
- <class 'pyspark.sql.session.SparkSession'>
+ <class '...session.SparkSession'>
"""
return self._session
@@ -233,7 +233,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
--------
>>> df = spark.sql("SELECT 1 AS c1, int(NULL) AS c2")
>>> type(df.na)
- <class 'pyspark.sql.dataframe.DataFrameNaFunctions'>
+ <class '...dataframe.DataFrameNaFunctions'>
Replace the missing values as 2.
@@ -264,7 +264,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
>>> import pyspark.sql.functions as f
>>> df = spark.range(3).withColumn("c", f.expr("id + 1"))
>>> type(df.stat)
- <class 'pyspark.sql.dataframe.DataFrameStatFunctions'>
+ <class '...dataframe.DataFrameStatFunctions'>
>>> df.stat.corr("id", "c")
1.0
"""
@@ -355,7 +355,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
Throw an exception if the table already exists.
- >>> df.createTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL
+ >>> df.createTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL, +SKIP
Traceback (most recent call last):
...
AnalysisException: "Temporary table 'people' already exists;"
@@ -438,7 +438,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
Throws an exception if the global temporary view already exists.
- >>> df.createGlobalTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL
+ >>> df.createGlobalTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL, +SKIP
Traceback (most recent call last):
...
AnalysisException: "Temporary table 'people' already exists;"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]