This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c3b849a16c3 [SPARK-41157][CONNECT][PYTHON][TEST] Show detailed differences in dataframe comparison
c3b849a16c3 is described below
commit c3b849a16c3cde4527c12f5d9fdd23a50c242d0f
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Wed Nov 16 18:01:07 2022 +0900
[SPARK-41157][CONNECT][PYTHON][TEST] Show detailed differences in dataframe comparison
### What changes were proposed in this pull request?
Use `assert_eq` from `PandasOnSparkTestCase` to compare DataFrames.
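For illustration only (not part of the change itself), a minimal standalone sketch of the pattern the tests switch to; a plain local session stands in here for the `self.connect` / `self.spark` fixtures that `SparkConnectSQLTestCase` provides:
```
import unittest

from pyspark.sql import SparkSession
from pyspark.testing.pandasutils import PandasOnSparkTestCase


class RangeComparisonTest(PandasOnSparkTestCase):
    """Hypothetical example test; the real suite gets its sessions from
    the SparkConnectSQLTestCase fixture instead of building one here."""

    def test_range_roundtrip(self):
        session = SparkSession.builder.master("local[1]").getOrCreate()
        left = session.range(start=0, end=10).toPandas()
        right = session.range(start=0, end=10).toPandas()
        # On mismatch, assert_eq raises an AssertionError with a per-column
        # report, unlike assertTrue(left.equals(right)), which only says
        # "False is not true".
        self.assert_eq(left, right)


if __name__ == "__main__":
    unittest.main()
```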
### Why are the changes needed?
Show a detailed error message when a DataFrame comparison fails.
before:
```
======================================================================
ERROR [0.667s]: test_fill_na (pyspark.sql.tests.connect.test_connect_basic.SparkConnectTests)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/home/jenkins/python/pyspark/sql/tests/connect/test_connect_basic.py", line 244, in test_fill_na
self.assertTrue(
AssertionError: False is not true
----------------------------------------------------------------------
```
after:
```
AssertionError: DataFrame.iloc[:, 0] (column name="id") are different
DataFrame.iloc[:, 0] (column name="id") values are different (100.0 %)
[index]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[left]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[right]: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Left:
id
id int64
dtype: object
Right:
id
id int64
dtype: object
```
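The detailed report above comes from pandas' own comparison utilities (the assumption here is that `assert_eq` ultimately delegates to a pandas-level check such as `assert_frame_equal`); a minimal pandas-only sketch reproduces both failure styles:
```
import pandas as pd
from pandas.testing import assert_frame_equal

left = pd.DataFrame({"id": range(0, 10)})
right = pd.DataFrame({"id": range(1, 11)})

# "before" style: DataFrame.equals only yields a boolean, so a failing
# assertTrue(...) reports nothing beyond "False is not true".
print(left.equals(right))  # False

# "after" style: assert_frame_equal raises an AssertionError that names the
# differing column, the mismatch percentage, and the left/right values,
# similar to the output shown above.
try:
    assert_frame_equal(left, right)
except AssertionError as exc:
    print(exc)
```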
### Does this PR introduce _any_ user-facing change?
No, this is a test-only change.
### How was this patch tested?
Existing unit tests.
Closes #38670 from zhengruifeng/connect_test_df_equal.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../sql/tests/connect/test_connect_basic.py | 56 +++++++++-------------
1 file changed, 23 insertions(+), 33 deletions(-)
diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py
index 90a534d599e..d31e472faec 100644
--- a/python/pyspark/sql/tests/connect/test_connect_basic.py
+++ b/python/pyspark/sql/tests/connect/test_connect_basic.py
@@ -36,11 +36,12 @@ if have_pandas:
from pyspark.sql.connect.functions import lit
from pyspark.sql.dataframe import DataFrame
from pyspark.testing.connectutils import should_test_connect, connect_requirement_message
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.utils import ReusedPySparkTestCase
@unittest.skipIf(not should_test_connect, connect_requirement_message)
-class SparkConnectSQLTestCase(ReusedPySparkTestCase, SQLTestUtils):
+class SparkConnectSQLTestCase(PandasOnSparkTestCase, ReusedPySparkTestCase, SQLTestUtils):
"""Parent test fixture class for all Spark Connect related
test cases."""
@@ -192,20 +193,17 @@ class SparkConnectTests(SparkConnectSQLTestCase):
self.assertTrue("special_alias" in plan_text)
def test_range(self):
- self.assertTrue(
- self.connect.range(start=0, end=10)
- .toPandas()
- .equals(self.spark.range(start=0, end=10).toPandas())
+ self.assert_eq(
+ self.connect.range(start=0, end=10).toPandas(),
+ self.spark.range(start=0, end=10).toPandas(),
)
- self.assertTrue(
- self.connect.range(start=0, end=10, step=3)
- .toPandas()
- .equals(self.spark.range(start=0, end=10, step=3).toPandas())
+ self.assert_eq(
+ self.connect.range(start=0, end=10, step=3).toPandas(),
+ self.spark.range(start=0, end=10, step=3).toPandas(),
)
- self.assertTrue(
- self.connect.range(start=0, end=10, step=3, numPartitions=2)
- .toPandas()
- .equals(self.spark.range(start=0, end=10, step=3, numPartitions=2).toPandas())
+ self.assert_eq(
+ self.connect.range(start=0, end=10, step=3, numPartitions=2).toPandas(),
+ self.spark.range(start=0, end=10, step=3, numPartitions=2).toPandas(),
)
def test_create_global_temp_view(self):
@@ -235,29 +233,21 @@ class SparkConnectTests(SparkConnectSQLTestCase):
# | null| 3| 3.0|
# +-----+----+----+
- self.assertTrue(
- self.connect.sql(query)
- .fillna(True)
- .toPandas()
- .equals(self.spark.sql(query).fillna(True).toPandas())
+ self.assert_eq(
+ self.connect.sql(query).fillna(True).toPandas(),
+ self.spark.sql(query).fillna(True).toPandas(),
)
- self.assertTrue(
- self.connect.sql(query)
- .fillna(2)
- .toPandas()
- .equals(self.spark.sql(query).fillna(2).toPandas())
+ self.assert_eq(
+ self.connect.sql(query).fillna(2).toPandas(),
+ self.spark.sql(query).fillna(2).toPandas(),
)
- self.assertTrue(
- self.connect.sql(query)
- .fillna(2, ["a", "b"])
- .toPandas()
- .equals(self.spark.sql(query).fillna(2, ["a", "b"]).toPandas())
+ self.assert_eq(
+ self.connect.sql(query).fillna(2, ["a", "b"]).toPandas(),
+ self.spark.sql(query).fillna(2, ["a", "b"]).toPandas(),
)
- self.assertTrue(
- self.connect.sql(query)
- .na.fill({"a": True, "b": 2})
- .toPandas()
- .equals(self.spark.sql(query).na.fill({"a": True, "b": 2}).toPandas())
+ self.assert_eq(
+ self.connect.sql(query).na.fill({"a": True, "b": 2}).toPandas(),
+ self.spark.sql(query).na.fill({"a": True, "b": 2}).toPandas(),
)
def test_empty_dataset(self):
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]