This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 113ac8e [SPARK-37657][FOLLOWUP][PYTHON] Separate the tests for pandas
< 1.1.0
113ac8e is described below
commit 113ac8e36b6210f7279703aaf58452a699dd78b2
Author: itholic <[email protected]>
AuthorDate: Thu Dec 30 14:11:18 2021 +0900
[SPARK-37657][FOLLOWUP][PYTHON] Separate the tests for pandas < 1.1.0
### What changes were proposed in this pull request?
This is a follow-up for SPARK-37657 to separate the tests based on the pandas
version, as the minimum supported pandas version is 1.0.5, but some tests
require pandas 1.1.0 or above.
For pandas < 1.1.0, we should test by manually creating the expected result
DataFrame.
### Why are the changes needed?
Because pandas < 1.1.0 doesn't support the `datetime_is_numeric` for
`(Series|DataFrame).describe()` as below:
```python
>>> pdf.describe(datetime_is_numeric=True)
Traceback (most recent call last):
...
TypeError: describe() got an unexpected keyword argument
'datetime_is_numeric'
```
But we support pandas 1.0.5 as minimum version.
### Does this PR introduce _any_ user-facing change?
No, it's test-only.
### How was this patch tested?
Unit tests
Closes #35063 from itholic/SPARK-37657-fu.
Authored-by: itholic <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/tests/test_dataframe.py | 298 +++++++++++++++++++++-----
1 file changed, 248 insertions(+), 50 deletions(-)
diff --git a/python/pyspark/pandas/tests/test_dataframe.py
b/python/pyspark/pandas/tests/test_dataframe.py
index 4d9467d..0a66d4c 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -5830,10 +5830,40 @@ class DataFrameTest(PandasOnSparkTestCase,
SQLTestUtils):
# warning and adopt the future behavior now.
# NOTE: Compare the result except percentiles, since we use
approximate percentile
# so the result is different from pandas.
- self.assert_eq(
- psdf.describe().loc[["count", "mean", "min", "max"]],
- pdf.describe(datetime_is_numeric=True).astype(str).loc[["count",
"mean", "min", "max"]],
- )
+ if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
+ self.assert_eq(
+ psdf.describe().loc[["count", "mean", "min", "max"]],
+ pdf.describe(datetime_is_numeric=True)
+ .astype(str)
+ .loc[["count", "mean", "min", "max"]],
+ )
+ else:
+ self.assert_eq(
+ psdf.describe(),
+ ps.DataFrame(
+ {
+ "A": [
+ "4",
+ "2021-07-16 18:00:00",
+ "2020-10-20 00:00:00",
+ "2020-10-20 00:00:00",
+ "2021-06-02 00:00:00",
+ "2021-06-02 00:00:00",
+ "2022-07-11 00:00:00",
+ ],
+ "B": [
+ "4",
+ "2024-08-02 18:00:00",
+ "2021-11-20 00:00:00",
+ "2021-11-20 00:00:00",
+ "2023-06-02 00:00:00",
+ "2026-07-11 00:00:00",
+ "2026-07-11 00:00:00",
+ ],
+ },
+ index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+ ),
+ )
# String & timestamp columns
psdf = ps.DataFrame(
@@ -5848,16 +5878,45 @@ class DataFrameTest(PandasOnSparkTestCase,
SQLTestUtils):
}
)
pdf = psdf.to_pandas()
- self.assert_eq(
- psdf.describe().loc[["count", "mean", "min", "max"]],
- pdf.describe(datetime_is_numeric=True).astype(str).loc[["count",
"mean", "min", "max"]],
- )
- psdf.A += psdf.A
- pdf.A += pdf.A
- self.assert_eq(
- psdf.describe().loc[["count", "mean", "min", "max"]],
- pdf.describe(datetime_is_numeric=True).astype(str).loc[["count",
"mean", "min", "max"]],
- )
+ if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
+ self.assert_eq(
+ psdf.describe().loc[["count", "mean", "min", "max"]],
+ pdf.describe(datetime_is_numeric=True)
+ .astype(str)
+ .loc[["count", "mean", "min", "max"]],
+ )
+ psdf.A += psdf.A
+ pdf.A += pdf.A
+ self.assert_eq(
+ psdf.describe().loc[["count", "mean", "min", "max"]],
+ pdf.describe(datetime_is_numeric=True)
+ .astype(str)
+ .loc[["count", "mean", "min", "max"]],
+ )
+ else:
+ expected_result = ps.DataFrame(
+ {
+ "B": [
+ "4",
+ "2024-08-02 18:00:00",
+ "2021-11-20 00:00:00",
+ "2021-11-20 00:00:00",
+ "2023-06-02 00:00:00",
+ "2026-07-11 00:00:00",
+ "2026-07-11 00:00:00",
+ ]
+ },
+ index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+ )
+ self.assert_eq(
+ psdf.describe(),
+ expected_result,
+ )
+ psdf.A += psdf.A
+ self.assert_eq(
+ psdf.describe(),
+ expected_result,
+ )
# Numeric & timestamp columns
psdf = ps.DataFrame(
@@ -5872,20 +5931,61 @@ class DataFrameTest(PandasOnSparkTestCase,
SQLTestUtils):
}
)
pdf = psdf.to_pandas()
- pandas_result = pdf.describe(datetime_is_numeric=True)
- pandas_result.B = pandas_result.B.astype(str)
- self.assert_eq(
- psdf.describe().loc[["count", "mean", "min", "max"]],
- pandas_result.loc[["count", "mean", "min", "max"]],
- )
- psdf.A += psdf.A
- pdf.A += pdf.A
- pandas_result = pdf.describe(datetime_is_numeric=True)
- pandas_result.B = pandas_result.B.astype(str)
- self.assert_eq(
- psdf.describe().loc[["count", "mean", "min", "max"]],
- pandas_result.loc[["count", "mean", "min", "max"]],
- )
+ if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
+ pandas_result = pdf.describe(datetime_is_numeric=True)
+ pandas_result.B = pandas_result.B.astype(str)
+ self.assert_eq(
+ psdf.describe().loc[["count", "mean", "min", "max"]],
+ pandas_result.loc[["count", "mean", "min", "max"]],
+ )
+ psdf.A += psdf.A
+ pdf.A += pdf.A
+ pandas_result = pdf.describe(datetime_is_numeric=True)
+ pandas_result.B = pandas_result.B.astype(str)
+ self.assert_eq(
+ psdf.describe().loc[["count", "mean", "min", "max"]],
+ pandas_result.loc[["count", "mean", "min", "max"]],
+ )
+ else:
+ self.assert_eq(
+ psdf.describe(),
+ ps.DataFrame(
+ {
+ "A": [4, 2, 1, 1, 2, 2, 3, 0.816497],
+ "B": [
+ "4",
+ "2024-08-02 18:00:00",
+ "2021-11-20 00:00:00",
+ "2021-11-20 00:00:00",
+ "2023-06-02 00:00:00",
+ "2026-07-11 00:00:00",
+ "2026-07-11 00:00:00",
+ "None",
+ ],
+ },
+ index=["count", "mean", "min", "25%", "50%", "75%", "max",
"std"],
+ ),
+ )
+ psdf.A += psdf.A
+ self.assert_eq(
+ psdf.describe(),
+ ps.DataFrame(
+ {
+ "A": [4, 4, 2, 2, 4, 4, 6, 1.632993],
+ "B": [
+ "4",
+ "2024-08-02 18:00:00",
+ "2021-11-20 00:00:00",
+ "2021-11-20 00:00:00",
+ "2023-06-02 00:00:00",
+ "2026-07-11 00:00:00",
+ "2026-07-11 00:00:00",
+ "None",
+ ],
+ },
+ index=["count", "mean", "min", "25%", "50%", "75%", "max",
"std"],
+ ),
+ )
# Include None column
psdf = ps.DataFrame(
@@ -5896,12 +5996,33 @@ class DataFrameTest(PandasOnSparkTestCase,
SQLTestUtils):
}
)
pdf = psdf.to_pandas()
- pandas_result = pdf.describe(datetime_is_numeric=True)
- pandas_result.b = pandas_result.b.astype(str)
- self.assert_eq(
- psdf.describe().loc[["count", "mean", "min", "max"]],
- pandas_result.loc[["count", "mean", "min", "max"]],
- )
+ if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
+ pandas_result = pdf.describe(datetime_is_numeric=True)
+ pandas_result.b = pandas_result.b.astype(str)
+ self.assert_eq(
+ psdf.describe().loc[["count", "mean", "min", "max"]],
+ pandas_result.loc[["count", "mean", "min", "max"]],
+ )
+ else:
+ self.assert_eq(
+ psdf.describe(),
+ ps.DataFrame(
+ {
+ "a": [3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 3.0, 1.0],
+ "b": [
+ "3",
+ "1970-01-01 00:00:00.000001",
+ "1970-01-01 00:00:00.000001",
+ "1970-01-01 00:00:00.000001",
+ "1970-01-01 00:00:00.000001",
+ "1970-01-01 00:00:00.000001",
+ "1970-01-01 00:00:00.000001",
+ "None",
+ ],
+ },
+ index=["count", "mean", "min", "25%", "50%", "75%", "max",
"std"],
+ ),
+ )
msg = r"Percentiles should all be in the interval \[0, 1\]"
with self.assertRaisesRegex(ValueError, msg):
@@ -5947,23 +6068,81 @@ class DataFrameTest(PandasOnSparkTestCase,
SQLTestUtils):
pdf = psdf.to_pandas()
# For timestamp type, we should convert NaT to None in pandas result
# since pandas API on Spark doesn't support the NaT for object type.
- pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
- self.assert_eq(
- psdf[psdf.a != psdf.a].describe(),
- pdf_result.where(pdf_result.notnull(), None).astype(str),
- )
+ if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
+ pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
+ self.assert_eq(
+ psdf[psdf.a != psdf.a].describe(),
+ pdf_result.where(pdf_result.notnull(), None).astype(str),
+ )
+ else:
+ self.assert_eq(
+ psdf[psdf.a != psdf.a].describe(),
+ ps.DataFrame(
+ {
+ "a": [
+ "0",
+ "None",
+ "None",
+ "None",
+ "None",
+ "None",
+ "None",
+ ],
+ "b": [
+ "0",
+ "None",
+ "None",
+ "None",
+ "None",
+ "None",
+ "None",
+ ],
+ },
+ index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+ ),
+ )
# Explicit empty DataFrame numeric & timestamp
psdf = ps.DataFrame(
{"a": [1, 2, 3], "b": [pd.Timestamp(1), pd.Timestamp(1),
pd.Timestamp(1)]}
)
pdf = psdf.to_pandas()
- pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
- pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(),
None).astype(str)
- self.assert_eq(
- psdf[psdf.a != psdf.a].describe(),
- pdf_result,
- )
+ if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
+ pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
+ pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(),
None).astype(str)
+ self.assert_eq(
+ psdf[psdf.a != psdf.a].describe(),
+ pdf_result,
+ )
+ else:
+ self.assert_eq(
+ psdf[psdf.a != psdf.a].describe(),
+ ps.DataFrame(
+ {
+ "a": [
+ 0,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ ],
+ "b": [
+ "0",
+ "None",
+ "None",
+ "None",
+ "None",
+ "None",
+ "None",
+ "None",
+ ],
+ },
+ index=["count", "mean", "min", "25%", "50%", "75%", "max",
"std"],
+ ),
+ )
# Explicit empty DataFrame numeric & string
psdf = ps.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
@@ -5978,11 +6157,30 @@ class DataFrameTest(PandasOnSparkTestCase,
SQLTestUtils):
{"a": ["a", "b", "c"], "b": [pd.Timestamp(1), pd.Timestamp(1),
pd.Timestamp(1)]}
)
pdf = psdf.to_pandas()
- pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
- self.assert_eq(
- psdf[psdf.a != psdf.a].describe(),
- pdf_result.where(pdf_result.notnull(), None).astype(str),
- )
+ if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
+ pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
+ self.assert_eq(
+ psdf[psdf.a != psdf.a].describe(),
+ pdf_result.where(pdf_result.notnull(), None).astype(str),
+ )
+ else:
+ self.assert_eq(
+ psdf[psdf.a != psdf.a].describe(),
+ ps.DataFrame(
+ {
+ "b": [
+ "0",
+ "None",
+ "None",
+ "None",
+ "None",
+ "None",
+ "None",
+ ],
+ },
+ index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+ ),
+ )
def test_getitem_with_none_key(self):
psdf = self.psdf
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]