ueshin commented on code in PR #54162:
URL: https://github.com/apache/spark/pull/54162#discussion_r2771780042
##########
python/pyspark/pandas/tests/groupby/test_stat.py:
##########
@@ -35,11 +39,23 @@ def _test_stat_func(self, func, check_exact=True):
# Against SeriesGroupBy
(pdf.groupby("A")["B"], psdf.groupby("A")["B"]),
]:
- self.assert_eq(
- func(p_groupby_obj).sort_index(),
- func(ps_groupby_obj).sort_index(),
- check_exact=check_exact,
- )
+ if expected_error is None:
+ self.assert_eq(
+ func(p_groupby_obj).sort_index(),
+ func(ps_groupby_obj).sort_index(),
+ check_exact=check_exact,
+ )
+ else:
+ error_type, error_message = expected_error
+ self.assertRaisesRegex(error_type, error_message, lambda:
func(p_groupby_obj))
+ self.assertRaisesRegex(error_type, error_message, lambda:
func(ps_groupby_obj))
+
+ @property
+ def expected_error_numeric_only(self) -> Optional[Tuple[Type[Exception],
str]]:
+ if LooseVersion(pd.__version__) >= LooseVersion("3.0"):
Review Comment:
Sure, let's basically do `<` check first for if-else.
##########
python/pyspark/pandas/tests/groupby/test_stat.py:
##########
@@ -14,18 +14,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+from typing import Optional, Tuple, Type
import numpy as np
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
class GroupbyStatTestingFuncMixin:
# TODO: All statistical functions should leverage this utility
- def _test_stat_func(self, func, check_exact=True):
+ def _test_stat_func(
+ self, func, check_exact=True, expected_error:
Optional[Tuple[Type[Exception], str]] = None
Review Comment:
That's fine to just check the error type.
##########
python/pyspark/pandas/tests/groupby/test_stat.py:
##########
@@ -35,11 +39,23 @@ def _test_stat_func(self, func, check_exact=True):
# Against SeriesGroupBy
(pdf.groupby("A")["B"], psdf.groupby("A")["B"]),
]:
- self.assert_eq(
- func(p_groupby_obj).sort_index(),
- func(ps_groupby_obj).sort_index(),
- check_exact=check_exact,
- )
+ if expected_error is None:
+ self.assert_eq(
+ func(p_groupby_obj).sort_index(),
+ func(ps_groupby_obj).sort_index(),
+ check_exact=check_exact,
+ )
+ else:
+ error_type, error_message = expected_error
+ self.assertRaisesRegex(error_type, error_message, lambda:
func(p_groupby_obj))
+ self.assertRaisesRegex(error_type, error_message, lambda:
func(ps_groupby_obj))
+
+ @property
+ def expected_error_numeric_only(self) -> Optional[Tuple[Type[Exception],
str]]:
Review Comment:
Let's keep it for now.
I don't think we will drop pandas 2 support so soon, then we will also put
pandas version check here and there for not-that-short time, although it
depends on how pandas API should behave. So far it will change the behavior
based on the installed pandas. Even if we only support pandas 3 soon, we can
remove this when we decide it.
##########
python/pyspark/pandas/tests/groupby/test_stat.py:
##########
@@ -60,21 +76,33 @@ def psdf(self):
def test_mean(self):
self._test_stat_func(lambda groupby_obj:
groupby_obj.mean(numeric_only=True))
+ if LooseVersion(pd.__version__) >= LooseVersion("3.0"):
Review Comment:
The error pandas 2 raises is different from what we check here. I don't
think we should check it for pandas 2, but pandas 3 raises the same error,
which I think we should check.
##########
python/pyspark/pandas/groupby.py:
##########
@@ -492,6 +493,7 @@ def first(self, numeric_only: Optional[bool] = False,
min_count: int = -1) -> Fr
a 1.0 True 3.0
b NaN None NaN
"""
+ validate_numeric_only(numeric_only)
Review Comment:
When we drop pandas 2 support, we only need to remove the version check in
the function; otherwise we need to remove it everywhere.
I'd rather keep it even after we drop pandas 2 support to make sure the
error is the same across all the places.
> BTW this should be a TypeError.
pandas 3 raises `ValueError` as we see in the tests. We should follow it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]