TheNeuralBit commented on a change in pull request #14438:
URL: https://github.com/apache/beam/pull/14438#discussion_r616216141
##########
File path: sdks/python/apache_beam/dataframe/frames.py
##########
@@ -922,33 +923,79 @@ def dropna(self, **kwargs):
to_string = frame_base.wont_implement_method(
pd.Series, 'to_string', reason="non-deferred-result")
- def aggregate(self, func, axis=0, *args, **kwargs):
+ @frame_base.args_to_kwargs(pd.Series)
+ @frame_base.populate_defaults(pd.Series)
+ def aggregate(self, func, axis, *args, **kwargs):
+ if kwargs.get('skipna', False):
+ # Eagerly generate a proxy to make sure skipna is a valid argument
+ # for this aggregation method
+ _ = self._expr.proxy().aggregate(func, axis, *args, **kwargs)
+ kwargs.pop('skipna')
+ self.dropna().aggregate(func, axis, *args, **kwargs)
+
if isinstance(func, list) and len(func) > 1:
- # Aggregate each column separately, then stick them all together.
+ # level arg is ignored for multiple aggregations
+ _ = kwargs.pop('level', None)
+
+ # Aggregate with each method separately, then stick them all together.
rows = [self.agg([f], *args, **kwargs) for f in func]
return frame_base.DeferredFrame.wrap(
expressions.ComputedExpression(
'join_aggregate',
lambda *rows: pd.concat(rows), [row._expr for row in rows]))
else:
- # We're only handling a single column.
+ # We're only handling a single column. It could be 'func' or ['func'],
+ # which produce different results. 'func' produces a scalar, ['func']
+ # produces a single element Series.
base_func = func[0] if isinstance(func, list) else func
- if _is_associative(base_func) and not args and not kwargs:
+
+ if (_is_numeric(base_func) and
+ not pd.core.dtypes.common.is_numeric_dtype(self.dtype)):
+ warnings.warn(
+ f"Performing a numeric aggregation, {base_func!r}, on "
+ f"Series {self._expr.proxy().name!r} with non-numeric type "
+ f"{self.dtype!r}. This can result in runtime errors or surprising "
+ "results.")
+
+ if 'level' in kwargs:
+ # Defer to groupby.agg for level= mode
+ return self.groupby(
+ level=kwargs.pop('level'), axis=axis).agg(func, *args, **kwargs)
+
+ requires_singleton = False
+ if 'min_count' in kwargs:
+ # Eagerly generate a proxy to make sure min_count is a valid argument
+ # for this aggregation method
+ _ = self._expr.proxy().agg(func, axis, *args, **kwargs)
Review comment:
When `min_count` isn't a valid argument this just allows the exception
from pandas to bubble up, so the user should see the same thing they'd get from
pandas, and they get the error at construction time.
Your question is a good one though - we're raising the exception from
pandas' `aggregate` method here, but in fact we might have gotten here through
an aggregation method (`df.mean()`, `df.sum()`, etc..). Are the exceptions the
same? Generally yes. I added some more tests in frames_test.py to make sure
(see `test_*_invalid_kwarg_raises`).
Unfortunately in the `DataFrame` case the exceptions are slightly different:
```py
>>> df.median(min_count=3)
TypeError: median() got an unexpected keyword argument 'min_count'
>>> df.agg('median', min_count=4)
TypeError: DataFrame constructor called with incompatible data and dtype:
median() got an unexpected keyword argument 'min_count'
```
Fortunately it looks like this was actually recently fixed
[here](https://github.com/pandas-dev/pandas/pull/40543], so we should start
getting the same exceptions in pandas 1.3.x. I added
`test_df_agg_method_invalid_kwarg_raises` that is just skipped for pandas <1.3.
IMO this is an acceptable divergence for those pandas versions, it should still
be clear to the user what's wrong.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]