HyukjinKwon commented on code in PR #38624:
URL: https://github.com/apache/spark/pull/38624#discussion_r1375549650
##########
python/pyspark/sql/pandas/group_ops.py:
##########
@@ -30,13 +30,15 @@
PandasGroupedMapFunction,
PandasGroupedMapFunctionWithState,
PandasCogroupedMapFunction,
+ ArrowGroupedMapFunction,
+ ArrowCogroupedMapFunction,
)
from pyspark.sql.group import GroupedData
class PandasGroupedOpsMixin:
"""
- Min-in for pandas grouped operations. Currently, only :class:`GroupedData`
+ Min-in for Pandas grouped operations. Currently, only :class:`GroupedData`
Review Comment:
Let's keep the lowercase spelling `pandas` here, and in the occurrences below too.
##########
python/pyspark/sql/pandas/group_ops.py:
##########
@@ -354,6 +356,132 @@ def applyInPandasWithState(
)
return DataFrame(jdf, self.session)
+ def applyInArrow(
+ self, func: "ArrowGroupedMapFunction", schema: Union[StructType, str]
+ ) -> "DataFrame":
+ """
+ Maps each group of the current :class:`DataFrame` using an Arrow udf and returns the result
+ as a `DataFrame`.
+
+ The function should take a `pyarrow.Table` and return another
+ `pyarrow.Table`. Alternatively, the user can pass a function that takes
+ a tuple of `pyarrow.Scalar` grouping key(s) and a `pyarrow.Table`.
+ For each group, all columns are passed together as a `pyarrow.Table`
+ to the user-function and the returned `pyarrow.Table` are combined as a
+ :class:`DataFrame`.
+
+ The `schema` should be a :class:`StructType` describing the schema of the returned
+ `pyarrow.Table`. The column labels of the returned `pyarrow.Table` must either match
+ the field names in the defined schema if specified as strings, or match the
+ field data types by position if not strings, e.g. integer indices.
+ The length of the returned `pyarrow.Table` can be arbitrary.
+
+ .. versionadded:: 3.4.0
Review Comment:
This should be `4.0.0`, not `3.4.0`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]