leewyang commented on code in PR #37734:
URL: https://github.com/apache/spark/pull/37734#discussion_r1061925652


##########
python/pyspark/ml/functions.py:
##########
@@ -106,6 +117,474 @@ def array_to_vector(col: Column) -> Column:
     return 
Column(sc._jvm.org.apache.spark.ml.functions.array_to_vector(_to_java_column(col)))
 
 
+def _batched(
+    data: pd.Series | pd.DataFrame | Tuple[pd.Series], batch_size: int
+) -> Iterator[pd.DataFrame]:
+    """Generator that splits a pandas dataframe/series into batches."""
+    if isinstance(data, pd.DataFrame):
+        for _, batch in data.groupby(np.arange(len(data)) // batch_size):
+            yield batch
+    else:
+        # convert (tuple of) pd.Series into pd.DataFrame
+        if isinstance(data, pd.Series):
+            df = pd.concat((data,), axis=1)
+        else:  # isinstance(data, Tuple[pd.Series]):
+            df = pd.concat(data, axis=1)
+        for _, batch in df.groupby(np.arange(len(df)) // batch_size):
+            yield batch
+
+
+def _has_tensor_cols(data: pd.Series | pd.DataFrame | Tuple[pd.Series]) -> 
bool:
+    """Check if input DataFrame contains any tensor-valued columns"""
+    if isinstance(data, pd.Series):
+        return data.dtype == np.object_ and isinstance(data.iloc[0], 
(np.ndarray, list))
+    elif isinstance(data, pd.DataFrame):
+        return any(data.dtypes == np.object_) and any(
+            [isinstance(d, (np.ndarray, list)) for d in data.iloc[0]]
+        )
+    else:  # isinstance(data, Tuple):
+        return any([d.dtype == np.object_ for d in data]) and any(
+            [isinstance(d.iloc[0], (np.ndarray, list)) for d in data]
+        )
+
+
+def predict_batch_udf(
+    predict_batch_fn: Callable[
+        [],
+        Callable[
+            [np.ndarray | List[np.ndarray]],

Review Comment:
   Added a new `PredictFunction` type to account for varargs of ndarray.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to