Github user ueshin commented on a diff in the pull request:
https://github.com/apache/spark/pull/22305#discussion_r231413246
--- Diff: python/pyspark/worker.py ---
@@ -154,6 +154,47 @@ def wrapped(*series):
return lambda *a: (wrapped(*a), arrow_return_type)
+def wrap_bounded_window_agg_pandas_udf(f, return_type):
+    arrow_return_type = to_arrow_type(return_type)
+
+    def wrapped(begin_index, end_index, *series):
+        import numpy as np
+        import pandas as pd
+        result = []
+        for i in range(0, len(begin_index)):
+            begin = begin_index[i]
+            end = end_index[i]
+            range_index = np.arange(begin, end)
+            # Note: Creating a slice from a series is actually pretty
+            # expensive to do for each window. However, there is no way to
+            # reduce/eliminate the cost of creating sub series here AFAIK.
+            # TODO: s.take might be the best way to create sub series
+            series_slices = [s.take(range_index) for s in series]
--- End diff ---
Could we also use `s[begin:end]` here?
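
For reference, a minimal standalone sketch (not the PR's code; the toy
series and window bounds are made up for illustration) showing that, on
the default RangeIndex the worker's series carry after Arrow conversion,
the suggested slice produces the same sub series as the current take():

    import numpy as np
    import pandas as pd

    # Stand-in for one input series arriving in the worker.
    s = pd.Series(np.arange(10.0))

    # One window's bounds, i.e. begin_index[i] / end_index[i].
    begin, end = 2, 7

    # Current approach in the diff: materialize an index array, then take().
    take_slice = s.take(np.arange(begin, end))

    # Suggested alternative: positional slicing. s[begin:end] is positional
    # here because the index is the default RangeIndex; s.iloc[begin:end]
    # is the always-positional spelling.
    getitem_slice = s[begin:end]
    iloc_slice = s.iloc[begin:end]

    assert take_slice.equals(getitem_slice)
    assert take_slice.equals(iloc_slice)

A contiguous slice also avoids allocating the intermediate range_index
array for every window, so it could be cheaper, though that would need
measuring rather than taking on faith.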
---