rtpsw commented on code in PR #14682:
URL: https://github.com/apache/arrow/pull/14682#discussion_r1029159284
##########
python/pyarrow/tests/test_udf.py:
##########
@@ -504,3 +504,49 @@ def test_input_lifetime(unary_func_fixture):
# Calling a UDF should not have kept `v` alive longer than required
v = None
assert proxy_pool.bytes_allocated() == 0
+
+
+def _record_batch_from_iters(schema, *iters):
+    arrays = [pa.array(list(v), type=schema[i].type)
+              for i, v in enumerate(iters)]
+    return pa.RecordBatch.from_arrays(arrays=arrays, schema=schema)
+
+
+def _record_batch_for_range(schema, n):
+    return _record_batch_from_iters(schema,
+                                    range(n, n + 10),
+                                    range(n + 1, n + 11))
+
+
+def datasource1(ctx):
+    """A short dataset"""
+    import pyarrow as pa
+    schema = pa.schema([('', pa.int32()), ('', pa.int32())])
+
+    class Generator:
+        def __init__(self):
+            self.n = 3
+
+        def __call__(self, ctx):
+            if self.n == 0:
+                batch = _record_batch_from_iters(schema, [], [])
+            else:
+                self.n -= 1
+                batch = _record_batch_for_range(schema, self.n)
+            return pc.udf_result_from_record_batch(batch)
+    return Generator()
Review Comment:
A solution with `yield` has crossed my mind, but I haven't tried it. I agree
this is something to look into, aiming for a nicer syntax for end-users.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]