westonpace commented on a change in pull request #9095:
URL: https://github.com/apache/arrow/pull/9095#discussion_r565967203
##########
File path: cpp/src/arrow/util/iterator_test.cc
##########
@@ -214,6 +255,270 @@ TEST(TestVectorIterator, RangeForLoop) {
ASSERT_EQ(ints_it, ints.end());
}
+template <typename T>
+Transformer<T, T> MakeFirstN(int n) {
+ int remaining = n;
+ return [remaining](T next) mutable -> Result<TransformFlow<T>> {
+ if (remaining > 0) {
+ remaining--;
+ return TransformYield(next);
+ }
+ return TransformFinish();
+ };
+}
+
+TEST(TestIteratorTransform, Truncating) {
+ auto original = VectorIt({1, 2, 3});
+ auto truncated = MakeTransformedIterator(std::move(original),
MakeFirstN<TestInt>(2));
+ AssertIteratorMatch({1, 2}, std::move(truncated));
+}
+
+TEST(TestIteratorTransform, TestPointer) {
+ auto original = VectorIt<std::shared_ptr<int>>(
+ {std::make_shared<int>(1), std::make_shared<int>(2),
std::make_shared<int>(3)});
+ auto truncated =
+ MakeTransformedIterator(std::move(original),
MakeFirstN<std::shared_ptr<int>>(2));
+ ASSERT_OK_AND_ASSIGN(auto result, truncated.ToVector());
+ ASSERT_EQ(2, result.size());
+}
+
+TEST(TestIteratorTransform, TruncatingShort) {
+ // Tests the failsafe case where we never call Finish
+ auto original = VectorIt({1});
+ auto truncated = MakeTransformedIterator<TestInt,
TestInt>(std::move(original),
+
MakeFirstN<TestInt>(2));
+ AssertIteratorMatch({1}, std::move(truncated));
+}
+
+TEST(TestAsyncUtil, Background) {
+ std::vector<TestInt> expected = {1, 2, 3};
+ auto background = BackgroundAsyncVectorIt(expected);
+ auto future = CollectAsyncGenerator(background);
+ ASSERT_FALSE(future.is_finished());
+ future.Wait();
+ ASSERT_TRUE(future.is_finished());
+ ASSERT_EQ(expected, *future.result());
+}
+
+struct SlowEmptyIterator {
+ Result<TestInt> Next() {
+ if (called_) {
+ return Status::Invalid("Should not have been called twice");
+ }
+ SleepFor(0.1);
+ return IterationTraits<TestInt>::End();
+ }
+
+ private:
+ bool called_ = false;
+};
+
+TEST(TestAsyncUtil, BackgroundRepeatEnd) {
+ // Ensure that the background iterator properly fulfills the asyncgenerator
contract
+ // and can be called after it ends.
+ auto iterator = Iterator<TestInt>(SlowEmptyIterator());
+ ASSERT_OK_AND_ASSIGN(
+ auto background_iter,
+ MakeBackgroundIterator(std::move(iterator),
internal::GetCpuThreadPool()));
+
+ auto one = background_iter();
+ auto two = background_iter();
+
+ ASSERT_TRUE(one.Wait(0.5));
+
+ if (one.is_finished()) {
+ ASSERT_EQ(IterationTraits<TestInt>::End(), *one.result());
+ }
+
+ ASSERT_TRUE(two.Wait(0.5));
+ ASSERT_TRUE(two.is_finished());
+ if (two.is_finished()) {
+ ASSERT_EQ(IterationTraits<TestInt>::End(), *two.result());
+ }
+}
+
+TEST(TestAsyncUtil, SynchronousFinish) {
+ AsyncGenerator<TestInt> generator = []() {
+ return Future<TestInt>::MakeFinished(IterationTraits<TestInt>::End());
+ };
+ Transformer<TestInt, TestInt> skip_all = [](TestInt value) { return
TransformSkip(); };
+ auto transformed = TransformAsyncGenerator(generator, skip_all);
+ auto future = CollectAsyncGenerator(transformed);
+ ASSERT_TRUE(future.is_finished());
+ ASSERT_OK_AND_ASSIGN(auto actual, future.result());
+ ASSERT_EQ(std::vector<TestInt>(), actual);
+}
+
+TEST(TestAsyncUtil, CompleteBackgroundStressTest) {
+ auto expected = RangeVector(100);
+ std::vector<Future<std::vector<TestInt>>> futures;
+ for (unsigned int i = 0; i < 100; i++) {
+ auto background = BackgroundAsyncVectorIt(expected);
+ futures.push_back(CollectAsyncGenerator(background));
+ }
+ auto combined = All(futures);
+ combined.Wait(2);
+ if (combined.is_finished()) {
+ ASSERT_OK_AND_ASSIGN(auto completed_vectors, combined.result());
+ for (auto&& vector : completed_vectors) {
+ ASSERT_EQ(vector, expected);
+ }
+ } else {
+ FAIL() << "After 2 seconds all background iterators had not finished
collecting";
+ }
+}
+
+TEST(TestAsyncUtil, StackOverflow) {
+ int counter = 0;
+ AsyncGenerator<TestInt> generator = [&counter]() {
+ if (counter < 1000000) {
+ return Future<TestInt>::MakeFinished(counter++);
+ } else {
+ return Future<TestInt>::MakeFinished(IterationTraits<TestInt>::End());
+ }
+ };
+ Transformer<TestInt, TestInt> discard =
+ [](TestInt next) -> Result<TransformFlow<TestInt>> { return
TransformSkip(); };
+ auto transformed = TransformAsyncGenerator(generator, discard);
+ auto collected_future = CollectAsyncGenerator(transformed);
+ ASSERT_TRUE(collected_future.Wait(5));
+ if (collected_future.is_finished()) {
+ ASSERT_OK_AND_ASSIGN(auto collected, collected_future.result());
+ ASSERT_EQ(0, collected.size());
+ }
+}
+
+TEST(TestAsyncUtil, Visit) {
+ auto generator = AsyncVectorIt({1, 2, 3});
+ unsigned int sum = 0;
+ auto sum_future = VisitAsyncGenerator<TestInt>(generator, [&sum](TestInt
item) {
+ sum += item.value;
+ return Status::OK();
+ });
+ // Should be superfluous
+ sum_future.Wait();
+ ASSERT_EQ(6, sum);
+}
+
+TEST(TestAsyncUtil, Collect) {
+ std::vector<TestInt> expected = {1, 2, 3};
+ auto generator = AsyncVectorIt(expected);
+ auto collected = CollectAsyncGenerator(generator);
+ ASSERT_EQ(expected, *collected.result());
+}
+
+template <typename T>
+Transformer<T, T> MakeRepeatN(int repeat_count) {
+ int current_repeat = 0;
+ return [repeat_count, current_repeat](T next) mutable ->
Result<TransformFlow<T>> {
+ current_repeat++;
+ bool ready_for_next = false;
+ if (current_repeat == repeat_count) {
+ current_repeat = 0;
+ ready_for_next = true;
+ }
+ return TransformYield(next, ready_for_next);
+ };
+}
+
+TEST(TestIteratorTransform, Repeating) {
+ auto original = VectorIt({1, 2, 3});
+ auto repeated = MakeTransformedIterator<TestInt,
TestInt>(std::move(original),
+
MakeRepeatN<TestInt>(2));
+ AssertIteratorMatch({1, 1, 2, 2, 3, 3}, std::move(repeated));
+}
+
+template <typename T>
+Transformer<T, T> MakeFilter(std::function<bool(T&)> filter) {
+ return [filter](T next) -> Result<TransformFlow<T>> {
+ if (filter(next)) {
+ return TransformYield(next);
+ } else {
+ return TransformSkip();
+ }
+ };
+}
+
+template <typename T>
+Transformer<T, T> MakeAbortOnSecond() {
+ int counter = 0;
+ return [counter](T next) mutable -> Result<TransformFlow<T>> {
+ if (counter++ == 1) {
+ return Status::Invalid("X");
+ }
+ return TransformYield(next);
+ };
+}
+
+TEST(TestIteratorTransform, SkipSome) {
+ // Exercises TransformSkip
+ auto original = VectorIt({1, 2, 3});
+ auto filter = MakeFilter<TestInt>([](TestInt& t) { return t.value != 2; });
+ auto filtered = MakeTransformedIterator(std::move(original), filter);
+ AssertIteratorMatch({1, 3}, std::move(filtered));
+}
+
+TEST(TestIteratorTransform, SkipAll) {
+ // Exercises TransformSkip
+ auto original = VectorIt({1, 2, 3});
+ auto filter = MakeFilter<TestInt>([](TestInt& t) { return false; });
+ auto filtered = MakeTransformedIterator(std::move(original), filter);
+ AssertIteratorMatch({}, std::move(filtered));
+}
+
+TEST(TestIteratorTransform, Abort) {
+ auto original = VectorIt({1, 2, 3});
+ auto transformed =
+ MakeTransformedIterator(std::move(original),
MakeAbortOnSecond<TestInt>());
+ ASSERT_OK(transformed.Next());
+ ASSERT_RAISES(Invalid, transformed.Next());
+}
+
+TEST(TestAsyncIteratorTransform, SkipSome) {
+ auto original = AsyncVectorIt({1, 2, 3});
+ auto filter = MakeFilter<TestInt>([](TestInt& t) { return t.value != 2; });
+ auto filtered = TransformAsyncGenerator(std::move(original), filter);
+ AssertAsyncGeneratorMatch({1, 3}, std::move(filtered));
+}
+
+TEST(TestAsyncUtil, ReadaheadFailed) {
+ auto source = []() -> Future<TestInt> {
+ return Future<TestInt>::MakeFinished(Status::Invalid("X"));
+ };
+ auto readahead = AddReadahead<TestInt>(source, 10);
+ auto next = readahead();
+ ASSERT_EQ(Status::Invalid("X"), next.status());
+}
+
+TEST(TestAsyncUtil, Readahead) {
+ int num_delivered = 0;
+ auto source = [&num_delivered]() {
+ if (num_delivered < 5) {
+ return Future<TestInt>::MakeFinished(num_delivered++);
+ } else {
+ return Future<TestInt>::MakeFinished(IterationTraits<TestInt>::End());
+ }
+ };
+ auto readahead = AddReadahead<TestInt>(source, 10);
+ // Should not pump until first item requested
Review comment:
I'm not sure that it is less useful for an async reader. It isn't a
matter of blocking either. The threaded table reader could have the same
behavior. Also, the serial table reader was already different from the
threaded table reader in this regard (the async reader mimics the serial
reader)...
```
ARROW_ASSIGN_OR_RAISE(auto csv_reader, arrow::csv::TableReader::Make(...));
// Should the read start here?
ARROW_ASSIGN_OR_RAISE(auto table, csv_reader->Read());
// Or here?
```
As a user, I really wouldn't expect the file to start being read until I
called `Read()`. I agree that it shouldn't have much impact, though, and I'd be
content to change it.
##########
File path: cpp/src/arrow/util/task_group.h
##########
@@ -63,6 +63,20 @@ class ARROW_EXPORT TaskGroup : public
std::enable_shared_from_this<TaskGroup> {
/// task (or subgroup).
virtual Status Finish() = 0;
+ /// Returns a future that will complete the first time all tasks are
finished.
+ /// This should be called only after all top level tasks
+ /// have been added to the task group.
+ ///
+ /// If you are using a TaskGroup asyncrhonously there are a few
considerations to keep
Review comment:
Fixed
##########
File path: cpp/src/arrow/util/task_group.cc
##########
@@ -135,6 +149,18 @@ class ThreadedTaskGroup : public TaskGroup {
// before cv.notify_one() has returned
std::unique_lock<std::mutex> lock(mutex_);
cv_.notify_one();
+ if (completion_future_.has_value()) {
+ // MarkFinished could be slow. We don't want to call it while we are
holding
+ // the lock.
+ // TODO: If optional is thread safe then we can skip this locking
entirely
Review comment:
Dropped the TODO
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]