lidavidm commented on a change in pull request #12662:
URL: https://github.com/apache/arrow/pull/12662#discussion_r830647715



##########
File path: cpp/src/arrow/util/async_generator.h
##########
@@ -1058,18 +1109,73 @@ class MergedGenerator {
       return source();
     }
 
+    void SignalErrorUnlocked() {
+      broken = true;
+      // Empty any results that have arrived but not asked for.
+      while (!delivered_jobs.empty()) {
+        delivered_jobs.pop_front();
+      }
+    }
+
+    void Purge() {
+      while (!waiting_jobs.empty()) {
+        waiting_jobs.front()->MarkFinished(IterationEnd<T>());
+        waiting_jobs.pop_front();
+      }
+    }
+
+    void MarkFinished() {

Review comment:
       It seems that sometimes we call `MarkFinished` directly and other times
we go through this function. Maybe this one should be renamed
`MarkFinishedAndPurge` (or similar) to make it clear why they differ?

##########
File path: cpp/src/arrow/util/async_generator.h
##########
@@ -1058,18 +1109,73 @@ class MergedGenerator {
       return source();
     }
 
+    void SignalErrorUnlocked() {
+      broken = true;
+      // Empty any results that have arrived but not asked for.
+      while (!delivered_jobs.empty()) {
+        delivered_jobs.pop_front();
+      }
+    }
+
+    void Purge() {
+      while (!waiting_jobs.empty()) {
+        waiting_jobs.front()->MarkFinished(IterationEnd<T>());
+        waiting_jobs.pop_front();
+      }
+    }
+
+    void MarkFinished() {
+      all_finished.MarkFinished();
+      Purge();
+    }
+
+    // This is called outside the mutex but it is only ever called
+    // once and Future<>::AddCallback is thread-safe
+    void MarkFinalError(const Status& err, Future<T> maybe_sink) {
+      if (maybe_sink.is_valid()) {
+        // Someone is waiting for this error so lets mark it complete when
+        // all the work is done
+        // all_finished will get called by something with a strong pointer to state
+        // so we can safely capture this
+        all_finished.AddCallback([maybe_sink, err](const Status& status) mutable {
+          maybe_sink.MarkFinished(err);
+        });
+      } else {
+        // No one is waiting for this error right now so it will be delivered
+        // next.
+        final_error = err;
+      }
+    }
+
+    bool IsComplete() {
+      return outstanding_requests == 0 &&
+             (broken || (source_exhausted && num_running_subscriptions == 0 &&
+                         delivered_jobs.empty()));
+    }
+
+    bool MarkTaskFinishedUnlocked() {
+      --outstanding_requests;
+      return IsComplete();
+    }
+
     AsyncGenerator<AsyncGenerator<T>> source;
     // active_subscriptions and delivered_jobs will be bounded by max_subscriptions
     std::vector<AsyncGenerator<T>> active_subscriptions;
     std::deque<std::shared_ptr<DeliveredJob>> delivered_jobs;
     // waiting_jobs is unbounded, reentrant pulls (e.g. AddReadahead) will provide the
     // backpressure
     std::deque<std::shared_ptr<Future<T>>> waiting_jobs;
+    // A future that will be marked complete when the terminal item has arrived and all
+    // outstanding futures have completed.  It is used to hold off emission of an error
+    // until all outstanding work is done.
+    Future<> all_finished = Future<>::Make();
     util::Mutex mutex;
     bool first;
+    bool broken;
     bool source_exhausted;

Review comment:
       But with multiple boolean flags, it is already hard to reason about the
behavior in different situations.

##########
File path: cpp/src/arrow/util/async_generator.h
##########
@@ -1058,18 +1109,73 @@ class MergedGenerator {
       return source();
     }
 
+    void SignalErrorUnlocked() {
+      broken = true;
+      // Empty any results that have arrived but not asked for.
+      while (!delivered_jobs.empty()) {
+        delivered_jobs.pop_front();
+      }
+    }
+
+    void Purge() {

Review comment:
       nit: it appears that `Purge()` can be inlined into `MarkFinished()`.

##########
File path: cpp/src/arrow/util/async_generator.h
##########
@@ -1058,18 +1109,73 @@ class MergedGenerator {
       return source();
     }
 
+    void SignalErrorUnlocked() {

Review comment:
       nit: it seems inconsistent which methods are explicitly tagged
`Unlocked`. It looks like all of them except `PullSource` and `MarkFinalError`
assume the mutex is currently held (maybe taking a `const GuardType&` would make
this precondition more explicit).

##########
File path: cpp/src/arrow/util/async_generator.h
##########
@@ -1058,18 +1109,73 @@ class MergedGenerator {
       return source();
     }
 
+    void SignalErrorUnlocked() {
+      broken = true;
+      // Empty any results that have arrived but not asked for.
+      while (!delivered_jobs.empty()) {
+        delivered_jobs.pop_front();
+      }
+    }
+
+    void Purge() {
+      while (!waiting_jobs.empty()) {
+        waiting_jobs.front()->MarkFinished(IterationEnd<T>());
+        waiting_jobs.pop_front();
+      }
+    }
+
+    void MarkFinished() {
+      all_finished.MarkFinished();
+      Purge();
+    }
+
+    // This is called outside the mutex but it is only ever called
+    // once and Future<>::AddCallback is thread-safe
+    void MarkFinalError(const Status& err, Future<T> maybe_sink) {
+      if (maybe_sink.is_valid()) {
+        // Someone is waiting for this error so lets mark it complete when
+        // all the work is done
+        // all_finished will get called by something with a strong pointer to state
+        // so we can safely capture this
+        all_finished.AddCallback([maybe_sink, err](const Status& status) mutable {
+          maybe_sink.MarkFinished(err);
+        });
+      } else {
+        // No one is waiting for this error right now so it will be delivered
+        // next.
+        final_error = err;
+      }
+    }
+
+    bool IsComplete() {
+      return outstanding_requests == 0 &&
+             (broken || (source_exhausted && num_running_subscriptions == 0 &&
+                         delivered_jobs.empty()));
+    }
+
+    bool MarkTaskFinishedUnlocked() {
+      --outstanding_requests;
+      return IsComplete();
+    }
+
     AsyncGenerator<AsyncGenerator<T>> source;
     // active_subscriptions and delivered_jobs will be bounded by max_subscriptions
     std::vector<AsyncGenerator<T>> active_subscriptions;
     std::deque<std::shared_ptr<DeliveredJob>> delivered_jobs;
     // waiting_jobs is unbounded, reentrant pulls (e.g. AddReadahead) will provide the
     // backpressure
     std::deque<std::shared_ptr<Future<T>>> waiting_jobs;
+    // A future that will be marked complete when the terminal item has arrived and all
+    // outstanding futures have completed.  It is used to hold off emission of an error
+    // until all outstanding work is done.
+    Future<> all_finished = Future<>::Make();
     util::Mutex mutex;
     bool first;
+    bool broken;
     bool source_exhausted;

Review comment:
       It feels like there's a state machine we're moving through, but the
number of possible states also seems quite large, so I'm not sure that modeling
it explicitly would actually make things clearer.

##########
File path: cpp/src/arrow/util/async_generator.h
##########
@@ -1120,43 +1272,44 @@ class MergedGenerator {
 
   struct OuterCallback {
     void operator()(const Result<AsyncGenerator<T>>& maybe_next) {
-      bool should_purge = false;
       bool should_continue = false;
+      bool should_mark_gen_complete = false;

Review comment:
       Is this actually something like `all_requests_fulfilled`?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to