fsaintjacques commented on a change in pull request #7180: URL: https://github.com/apache/arrow/pull/7180#discussion_r428676760
########## File path: cpp/src/arrow/dataset/scanner.cc ########## @@ -177,32 +177,38 @@ static inline RecordBatchVector FlattenRecordBatchVector( return flattened; } +struct TableAssemblyState { + /// Protecting mutating accesses to batches + std::mutex mutex{}; + std::vector<RecordBatchVector> batches{}; + + void Emplace(RecordBatchVector b, size_t position) { + std::lock_guard<std::mutex> lock(mutex); + if (batches.size() <= position) { + batches.resize(position + 1); + } + batches[position] = std::move(b); + } +}; + Result<std::shared_ptr<Table>> Scanner::ToTable() { ARROW_ASSIGN_OR_RAISE(auto scan_task_it, Scan()); auto task_group = scan_context_->TaskGroup(); - // Protecting mutating accesses to batches - std::mutex mutex; - std::vector<RecordBatchVector> batches; + /// Wraps the state in a shared_ptr to ensure that a failing ScanTask don't + /// invalidate the concurrent running tasks because Finish() early returns + /// and the mutex/batches may got out of scope. Review comment: credit goes to @jorisvandenbossche :) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org