lidavidm commented on a change in pull request #9620: URL: https://github.com/apache/arrow/pull/9620#discussion_r591953537
##########
File path: cpp/src/parquet/arrow/reader.cc
##########
@@ -967,6 +978,74 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
   return Status::OK();
 }
 
+/// Given a file reader and a list of row groups, this is a generator of record
+/// batch vectors (where each vector is the contents of a single row group).
+class RowGroupGenerator {
+ public:
+  using Item = ::arrow::util::optional<::arrow::RecordBatchVector>;
+
+  explicit RowGroupGenerator(FileReaderImpl* self, std::vector<int> row_groups,
+                             std::vector<int> column_indices)
+      : self_(self),
+        index_(0),
+        row_groups_(std::move(row_groups)),
+        column_indices_(std::move(column_indices)) {}
+
+  ::arrow::Future<Item> operator()() {
+    if (index_ >= row_groups_.size()) {
+      return ::arrow::Future<Item>::MakeFinished(::arrow::util::nullopt);
+    }
+    int row_group = row_groups_[index_++];
+    FileReaderImpl* self = self_;
+    std::vector<int> column_indices = column_indices_;
+    ARROW_ASSIGN_OR_RAISE(auto fut,
+                          ::arrow::internal::GetCpuThreadPool()->Submit(

Review comment:
Yes, this is to allow for parallelism when scanning a Parquet file. Though as I think about it, maybe this isn't necessary? We can do the parallelism at the scan task level already.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org