pitrou commented on a change in pull request #11763:
URL: https://github.com/apache/arrow/pull/11763#discussion_r776322506
##########
File path: cpp/src/arrow/dataset/file_orc.cc
##########
@@ -85,24 +85,20 @@ class OrcScanTask : public ScanTask {
included_fields.push_back(name);
}
+ std::shared_ptr<RecordBatchReader> recordBatchReader;
+ reader->NextBatchReader(scan_options.batch_size, included_fields,
&recordBatchReader);
Review comment:
Can you check the returned status here?
##########
File path: cpp/src/arrow/adapters/orc/adapter.cc
##########
@@ -439,8 +439,28 @@ class ORCFileReader::Impl {
return Status::OK();
}
+ Status NextBatchReader(int64_t batch_size, const std::vector<std::string>&
include_names,
+ std::shared_ptr<RecordBatchReader>* out) {
Review comment:
Can you refactor this so as to share most code with the other method
above?
##########
File path: cpp/src/arrow/adapters/orc/adapter.h
##########
@@ -231,6 +231,19 @@ class ARROW_EXPORT ORCFileReader {
Status NextStripeReader(int64_t batch_size, const std::vector<int>&
include_indices,
std::shared_ptr<RecordBatchReader>* out);
+ /// \brief Get a stripe level record batch iterator with specified row count
+ /// in each record batch. NextStripeReader serves as a fine grain
+ /// alternative to ReadStripe which may cause OOM issue by loading
+ /// the whole stripes into memory.
+ ///
+ /// \param[in] batch_size Get a stripe level record batch iterator with
specified row
+ /// count in each record batch.
+ ///
+ /// \param[in] include_names the selected field names to read
+ /// \param[out] out the returned stripe reader
+ Status NextBatchReader(int64_t batch_size, const std::vector<std::string>&
include_names,
Review comment:
1) Please return `Result<RecordBatchReader>`
2) Why is this called `NextBatchReader`, not `NextStripeReader` as the
others?
##########
File path: cpp/src/arrow/dataset/file_orc.cc
##########
@@ -85,24 +85,20 @@ class OrcScanTask : public ScanTask {
included_fields.push_back(name);
}
+ std::shared_ptr<RecordBatchReader> recordBatchReader;
Review comment:
We use `lower_case` for local variables.
##########
File path: cpp/src/arrow/dataset/file_orc.cc
##########
@@ -85,24 +85,20 @@ class OrcScanTask : public ScanTask {
included_fields.push_back(name);
}
+ std::shared_ptr<RecordBatchReader> recordBatchReader;
+ reader->NextBatchReader(scan_options.batch_size, included_fields,
&recordBatchReader);
+
return RecordBatchIterator(
- Impl{std::move(reader), 0, num_stripes, included_fields});
+ Impl{std::move(recordBatchReader)});
}
Result<std::shared_ptr<RecordBatch>> Next() {
- if (i_ == num_stripes_) {
- return nullptr;
- }
std::shared_ptr<RecordBatch> batch;
- // TODO (https://issues.apache.org/jira/browse/ARROW-14153)
- // pass scan_options_->batch_size
- return reader_->ReadStripe(i_++, included_fields_);
+ RETURN_NOT_OK(recordBatchReader_->ReadNext(&batch));
+ return batch;
}
- std::unique_ptr<arrow::adapters::orc::ORCFileReader> reader_;
- int i_;
- int num_stripes_;
- std::vector<std::string> included_fields_;
+ std::shared_ptr<RecordBatchReader> recordBatchReader_;
Review comment:
Lowercase here too.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]