Hor911 commented on issue #34460:
URL: https://github.com/apache/arrow/issues/34460#issuecomment-1455218992
Implementation of the two new methods, plus a small change to ReadRowGroups
to remove the duplicated code. Tested with large and complex Parquet files,
and it works very well.
```
// Announces that the given row groups / columns are about to be read.
//
// Validates the requested indices with BoundsCheck() and, when pre-buffering
// is enabled in the reader properties, asks the underlying Parquet reader to
// pre-buffer them. Nothing is decoded by this call.
//
// row_groups:     indices of the row groups that will be read.
// column_indices: indices of the columns that will be read.
//
// Returns the BoundsCheck() error if validation fails, Status::OK() otherwise.
// NOTE(review): exceptions raised by PreBuffer() are presumably turned into a
// Status by the PARQUET_CATCH_EXCEPTIONS macro pair -- confirm against the
// macro definitions.
Status FileReaderImpl::WillNeedRowGroups(const std::vector<int>& row_groups,
                                         const std::vector<int>& column_indices) {
  RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));

  // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
  if (reader_properties_.pre_buffer()) {
    BEGIN_PARQUET_CATCH_EXCEPTIONS
    parquet_reader()->PreBuffer(row_groups, column_indices,
                                reader_properties_.io_context(),
                                reader_properties_.cache_options());
    END_PARQUET_CATCH_EXCEPTIONS
  }
  return Status::OK();
}
// Synchronously decodes the requested row groups / columns into a table.
//
// The indices are validated with BoundsCheck() first. Decoding is delegated
// to the future-returning DecodeRowGroups() overload, called with a null
// `self` and a null CPU executor; its result is then moved into `*out`.
// Unlike ReadRowGroups(), this function does not itself trigger pre-buffering.
//
// row_groups:     indices of the row groups to decode.
// column_indices: indices of the columns to decode.
// out:            receives the decoded table on success.
//
// Returns the first error encountered, or Status::OK().
Status FileReaderImpl::DecodeRowGroups(const std::vector<int>& row_groups,
                                       const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::Table>* out) {
  RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));

  auto table_future =
      DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
                      /*cpu_executor=*/nullptr);

  // Propagates the decode error, if any; otherwise stores the table.
  ARROW_ASSIGN_OR_RAISE(*out, table_future.MoveResult());
  return Status::OK();
}
// Reads the requested row groups / columns into a table.
//
// Two steps:
//   1. WillNeedRowGroups() is called with the same indices; any error it
//      reports is returned immediately.
//   2. The future-returning DecodeRowGroups() overload is invoked with a null
//      `self` and a null CPU executor, and its result is moved into `*out`.
//
// row_groups:     indices of the row groups to read.
// column_indices: indices of the columns to read.
// out:            receives the resulting table on success.
//
// Returns the first error encountered, or Status::OK().
Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
                                     const std::vector<int>& column_indices,
                                     std::shared_ptr<Table>* out) {
  RETURN_NOT_OK(WillNeedRowGroups(row_groups, column_indices));

  auto table_future =
      DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
                      /*cpu_executor=*/nullptr);

  // Propagates the decode error, if any; otherwise stores the table.
  ARROW_ASSIGN_OR_RAISE(*out, table_future.MoveResult());
  return Status::OK();
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]