baibaichen commented on code in PR #39393:
URL: https://github.com/apache/arrow/pull/39393#discussion_r1451381845
##########
cpp/src/parquet/file_reader.cc:
##########
@@ -219,21 +233,35 @@ class SerializedRowGroup : public
RowGroupReader::Contents {
const ReaderProperties* properties() const override { return &properties_; }
- std::unique_ptr<PageReader> GetColumnPageReader(int i) override {
+ std::unique_ptr<PageReader> GetColumnPageReader(
+ int i, const std::optional<RowRanges>& row_ranges,
+ const std::shared_ptr<RowGroupPageIndexReader>& index_reader) override {
// Read column chunk from the file
auto col = row_group_metadata_->ColumnChunk(i);
::arrow::io::ReadRange col_range =
ComputeColumnChunkRange(file_metadata_, source_size_,
row_group_ordinal_, i);
std::shared_ptr<ArrowInputStream> stream;
+ std::shared_ptr<PageReadStates> read_states;
+ if (row_ranges.has_value()) {
+ const auto num_rows =
file_metadata_->RowGroup(row_group_ordinal_)->num_rows();
+ ARROW_DCHECK(index_reader != nullptr);
+ auto page_locations = index_reader->GetOffsetIndex(i)->page_locations();
+ PARQUET_ASSIGN_OR_THROW(
+ read_states,
+ BuildPageReadStates(num_rows, col_range, page_locations,
row_ranges.value()));
+ }
+
if (cached_source_ && prebuffered_column_chunks_bitmap_ != nullptr &&
::arrow::bit_util::GetBit(prebuffered_column_chunks_bitmap_->data(),
i)) {
Review Comment:
Why do we still have to read all the data here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org