corwinjoy commented on code in PR #39677:
URL: https://github.com/apache/arrow/pull/39677#discussion_r1456619598
##########
cpp/src/parquet/page_index.cc:
##########
@@ -388,6 +436,56 @@ class PageIndexReaderImpl : public PageIndexReader {
return nullptr;
}
+  /// Read the full set of OffsetIndex pages for all row groups.
+  /// The key feature is that this does not require the full file metadata;
+  /// only the metadata for row group 0 is needed.
+  std::vector<ColumnOffsets> GetAllOffsets() override {
+    std::shared_ptr<RowGroupMetaData> row_group_metadata = file_metadata_->RowGroup(0);
+    // This row group length is only an estimate; it may vary by row group.
+    int32_t rowgroup_len = 0;
+    int64_t offset_index_start = -1;
+    int64_t total_rows = file_metadata_->num_rows();
+    int64_t chunk_rows = row_group_metadata->num_rows();
+    // Don't use the row group count from the metadata, since the metadata may be
+    // a dummy that contains only row group 0.
+    int num_row_groups = static_cast<int>(
+        std::ceil(static_cast<double>(total_rows) / static_cast<double>(chunk_rows)));
+    int num_columns = file_metadata_->num_columns();
+    // TODO: add methods to get offset_index_start and rowgroup_len directly,
+    // because ColumnChunk creation is very expensive.
+    auto col_chunk = row_group_metadata->ColumnChunk(0);
+    auto offset_index_location = col_chunk->GetOffsetIndexLocation();
+    offset_index_start = offset_index_location->offset;
+    rowgroup_len = offset_index_location->length * num_columns;
+
+    // Retrieve 1.5x the estimated size to allow for variation in how pages are
+    // stored. This is only a guess, but over-reading is safe here because the
+    // footer metadata comes after the offsets, so we can retrieve a slightly
+    // larger buffer.
+    float overhead_factor = 1.5;
+    int32_t est_offset_index_size =
+        static_cast<int32_t>(num_row_groups * rowgroup_len * overhead_factor);
+    std::shared_ptr<::arrow::Buffer> offset_index_buffer;
+    PARQUET_ASSIGN_OR_THROW(offset_index_buffer,
+                            input_->ReadAt(offset_index_start, est_offset_index_size));
+
+    // Deserialize directly from the buffer for performance.
+    ThriftDeserializer deserializer(properties_);
+    uint32_t len_used = est_offset_index_size;
+    deserializer.SetInternalBuffer(const_cast<uint8_t*>(offset_index_buffer->data()),
+                                   &len_used);
+
+    std::vector<ColumnOffsets> rowgroup_offsets;
+    rowgroup_offsets.reserve(num_row_groups);
+    format::OffsetIndex offset_index;
+    for (int rg = 0; rg < num_row_groups; ++rg) {
+      ColumnOffsets offset_indexes;
+      offset_indexes.reserve(num_columns);
+      for (int col = 0; col < num_columns; ++col) {
+        deserializer.DeserializeUnencryptedMessageUsingInternalBuffer(&offset_index);
+        auto offset_index_ptr = std::make_shared<OffsetIndexImpl>(offset_index);
+        offset_indexes.emplace_back(std::move(offset_index_ptr));
+      }
+      rowgroup_offsets.emplace_back(std::move(offset_indexes));
+    }
+    return rowgroup_offsets;
+  }
+
Review Comment:
What I would love to do is read only the OffsetIndex entries that we need,
i.e. just the row groups and column chunks required for a random-access read.
Sadly, I don't think that is possible, because the OffsetIndex entries are
written in the Thrift Compact protocol, which means the encoded page addresses
may be of variable size. (ARGH!!) Since entry N then has no fixed byte
position, we can't seek to it without decoding everything before it. I think
it would make a lot of sense to propose a Parquet format enhancement where the
index entries are written using the Binary protocol with a fixed size. This
would allow true random access and make data access A LOT faster.
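
To make the variable-size problem concrete, here is a minimal standalone
sketch (my own illustration, not Arrow or Thrift library code;
`CompactEncodeI64`, `kEntrySize`, and `index_base` are hypothetical) of the
zigzag + varint integer encoding that the Compact protocol uses for i64
fields. Because small and large page offsets encode to different byte counts,
the byte position of OffsetIndex entry N depends on every value before it,
whereas a fixed-size layout would make that position directly computable:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Zigzag-encode a signed value, then emit it as a varint (ULEB128) -- the
// integer encoding the Thrift Compact protocol uses for i64 fields.
std::vector<uint8_t> CompactEncodeI64(int64_t value) {
  uint64_t zigzag =
      (static_cast<uint64_t>(value) << 1) ^ static_cast<uint64_t>(value >> 63);
  std::vector<uint8_t> out;
  do {
    uint8_t byte = zigzag & 0x7F;
    zigzag >>= 7;
    if (zigzag != 0) byte |= 0x80;  // set continuation bit
    out.push_back(byte);
  } while (zigzag != 0);
  return out;
}

int main() {
  // A page offset near the start of the file vs. one past 4 GiB:
  std::cout << "offset 100        -> " << CompactEncodeI64(100).size()
            << " bytes\n";  // 2 bytes
  std::cout << "offset 4294967296 -> " << CompactEncodeI64(4294967296).size()
            << " bytes\n";  // 5 bytes
  // With a fixed-size (Binary-protocol-style) layout, entry N would sit at a
  // computable position instead:
  constexpr int64_t kEntrySize = 20;  // hypothetical fixed bytes per entry
  int64_t index_base = 0;             // hypothetical start of the offset index
  int64_t n = 1234;
  std::cout << "fixed-size entry " << n << " at byte "
            << index_base + n * kEntrySize << "\n";
  return 0;
}
```

With fixed-size entries, a reader needing one page location could issue a
single small ranged read at a computed offset, instead of buffering and
sequentially decoding the whole index the way `GetAllOffsets` above has to.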