emkornfield commented on code in PR #39153:
URL: https://github.com/apache/arrow/pull/39153#discussion_r1423502193
##########
cpp/src/parquet/file_reader.cc:
##########
@@ -106,41 +137,23 @@ std::shared_ptr<ColumnReader>
RowGroupReader::ColumnWithExposeEncoding(
int i, ExposedEncoding encoding_to_expose) {
std::shared_ptr<ColumnReader> reader = Column(i);
- if (encoding_to_expose == ExposedEncoding::DICTIONARY) {
- // Check the encoding_stats to see if all data pages are dictionary
encoded.
- std::unique_ptr<ColumnChunkMetaData> col = metadata()->ColumnChunk(i);
- const std::vector<PageEncodingStats>& encoding_stats =
col->encoding_stats();
- if (encoding_stats.empty()) {
- // Some parquet files may have empty encoding_stats. In this case we are
- // not sure whether all data pages are dictionary encoded. So we do not
- // enable exposing dictionary.
- return reader;
- }
- // The 1st page should be the dictionary page.
- if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE ||
- (encoding_stats[0].encoding != Encoding::PLAIN &&
- encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) {
- return reader;
- }
- // The following pages should be dictionary encoded data pages.
- for (size_t idx = 1; idx < encoding_stats.size(); ++idx) {
- if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY &&
- encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) ||
- (encoding_stats[idx].page_type != PageType::DATA_PAGE &&
- encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) {
- return reader;
- }
- }
- } else {
- // Exposing other encodings are not supported for now.
- return reader;
+ if (encoding_to_expose == ExposedEncoding::DICTIONARY &&
+ IsColumnChunkFullyDictionaryEncoded(*metadata()->ColumnChunk(i))) {
+ // Set exposed encoding.
+ reader->SetExposedEncoding(encoding_to_expose);
}
- // Set exposed encoding.
- reader->SetExposedEncoding(encoding_to_expose);
return reader;
}
+std::shared_ptr<internal::RecordReader>
RowGroupReader::RecordReaderWithExposeEncoding(
+ int i, ExposedEncoding encoding_to_expose) {
+ return RecordReader(
+ i,
+ /*read_dictionary=*/encoding_to_expose == ExposedEncoding::DICTIONARY &&
+ IsColumnChunkFullyDictionaryEncoded(*metadata()->ColumnChunk(i)));
Review Comment:
I guess it depends on how expensive the initialization is?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]