wgtmac commented on code in PR #40594: URL: https://github.com/apache/arrow/pull/40594#discussion_r1860754016
########## cpp/src/parquet/page_index_test.cc: ########## @@ -446,23 +451,76 @@ TEST(PageIndex, WriteOffsetIndex) { /// Verify the data of the offset index. for (const auto& offset_index : offset_indexes) { ASSERT_EQ(num_pages, offset_index->page_locations().size()); + if (write_size_stats) { + ASSERT_EQ(num_pages, offset_index->unencoded_byte_array_data_bytes().size()); + } else { + ASSERT_TRUE(offset_index->unencoded_byte_array_data_bytes().empty()); + } for (size_t i = 0; i < num_pages; ++i) { const auto& page_location = offset_index->page_locations().at(i); ASSERT_EQ(offsets[i] + final_position, page_location.offset); ASSERT_EQ(page_sizes[i], page_location.compressed_page_size); ASSERT_EQ(first_row_indices[i], page_location.first_row_index); + if (write_size_stats) { + ASSERT_EQ(unencoded_byte_array_lengths[i], + offset_index->unencoded_byte_array_data_bytes()[i]); + } } } } +TEST(PageIndex, WriteOffsetIndexWithoutSizeStats) { + TestWriteOffsetIndex(/*write_size_stats=*/false); +} + +TEST(PageIndex, WriteOffsetIndexWithSizeStats) { + TestWriteOffsetIndex(/*write_size_stats=*/true); +} + +struct PageLevelHistogram { + std::vector<int16_t> rep_levels; + std::vector<int16_t> def_levels; +}; + +std::unique_ptr<SizeStatistics> ConstructFakeSizeStatistics( + const ColumnDescriptor* descr, const PageLevelHistogram& page_level_histogram) { + auto stats = MakeSizeStatistics(descr); + for (int16_t level = 0; level <= descr->max_repetition_level(); ++level) { + stats->repetition_level_histogram[level] = page_level_histogram.rep_levels[level]; + } + for (int16_t level = 0; level <= descr->max_definition_level(); ++level) { + stats->definition_level_histogram[level] = page_level_histogram.def_levels[level]; + } + return stats; +} + +void VerifyPageLevelHistogram(int16_t max_level, size_t page_id, + const std::vector<int16_t>& expected_page_levels, Review Comment: Now, it isn't. `expected_page_levels.size() == (max_level + 1) * num_pages`. ########## cpp/src/parquet/page_index_test.cc: ########## @@ -446,23 +451,76 @@ TEST(PageIndex, WriteOffsetIndex) { /// Verify the data of the offset index. for (const auto& offset_index : offset_indexes) { ASSERT_EQ(num_pages, offset_index->page_locations().size()); + if (write_size_stats) { + ASSERT_EQ(num_pages, offset_index->unencoded_byte_array_data_bytes().size()); + } else { + ASSERT_TRUE(offset_index->unencoded_byte_array_data_bytes().empty()); + } for (size_t i = 0; i < num_pages; ++i) { const auto& page_location = offset_index->page_locations().at(i); ASSERT_EQ(offsets[i] + final_position, page_location.offset); ASSERT_EQ(page_sizes[i], page_location.compressed_page_size); ASSERT_EQ(first_row_indices[i], page_location.first_row_index); + if (write_size_stats) { + ASSERT_EQ(unencoded_byte_array_lengths[i], + offset_index->unencoded_byte_array_data_bytes()[i]); + } } } } +TEST(PageIndex, WriteOffsetIndexWithoutSizeStats) { + TestWriteOffsetIndex(/*write_size_stats=*/false); +} + +TEST(PageIndex, WriteOffsetIndexWithSizeStats) { + TestWriteOffsetIndex(/*write_size_stats=*/true); +} + +struct PageLevelHistogram { + std::vector<int16_t> rep_levels; + std::vector<int16_t> def_levels; +}; + +std::unique_ptr<SizeStatistics> ConstructFakeSizeStatistics( + const ColumnDescriptor* descr, const PageLevelHistogram& page_level_histogram) { + auto stats = MakeSizeStatistics(descr); + for (int16_t level = 0; level <= descr->max_repetition_level(); ++level) { + stats->repetition_level_histogram[level] = page_level_histogram.rep_levels[level]; + } + for (int16_t level = 0; level <= descr->max_definition_level(); ++level) { + stats->definition_level_histogram[level] = page_level_histogram.def_levels[level]; + } + return stats; +} + +void VerifyPageLevelHistogram(int16_t max_level, size_t page_id, + const std::vector<int16_t>& expected_page_levels, Review Comment: No, it isn't. `expected_page_levels.size() == (max_level + 1) * num_pages`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org