This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1fcc89240d GH-45227: [C++][Parquet] Enable Size Stats and Page Index
by default (#45249)
1fcc89240d is described below
commit 1fcc89240db4fe0ad798498e7410668423846118
Author: Gang Wu <[email protected]>
AuthorDate: Tue Jan 21 17:34:44 2025 +0800
GH-45227: [C++][Parquet] Enable Size Stats and Page Index by default
(#45249)
### Rationale for this change
Benchmark data shows that enabling the page index and size statistics by
default does not incur a significant penalty.
### What changes are included in this PR?
Make the Parquet writer generate the page index and size statistics by default.
### Are these changes tested?
Yes, the changes pass CI.
### Are there any user-facing changes?
No.
* GitHub Issue: #45227
Authored-by: Gang Wu <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 9 +++-
cpp/src/parquet/arrow/size_stats_benchmark.cc | 56 ++++++++++++++---------
cpp/src/parquet/properties.h | 6 ++-
3 files changed, 46 insertions(+), 25 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index cedcebbfb6..47a00016b9 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4371,6 +4371,7 @@ TEST_P(TestArrowWriteDictionary, Statistics) {
->data_page_version(this->GetParquetDataPageVersion())
->write_batch_size(2)
->data_pagesize(2)
+ ->disable_write_page_index()
->build();
std::unique_ptr<FileWriter> writer;
ASSERT_OK_AND_ASSIGN(
@@ -4476,6 +4477,7 @@ TEST_P(TestArrowWriteDictionary,
StatisticsUnifiedDictionary) {
->data_page_version(this->GetParquetDataPageVersion())
->write_batch_size(3)
->data_pagesize(3)
+ ->disable_write_page_index()
->build();
std::unique_ptr<FileWriter> writer;
ASSERT_OK_AND_ASSIGN(
@@ -5290,7 +5292,10 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) {
auto pool = ::arrow::default_memory_pool();
auto sink = CreateOutputStream();
// Limit the max number of rows in a row group to 10
- auto writer_properties =
WriterProperties::Builder().max_row_group_length(10)->build();
+ auto writer_properties = WriterProperties::Builder()
+ .max_row_group_length(10)
+ ->disable_write_page_index()
+ ->build();
auto arrow_writer_properties = default_arrow_writer_properties();
// Prepare schema
@@ -5346,7 +5351,7 @@ TEST(TestArrowReadWrite, WriteAndReadRecordBatch) {
ASSERT_EQ(10, file_metadata->RowGroup(0)->num_rows());
ASSERT_EQ(2, file_metadata->RowGroup(1)->num_rows());
- // Verify that page index is not written by default.
+ // Verify that page index is not written.
for (int i = 0; i < num_row_groups; ++i) {
auto row_group_metadata = file_metadata->RowGroup(i);
for (int j = 0; j < row_group_metadata->num_columns(); ++j) {
diff --git a/cpp/src/parquet/arrow/size_stats_benchmark.cc
b/cpp/src/parquet/arrow/size_stats_benchmark.cc
index d43a3737b1..c5c95fc614 100644
--- a/cpp/src/parquet/arrow/size_stats_benchmark.cc
+++ b/cpp/src/parquet/arrow/size_stats_benchmark.cc
@@ -80,12 +80,16 @@ int64_t GetTotalPageIndexSize(const
std::shared_ptr<::parquet::FileMetaData>& me
}
void WriteColumn(::benchmark::State& state, const
std::shared_ptr<::arrow::Table>& table,
- SizeStatisticsLevel stats_level) {
+ SizeStatisticsLevel stats_level, bool enable_page_index) {
// Use the fastest possible encoding and compression settings, to better
exhibit
// the size statistics overhead.
- auto properties = WriterProperties::Builder()
- .enable_statistics()
- ->enable_write_page_index()
+ auto builder = WriterProperties::Builder();
+ if (enable_page_index) {
+ builder.enable_write_page_index();
+ } else {
+ builder.disable_write_page_index();
+ }
+ auto properties = builder.enable_statistics()
->disable_dictionary()
->encoding(Encoding::PLAIN)
->set_size_statistics_level(stats_level)
@@ -113,17 +117,17 @@ void WriteColumn(::benchmark::State& state, const
std::shared_ptr<::arrow::Table
state.SetBytesProcessed(state.iterations() * GetTotalBytes(table));
}
-template <SizeStatisticsLevel level, typename ArrowType>
+template <SizeStatisticsLevel level, typename ArrowType, bool
enable_page_index>
void BM_WritePrimitiveColumn(::benchmark::State& state) {
::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
auto type = std::make_shared<ArrowType>();
auto array = generator.ArrayOf(type, kBenchmarkSize, kNullProbability);
auto table = ::arrow::Table::Make(
::arrow::schema({::arrow::field("column", type, kNullProbability > 0)}),
{array});
- WriteColumn(state, table, level);
+ WriteColumn(state, table, level, enable_page_index);
}
-template <SizeStatisticsLevel level, typename ArrowType>
+template <SizeStatisticsLevel level, typename ArrowType, bool
enable_page_index>
void BM_WriteListColumn(::benchmark::State& state) {
::arrow::random::RandomArrayGenerator generator(/*seed=*/42);
auto element_type = std::make_shared<ArrowType>();
@@ -133,33 +137,43 @@ void BM_WriteListColumn(::benchmark::State& state) {
auto table = ::arrow::Table::Make(
::arrow::schema({::arrow::field("column", list_type, kNullProbability >
0)}),
{list_array});
- WriteColumn(state, table, level);
+ WriteColumn(state, table, level, enable_page_index);
}
-BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
- ::arrow::Int64Type);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
::arrow::Int64Type,
+ /*enable_page_index=*/false);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
::arrow::Int64Type,
+ /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
- ::arrow::Int64Type);
+ ::arrow::Int64Type, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn,
SizeStatisticsLevel::PageAndColumnChunk,
- ::arrow::Int64Type);
+ ::arrow::Int64Type, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
- ::arrow::StringType);
+ ::arrow::StringType, /*enable_page_index=*/false);
+BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::None,
+ ::arrow::StringType, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn, SizeStatisticsLevel::ColumnChunk,
- ::arrow::StringType);
+ ::arrow::StringType, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WritePrimitiveColumn,
SizeStatisticsLevel::PageAndColumnChunk,
- ::arrow::StringType);
+ ::arrow::StringType, /*enable_page_index=*/true);
-BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None,
::arrow::Int64Type);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None,
::arrow::Int64Type,
+ /*enable_page_index=*/false);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None,
::arrow::Int64Type,
+ /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
- ::arrow::Int64Type);
+ ::arrow::Int64Type, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
- ::arrow::Int64Type);
+ ::arrow::Int64Type, /*enable_page_index=*/true);
-BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None,
::arrow::StringType);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None,
::arrow::StringType,
+ /*enable_page_index=*/false);
+BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::None,
::arrow::StringType,
+ /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::ColumnChunk,
- ::arrow::StringType);
+ ::arrow::StringType, /*enable_page_index=*/true);
BENCHMARK_TEMPLATE(BM_WriteListColumn, SizeStatisticsLevel::PageAndColumnChunk,
- ::arrow::StringType);
+ ::arrow::StringType, /*enable_page_index=*/true);
} // namespace parquet::benchmark
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index edaf28cd92..8ae3660014 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -164,7 +164,9 @@ static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE =
Compression::UNCOMPRESSED;
-static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false;
+static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
+static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
+ SizeStatisticsLevel::PageAndColumnChunk;
class PARQUET_EXPORT ColumnProperties {
public:
@@ -258,7 +260,7 @@ class PARQUET_EXPORT WriterProperties {
created_by_(DEFAULT_CREATED_BY),
store_decimal_as_integer_(false),
page_checksum_enabled_(false),
- size_statistics_level_(SizeStatisticsLevel::None) {}
+ size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL) {}
explicit Builder(const WriterProperties& properties)
: pool_(properties.memory_pool()),