This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 42f27abf56 GH-47978: [C++][Parquet][CI] Add more compression codecs to fuzzing seed corpus (#47979)
42f27abf56 is described below
commit 42f27abf56ac2795afc37a506a6e74f07b278b9c
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Oct 29 17:31:21 2025 +0100
GH-47978: [C++][Parquet][CI] Add more compression codecs to fuzzing seed corpus (#47979)
### Rationale for this change
1. Add more compression codecs to the seed corpus
2. Tweak the fuzz target to make fuzzing slightly faster (~30% locally according to my measurements), which will allow testing more mutations per day (the main trick is sketched below)
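The speedup mainly comes from parsing the Parquet file metadata only once and reusing it for each per-batch-size reader, as the `reader.cc` hunk below shows. Here is a minimal sketch of that pattern in isolation; the function name `ReadWithSharedMetadata` is illustrative only, while the Parquet C++ calls are the ones used in the diff:

```cpp
#include "arrow/buffer.h"
#include "arrow/io/memory.h"
#include "arrow/status.h"
#include "parquet/file_reader.h"

// Sketch: decode the Thrift footer once, then reuse the cached
// FileMetaData for every subsequent reader instantiation.
arrow::Status ReadWithSharedMetadata(std::shared_ptr<arrow::Buffer> buffer) {
  auto file = std::make_shared<arrow::io::BufferReader>(buffer);
  // The first Open() parses the footer; keep its metadata around
  std::shared_ptr<parquet::FileMetaData> metadata =
      parquet::ParquetFileReader::Open(file)->metadata();
  for (int i = 0; i < 3; ++i) {
    // Passing the cached metadata lets Open() skip footer parsing
    auto reader = parquet::ParquetFileReader::Open(
        file, parquet::default_reader_properties(), metadata);
    // ... exercise the reader, e.g. with a different batch size ...
  }
  return arrow::Status::OK();
}
```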
### Are these changes tested?
Not specifically by CI, but hopefully they will make fuzzing more efficient.
### Are there any user-facing changes?
No.
* GitHub Issue: #47978
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/arrow/generate_fuzz_corpus.cc | 69 +++++++++++++++++++++------
cpp/src/parquet/arrow/reader.cc | 25 +++++++---
cpp/src/parquet/properties.h | 3 --
3 files changed, 73 insertions(+), 24 deletions(-)
diff --git a/cpp/src/parquet/arrow/generate_fuzz_corpus.cc b/cpp/src/parquet/arrow/generate_fuzz_corpus.cc
index 3f9899dba8..59b1fc59e1 100644
--- a/cpp/src/parquet/arrow/generate_fuzz_corpus.cc
+++ b/cpp/src/parquet/arrow/generate_fuzz_corpus.cc
@@ -81,25 +81,68 @@ std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
}
std::vector<WriteConfig> GetWriteConfigurations() {
+ auto default_properties_builder = [] {
+ auto builder = WriterProperties::Builder();
+ // Override current default of 1MB
+ builder.data_pagesize(10'000);
+ // Reduce max dictionary page size so that fewer pages are dict-encoded.
+ builder.dictionary_pagesize_limit(1'000);
+ // Emit various physical types for decimal columns
+ builder.enable_store_decimal_as_integer();
+ // DataPageV2 has more interesting features such as selective compression
+ builder.data_page_version(parquet::ParquetDataPageVersion::V2);
+ return builder;
+ };
+
+ auto default_arrow_properties_builder = [] {
+ auto builder = ArrowWriterProperties::Builder();
+ // Store the Arrow schema so as to exercise more data types when reading
+ builder.store_schema();
+ return builder;
+ };
+
// clang-format off
- auto w_brotli = WriterProperties::Builder()
- .disable_dictionary("no_dict")
- ->compression("compressed", Compression::BROTLI)
- // Override current default of 1MB
- ->data_pagesize(20'000)
- // Reduce max dictionary page size so that less pages are dict-encoded.
- ->dictionary_pagesize_limit(1'000)
- // Emit various physical types for decimal columns
- ->enable_store_decimal_as_integer()
+ auto w_uncompressed = default_properties_builder()
+ .build();
+ // compressed columns with dictionary disabled
+ auto w_brotli = default_properties_builder()
+ .disable_dictionary()
+ ->compression(Compression::BROTLI)
+ ->build();
+ auto w_gzip = default_properties_builder()
+ .disable_dictionary()
+ ->compression(Compression::GZIP)
->build();
- // Store the Arrow schema so as to exercise more data types when reading
- auto a_default = ArrowWriterProperties::Builder{}
- .store_schema()
+ auto w_lz4 = default_properties_builder()
+ .disable_dictionary()
+ ->compression(Compression::LZ4)
->build();
+ auto w_snappy = default_properties_builder()
+ .disable_dictionary()
+ ->compression(Compression::SNAPPY)
+ ->build();
+ auto w_zstd = default_properties_builder()
+ .disable_dictionary()
+ ->compression(Compression::ZSTD)
+ ->build();
+ // v1 data pages
+ auto w_pages_v1 = default_properties_builder()
+ .disable_dictionary()
+ ->compression(Compression::LZ4)
+ ->data_page_version(parquet::ParquetDataPageVersion::V1)
+ ->build();
+
+ auto a_default = default_arrow_properties_builder().build();
// clang-format on
std::vector<WriteConfig> configs;
+ configs.push_back({w_uncompressed, a_default});
configs.push_back({w_brotli, a_default});
+ configs.push_back({w_gzip, a_default});
+ configs.push_back({w_lz4, a_default});
+ configs.push_back({w_snappy, a_default});
+ configs.push_back({w_zstd, a_default});
+ configs.push_back({w_pages_v1, a_default});
return configs;
}
@@ -255,8 +298,6 @@ Result<std::vector<Column>> ExampleColumns(int32_t length,
// TODO extension types: UUID, JSON, GEOMETRY, GEOGRAPHY
- // A non-dict-encoded column (see GetWriteConfigurations)
- columns.push_back({"no_dict", gen.String(length, 0, 30, null_probability)});
// A column that should be quite compressible (see GetWriteConfigurations)
columns.push_back({"compressed", gen.Int64(length, -10, 10,
null_probability)});
diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index d42fdc5034..1e5419e3cb 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -1414,21 +1414,32 @@ Status FuzzReader(std::unique_ptr<FileReader> reader) {
} // namespace
Status FuzzReader(const uint8_t* data, int64_t size) {
- auto buffer = std::make_shared<::arrow::Buffer>(data, size);
Status st;
- for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 1, 13, 300}) {
- auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
- FileReaderBuilder builder;
+
+ auto buffer = std::make_shared<::arrow::Buffer>(data, size);
+ auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
+ auto pool = ::arrow::default_memory_pool();
+
+ // Read Parquet file metadata only once, which will reduce iteration time slightly
+ std::shared_ptr<FileMetaData> pq_md;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ pq_md = ParquetFileReader::Open(file)->metadata();
+ END_PARQUET_CATCH_EXCEPTIONS
+
+ // Note that very small batch sizes probably make fuzzing slower
+ for (auto batch_size : std::vector<std::optional<int>>{std::nullopt, 13, 300}) {
ArrowReaderProperties properties;
if (batch_size) {
properties.set_batch_size(batch_size.value());
}
- builder.properties(properties);
- RETURN_NOT_OK(builder.Open(std::move(file)));
+ std::unique_ptr<ParquetFileReader> pq_file_reader;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ pq_file_reader = ParquetFileReader::Open(file, default_reader_properties(), pq_md);
+ END_PARQUET_CATCH_EXCEPTIONS
std::unique_ptr<FileReader> reader;
- RETURN_NOT_OK(builder.Build(&reader));
+ RETURN_NOT_OK(FileReader::Make(pool, std::move(pq_file_reader), properties, &reader));
st &= FuzzReader(std::move(reader));
}
return st;
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index 51b549df22..6ed22099d4 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -324,8 +324,6 @@ class PARQUET_EXPORT WriterProperties {
content_defined_chunking_options_(
properties.content_defined_chunking_options()) {}
- virtual ~Builder() {}
-
/// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.
///
/// Optimize parquet files for content addressable storage (CAS) systems by writing
@@ -1198,7 +1196,6 @@ class PARQUET_EXPORT ArrowWriterProperties {
use_threads_(kArrowDefaultUseThreads),
executor_(NULLPTR),
write_time_adjusted_to_utc_(false) {}
- virtual ~Builder() = default;
/// \brief Disable writing legacy int96 timestamps (default disabled).
Builder* disable_deprecated_int96_timestamps() {