This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 64f2055ffb GH-47597: [C++][Parquet] Fuzz more data types (#47621)
64f2055ffb is described below
commit 64f2055ffb68e5077420f4253e76d78952438cab
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Sep 24 09:37:06 2025 +0200
GH-47597: [C++][Parquet] Fuzz more data types (#47621)
### Rationale for this change
Our Parquet fuzzing seed corpus lacks many of the supported data types. This
probably makes the code coverage of the Parquet fuzzing tests suboptimal.
### What changes are included in this PR?
This PR adds most supported data types to the Parquet seed corpus.
The only omissions are the extension types (JSON, GEOMETRY...), since
generating valid data for them requires more care.
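For context, every column added by the generator follows the same basic
pattern: draw a random array from `RandomArrayGenerator`, derive a field whose
nullability matches the data, and assemble the columns into a `RecordBatch`.
A minimal self-contained sketch of that pattern (illustrative only, not code
from this commit; assumes linking against the Arrow testing library):

```cpp
// Sketch of the corpus-generation pattern: one random Int16 column with
// ~20% nulls, wrapped into a single-column RecordBatch.
#include <iostream>
#include <memory>

#include "arrow/api.h"
#include "arrow/testing/random.h"

int main() {
  constexpr int64_t kBatchSize = 1000;
  constexpr double kNullProbability = 0.2;
  arrow::random::RandomArrayGenerator gen(/*seed=*/42);

  std::shared_ptr<arrow::Array> int16_array =
      gen.Int16(kBatchSize, -30000, 30000, kNullProbability);
  // Mark the field non-nullable when the generated data happens to contain
  // no nulls, so that readers exercise both code paths.
  auto f = arrow::field("col_1", int16_array->type(),
                        /*nullable=*/int16_array->null_count() != 0);
  auto batch = arrow::RecordBatch::Make(arrow::schema({f}), kBatchSize,
                                        {int16_array});
  std::cout << batch->ToString() << std::endl;
  return 0;
}
```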
### Are these changes tested?
Not by regular unit tests, but the OSS-Fuzz CI job runs the corpus generation
executable. I have also validated the new seed corpus locally.
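A local validation run could look roughly like the following (a sketch, not
commands taken from this PR; the binary location depends on the build type
and CMake generator):

```bash
# ARROW_BUILD_FUZZING_UTILITIES pulls in its dependencies (ARROW_TESTING,
# Brotli, LZ4, Zstandard) via the DEPENDS mechanism shown in the diff below.
cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release \
      -DARROW_PARQUET=ON -DARROW_BUILD_FUZZING_UTILITIES=ON
cmake --build cpp/build --target parquet-arrow-generate-fuzz-corpus
# Writes one Parquet sample per (batch, write configuration) pair.
cpp/build/release/parquet-arrow-generate-fuzz-corpus parquet_corpus/
```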
### Are there any user-facing changes?
No.
* GitHub Issue: #47597
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/cmake_modules/DefineOptions.cmake | 18 +-
cpp/src/arrow/ipc/CMakeLists.txt | 7 +-
cpp/src/parquet/arrow/CMakeLists.txt | 2 +-
cpp/src/parquet/arrow/generate_fuzz_corpus.cc | 245 ++++++++++++++++++++------
4 files changed, 203 insertions(+), 69 deletions(-)
diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake
index faac95c400..4f0981ef1c 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -213,7 +213,7 @@ takes precedence over ccache if a storage backend is configured" ON)
define_option(ARROW_ENABLE_THREADING "Enable threading in Arrow core" ON)
#----------------------------------------------------------------------
- set_option_category("Test and benchmark")
+ set_option_category("Tests and benchmarks")
define_option(ARROW_BUILD_EXAMPLES "Build the Arrow examples" OFF)
@@ -259,12 +259,20 @@ takes precedence over ccache if a storage backend is configured" ON)
"shared"
"static")
- define_option(ARROW_FUZZING
- "Build Arrow Fuzzing executables"
+ define_option(ARROW_BUILD_FUZZING_UTILITIES
+ "Build command line utilities for fuzzing"
OFF
DEPENDS
ARROW_TESTING
- ARROW_WITH_BROTLI)
+ ARROW_WITH_BROTLI
+ ARROW_WITH_LZ4
+ ARROW_WITH_ZSTD)
+
+ define_option(ARROW_FUZZING
+ "Build Arrow fuzz targets"
+ OFF
+ DEPENDS
+ ARROW_BUILD_FUZZING_UTILITIES)
define_option(ARROW_LARGE_MEMORY_TESTS "Enable unit tests which use large memory" OFF)
@@ -301,7 +309,7 @@ takes precedence over ccache if a storage backend is configured" ON)
DEPENDS
ARROW_FILESYSTEM)
- define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF)
+ define_option(ARROW_BUILD_UTILITIES "Build Arrow command line utilities" OFF)
define_option(ARROW_COMPUTE "Build all Arrow Compute kernels" OFF)
diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt
index 8cbe30f5ae..6e73c71d89 100644
--- a/cpp/src/arrow/ipc/CMakeLists.txt
+++ b/cpp/src/arrow/ipc/CMakeLists.txt
@@ -70,12 +70,7 @@ endif()
add_arrow_benchmark(read_write_benchmark PREFIX "arrow-ipc")
-if(ARROW_FUZZING
- OR (ARROW_BUILD_UTILITIES
- AND ARROW_TESTING
- AND ARROW_WITH_LZ4
- AND ARROW_WITH_ZSTD
- ))
+if(ARROW_BUILD_FUZZING_UTILITIES)
add_executable(arrow-ipc-generate-fuzz-corpus generate_fuzz_corpus.cc)
target_link_libraries(arrow-ipc-generate-fuzz-corpus ${ARROW_UTIL_LIB} ${ARROW_TEST_LINK_LIBS})
diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt
index ac708a0e43..3913d5fe3e 100644
--- a/cpp/src/parquet/arrow/CMakeLists.txt
+++ b/cpp/src/parquet/arrow/CMakeLists.txt
@@ -17,7 +17,7 @@
arrow_install_all_headers("parquet/arrow")
-if(ARROW_FUZZING)
+if(ARROW_BUILD_FUZZING_UTILITIES)
add_executable(parquet-arrow-generate-fuzz-corpus generate_fuzz_corpus.cc)
if(ARROW_BUILD_STATIC)
target_link_libraries(parquet-arrow-generate-fuzz-corpus parquet_static
diff --git a/cpp/src/parquet/arrow/generate_fuzz_corpus.cc b/cpp/src/parquet/arrow/generate_fuzz_corpus.cc
index 33c3a1461b..acee0d0ff9 100644
--- a/cpp/src/parquet/arrow/generate_fuzz_corpus.cc
+++ b/cpp/src/parquet/arrow/generate_fuzz_corpus.cc
@@ -20,12 +20,15 @@
// to make fuzzing more efficient.
#include <cstdlib>
+#include <functional>
#include <iostream>
#include <memory>
+#include <sstream>
#include <string>
#include <vector>
#include "arrow/array.h"
+#include "arrow/array/util.h"
#include "arrow/io/file.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
@@ -41,47 +44,161 @@ namespace arrow {
using ::arrow::internal::CreateDir;
using ::arrow::internal::PlatformFilename;
+using ::arrow::util::Float16;
+using ::parquet::ArrowWriterProperties;
using ::parquet::WriterProperties;
static constexpr int32_t kBatchSize = 1000;
+// This will emit several row groups
static constexpr int32_t kChunkSize = kBatchSize * 3 / 8;
-std::shared_ptr<WriterProperties> GetWriterProperties() {
- WriterProperties::Builder builder{};
- builder.disable_dictionary("no_dict");
- builder.compression("compressed", Compression::BROTLI);
- return builder.build();
+struct WriteConfig {
+ std::shared_ptr<WriterProperties> writer_properties;
+ std::shared_ptr<ArrowWriterProperties> arrow_writer_properties;
+};
+
+struct Column {
+ std::string name;
+ std::shared_ptr<Array> array;
+
+ static std::function<std::string()> NameGenerator() {
+ struct Gen {
+ int num_col = 1;
+
+ std::string operator()() {
+ std::stringstream ss;
+ ss << "col_" << num_col++;
+ return std::move(ss).str();
+ }
+ };
+ return Gen{};
+ }
+};
+
+std::vector<WriteConfig> GetWriteConfigurations() {
+ // clang-format off
+ auto w_brotli = WriterProperties::Builder()
+ .disable_dictionary("no_dict")
+ ->compression("compressed", Compression::BROTLI)
+ // Override current default of 1MB
+ ->data_pagesize(20'000)
+ // Reduce max dictionary page size so that fewer pages are dict-encoded.
+ ->dictionary_pagesize_limit(1'000)
+ // Emit various physical types for decimal columns
+ ->enable_store_decimal_as_integer()
+ ->build();
+ // Store the Arrow schema so as to exercise more data types when reading
+ auto a_default = ArrowWriterProperties::Builder{}
+ .store_schema()
+ ->build();
+ // clang-format on
+
+ std::vector<WriteConfig> configs;
+ configs.push_back({w_brotli, a_default});
+ return configs;
}
Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
constexpr double kNullProbability = 0.2;
random::RandomArrayGenerator gen(42);
- std::shared_ptr<Array> a, b, c, d, e, f, g, h, no_dict, compressed;
- std::shared_ptr<Field> f_a, f_b, f_c, f_d, f_e, f_f, f_g, f_h, f_no_dict, f_compressed;
+ auto name_gen = Column::NameGenerator();
- a = gen.Int16(kBatchSize, -10000, 10000, kNullProbability);
- f_a = field("a", a->type());
+ auto field_for_array_named = [&](const std::shared_ptr<Array>& array,
+ std::string name) {
+ return field(std::move(name), array->type(), /*nullable=*/array->null_count() != 0);
+ };
+ auto field_for_array = [&](const std::shared_ptr<Array>& array) {
+ return field_for_array_named(array, name_gen());
+ };
- b = gen.Float64(kBatchSize, -1e10, 1e10, /*null_probability=*/0.0);
- f_b = field("b", b->type());
+ std::vector<Column> columns;
+
+ auto int16_array = gen.Int16(kBatchSize, -30000, 30000, kNullProbability);
+ auto int32_array = gen.Int32(kBatchSize, -2000000000, 2000000000, kNullProbability);
+ auto int64_array = gen.Int64(kBatchSize, -9000000000000000000LL, 9000000000000000000LL,
+ kNullProbability);
+ auto non_null_float64_array =
+ gen.Float64(kBatchSize, -1e10, 1e10, /*null_probability=*/0.0);
+ auto tiny_strings_array = gen.String(kBatchSize, 0, 3, kNullProbability);
+ auto large_strings_array =
+ gen.LargeString(kBatchSize, /*min_length=*/0, /*max_length=*/20, kNullProbability);
+ auto string_view_array =
+ gen.StringView(kBatchSize, /*min_length=*/8, /*max_length=*/30, kNullProbability);
+ ARROW_ASSIGN_OR_RAISE(auto null_array, MakeArrayOfNull(null(), kBatchSize));
+
+ // Null
+ columns.push_back({name_gen(), null_array});
+ // Numerics
+ columns.push_back({name_gen(), int16_array});
+ columns.push_back({name_gen(), non_null_float64_array});
+ columns.push_back(
+ {name_gen(), gen.Float16(kBatchSize, Float16::FromDouble(-1e4),
+ Float16::FromDouble(1e4), kNullProbability)});
+ columns.push_back({name_gen(), int64_array});
+ // Decimals
+ columns.push_back(
+ {name_gen(), gen.Decimal128(decimal128(24, 7), kBatchSize, kNullProbability)});
+ columns.push_back(
+ {name_gen(), gen.Decimal256(decimal256(43, 7), kBatchSize, kNullProbability)});
+ columns.push_back(
+ {name_gen(), gen.Decimal64(decimal64(12, 3), kBatchSize, kNullProbability)});
+ columns.push_back(
+ {name_gen(), gen.Decimal32(decimal32(7, 3), kBatchSize, kNullProbability)});
+
+ // Timestamp
+ for (auto unit : TimeUnit::values()) {
+ ARROW_ASSIGN_OR_RAISE(auto timestamps, int64_array->View(timestamp(unit, "UTC")));
+ columns.push_back({name_gen(), timestamps});
+ }
+ // Time32, time64
+ ARROW_ASSIGN_OR_RAISE(
+ auto time32_s,
+ gen.Int32(kBatchSize, 0, 86399, kNullProbability)->View(time32(TimeUnit::SECOND)));
+ columns.push_back({name_gen(), time32_s});
+ ARROW_ASSIGN_OR_RAISE(auto time32_ms,
+ gen.Int32(kBatchSize, 0, 86399999, kNullProbability)
+ ->View(time32(TimeUnit::MILLI)));
+ columns.push_back({name_gen(), time32_ms});
+ ARROW_ASSIGN_OR_RAISE(auto time64_us,
+ gen.Int64(kBatchSize, 0, 86399999999LL, kNullProbability)
+ ->View(time64(TimeUnit::MICRO)));
+ columns.push_back({name_gen(), time64_us});
+ ARROW_ASSIGN_OR_RAISE(auto time64_ns,
+ gen.Int64(kBatchSize, 0, 86399999999999LL, kNullProbability)
+ ->View(time64(TimeUnit::NANO)));
+ columns.push_back({name_gen(), time64_ns});
+ // Date32, date64
+ ARROW_ASSIGN_OR_RAISE(
+ auto date32_array,
+ gen.Int32(kBatchSize, -1000 * 365, 1000 * 365, kNullProbability)->View(date32()));
+ columns.push_back({name_gen(), date32_array});
+ columns.push_back(
+ {name_gen(), gen.Date64(kBatchSize, -1000 * 365, 1000 * 365, kNullProbability)});
// A column of tiny strings that will hopefully trigger dict encoding
- c = gen.String(kBatchSize, 0, 3, kNullProbability);
- f_c = field("c", c->type());
+ columns.push_back({name_gen(), tiny_strings_array});
+ columns.push_back({name_gen(), large_strings_array});
+ columns.push_back({name_gen(), string_view_array});
+ columns.push_back(
+ {name_gen(), gen.FixedSizeBinary(kBatchSize, /*byte_width=*/7, kNullProbability)});
- // A column of lists
+ // A column of lists/large lists
{
auto values = gen.Int64(kBatchSize * 10, -10000, 10000, kNullProbability);
auto offsets = gen.Offsets(kBatchSize + 1, 0, static_cast<int32_t>(values->length()));
- ARROW_ASSIGN_OR_RAISE(d, ListArray::FromArrays(*offsets, *values));
+ ARROW_ASSIGN_OR_RAISE(auto lists, ListArray::FromArrays(*offsets, *values));
+ columns.push_back({name_gen(), lists});
+ auto large_offsets = gen.LargeOffsets(kBatchSize + 1, 0, values->length());
+ ARROW_ASSIGN_OR_RAISE(auto large_lists,
+ LargeListArray::FromArrays(*large_offsets, *values));
+ columns.push_back({name_gen(), large_lists});
}
- f_d = field("d", d->type());
-
// A column of a repeated constant that will hopefully trigger RLE encoding
- ARROW_ASSIGN_OR_RAISE(e, MakeArrayFromScalar(Int16Scalar(42), kBatchSize));
- f_e = field("e", e->type());
-
+ {
+ ARROW_ASSIGN_OR_RAISE(auto values, MakeArrayFromScalar(Int16Scalar(42), kBatchSize));
+ columns.push_back({name_gen(), values});
+ }
// A column of lists of lists
{
auto inner_values = gen.Int64(kBatchSize * 9, -10000, 10000, kNullProbability);
@@ -92,51 +209,62 @@ Result<std::shared_ptr<RecordBatch>> ExampleBatch1() {
ListArray::FromArrays(*inner_offsets, *inner_values));
auto offsets = gen.Offsets(
kBatchSize + 1, 0, static_cast<int32_t>(inner_lists->length()), kNullProbability);
- ARROW_ASSIGN_OR_RAISE(f, ListArray::FromArrays(*offsets, *inner_lists));
+ ARROW_ASSIGN_OR_RAISE(auto lists, ListArray::FromArrays(*offsets, *inner_lists));
+ columns.push_back({name_gen(), lists});
+ }
+ // A column of maps
+ {
+ constexpr auto kChildSize = kBatchSize * 3;
+ auto keys = gen.String(kChildSize, /*min_length=*/4, /*max_length=*/7,
+ /*null_probability=*/0);
+ auto values = gen.Float32(kChildSize, -1e10, 1e10, kNullProbability);
+ columns.push_back({name_gen(), gen.Map(keys, values, kBatchSize, kNullProbability)});
}
- f_f = field("f", f->type());
-
// A column of nested non-nullable structs
{
- ARROW_ASSIGN_OR_RAISE(
- auto inner_a,
- StructArray::Make({a, b}, std::vector<std::string>{"inner1_aa",
"inner1_ab"}));
- ARROW_ASSIGN_OR_RAISE(
- g, StructArray::Make({inner_a, c},
- {field("inner1_a", inner_a->type(),
/*nullable=*/false),
- field("inner1_c", c->type())}));
+ ARROW_ASSIGN_OR_RAISE(auto inner_a,
+ StructArray::Make({int16_array, non_null_float64_array},
+ {field_for_array(int16_array),
+ field_for_array(non_null_float64_array)}));
+ ARROW_ASSIGN_OR_RAISE(auto structs,
+ StructArray::Make({inner_a, tiny_strings_array},
+ {field_for_array(inner_a),
+ field_for_array(tiny_strings_array)}));
+ columns.push_back({name_gen(), structs});
}
- f_g = field("g", g->type(), /*nullable=*/false);
-
// A column of nested nullable structs
{
auto null_bitmap = gen.NullBitmap(kBatchSize, kNullProbability);
- ARROW_ASSIGN_OR_RAISE(
- auto inner_a,
- StructArray::Make({a, b}, std::vector<std::string>{"inner2_aa",
"inner2_ab"},
- std::move(null_bitmap)));
+ ARROW_ASSIGN_OR_RAISE(auto inner_a,
+ StructArray::Make({int16_array, non_null_float64_array},
+ {field_for_array(int16_array),
+ field_for_array(non_null_float64_array)},
+ std::move(null_bitmap)));
null_bitmap = gen.NullBitmap(kBatchSize, kNullProbability);
ARROW_ASSIGN_OR_RAISE(
- h,
- StructArray::Make({inner_a, c}, std::vector<std::string>{"inner2_a",
"inner2_c"},
+ auto structs,
+ StructArray::Make({inner_a, tiny_strings_array},
+ {field_for_array(inner_a), field_for_array(tiny_strings_array)},
std::move(null_bitmap)));
+ columns.push_back({name_gen(), structs});
}
- f_h = field("h", h->type());
- // A non-dict-encoded column (see GetWriterProperties)
- no_dict = gen.String(kBatchSize, 0, 30, kNullProbability);
- f_no_dict = field("no_dict", no_dict->type());
+ // TODO extension types: UUID, JSON, GEOMETRY, GEOGRAPHY
- // A non-dict-encoded column (see GetWriterProperties)
- compressed = gen.Int64(kBatchSize, -10, 10, kNullProbability);
- f_compressed = field("compressed", compressed->type());
+ // A non-dict-encoded column (see GetWriteConfigurations)
+ columns.push_back({"no_dict", gen.String(kBatchSize, 0, 30,
kNullProbability)});
+ // A column that should be quite compressible (see GetWriteConfigurations)
+ columns.push_back({"compressed", gen.Int64(kBatchSize, -10, 10,
kNullProbability)});
- auto schema =
- ::arrow::schema({f_a, f_b, f_c, f_d, f_e, f_f, f_g, f_h, f_compressed, f_no_dict});
+ FieldVector fields;
+ ArrayVector arrays;
+ for (const auto& col : columns) {
+ fields.push_back(field_for_array_named(col.array, col.name));
+ arrays.push_back(col.array);
+ }
auto md = key_value_metadata({"key1", "key2"}, {"value1", ""});
- schema = schema->WithMetadata(md);
- return RecordBatch::Make(schema, kBatchSize,
- {a, b, c, d, e, f, g, h, compressed, no_dict});
+ auto schema = ::arrow::schema(std::move(fields), std::move(md));
+ return RecordBatch::Make(std::move(schema), kBatchSize, std::move(arrays));
}
Result<std::vector<std::shared_ptr<RecordBatch>>> Batches() {
@@ -157,18 +285,21 @@ Status DoMain(const std::string& out_dir) {
ARROW_ASSIGN_OR_RAISE(auto batches, Batches());
- auto writer_properties = GetWriterProperties();
+ auto write_configs = GetWriteConfigurations();
for (const auto& batch : batches) {
RETURN_NOT_OK(batch->ValidateFull());
ARROW_ASSIGN_OR_RAISE(auto table, Table::FromRecordBatches({batch}));
- ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name()));
- std::cerr << sample_fn.ToString() << std::endl;
- ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString()));
- RETURN_NOT_OK(::parquet::arrow::WriteTable(*table, default_memory_pool(), file,
- kChunkSize, writer_properties));
- RETURN_NOT_OK(file->Close());
+ for (const auto& config : write_configs) {
+ ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name()));
+ std::cerr << sample_fn.ToString() << std::endl;
+ ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString()));
+ RETURN_NOT_OK(::parquet::arrow::WriteTable(*table, default_memory_pool(), file,
+ kChunkSize, config.writer_properties,
+ config.arrow_writer_properties));
+ RETURN_NOT_OK(file->Close());
+ }
}
return Status::OK();
}