This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f518d6beb0 GH-38041: [C++][CI] Improve IPC fuzzing seed corpus (#43621)
f518d6beb0 is described below

commit f518d6beb0c70f00688d08a3e70deff0d3c24c86
Author: Antoine Pitrou <[email protected]>
AuthorDate: Thu Aug 15 10:41:08 2024 +0200

    GH-38041: [C++][CI] Improve IPC fuzzing seed corpus (#43621)
    
    1. Add fuzz seeds with newer datatypes such as Run-End Encoded and String Views
    2. Add fuzz seeds with buffer compression
    3. Build seed corpus generation utilities even when fuzzing isn't enabled, for convenience
    
    * GitHub Issue: #38041
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/ipc/CMakeLists.txt                 |  7 +++-
 cpp/src/arrow/ipc/generate_fuzz_corpus.cc        | 44 +++++++++++++++++-------
 cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc |  2 +-
 3 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt
index 2fc9b145cc..9e0b1d723b 100644
--- a/cpp/src/arrow/ipc/CMakeLists.txt
+++ b/cpp/src/arrow/ipc/CMakeLists.txt
@@ -71,7 +71,12 @@ endif()
 
 add_arrow_benchmark(read_write_benchmark PREFIX "arrow-ipc")
 
-if(ARROW_FUZZING)
+if(ARROW_FUZZING
+   OR (ARROW_BUILD_UTILITIES
+       AND ARROW_TESTING
+       AND ARROW_WITH_LZ4
+       AND ARROW_WITH_ZSTD
+      ))
   add_executable(arrow-ipc-generate-fuzz-corpus generate_fuzz_corpus.cc)
   target_link_libraries(arrow-ipc-generate-fuzz-corpus ${ARROW_UTIL_LIB}
                         ${ARROW_TEST_LINK_LIBS})
diff --git a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc
index 682c352132..6ccf1155d1 100644
--- a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc
+++ b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc
@@ -33,11 +33,11 @@
 #include "arrow/record_batch.h"
 #include "arrow/result.h"
 #include "arrow/testing/extension_type.h"
+#include "arrow/util/compression.h"
 #include "arrow/util/io_util.h"
 #include "arrow/util/key_value_metadata.h"
 
-namespace arrow {
-namespace ipc {
+namespace arrow::ipc {
 
 using ::arrow::internal::CreateDir;
 using ::arrow::internal::PlatformFilename;
@@ -88,6 +88,13 @@ Result<std::vector<std::shared_ptr<RecordBatch>>> Batches() {
   batches.push_back(batch);
   RETURN_NOT_OK(test::MakeFixedSizeListRecordBatch(&batch));
   batches.push_back(batch);
+  RETURN_NOT_OK(test::MakeStringTypesRecordBatch(&batch));
+  batches.push_back(batch);
+  RETURN_NOT_OK(test::MakeUuid(&batch));
+  batches.push_back(batch);
+  RETURN_NOT_OK(test::MakeRunEndEncoded(&batch));
+  batches.push_back(batch);
+
   ARROW_ASSIGN_OR_RAISE(batch, MakeExtensionBatch());
   batches.push_back(batch);
   ARROW_ASSIGN_OR_RAISE(batch, MakeMapBatch());
@@ -97,13 +104,14 @@ Result<std::vector<std::shared_ptr<RecordBatch>>> Batches() {
 }
 
 Result<std::shared_ptr<Buffer>> SerializeRecordBatch(
-    const std::shared_ptr<RecordBatch>& batch, bool is_stream_format) {
+    const std::shared_ptr<RecordBatch>& batch, const IpcWriteOptions& options,
+    bool is_stream_format) {
   ARROW_ASSIGN_OR_RAISE(auto sink, io::BufferOutputStream::Create(1024));
   std::shared_ptr<RecordBatchWriter> writer;
   if (is_stream_format) {
-    ARROW_ASSIGN_OR_RAISE(writer, MakeStreamWriter(sink, batch->schema()));
+    ARROW_ASSIGN_OR_RAISE(writer, MakeStreamWriter(sink, batch->schema(), options));
   } else {
-    ARROW_ASSIGN_OR_RAISE(writer, MakeFileWriter(sink, batch->schema()));
+    ARROW_ASSIGN_OR_RAISE(writer, MakeFileWriter(sink, batch->schema(), options));
   }
   RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
   RETURN_NOT_OK(writer->Close());
@@ -119,16 +127,27 @@ Status DoMain(bool is_stream_format, const std::string& out_dir) {
     return "batch-" + std::to_string(sample_num++);
   };
 
+  // codec 0 is uncompressed
+  std::vector<std::shared_ptr<util::Codec>> codecs(3, nullptr);
+  ARROW_ASSIGN_OR_RAISE(codecs[1], util::Codec::Create(Compression::LZ4_FRAME));
+  ARROW_ASSIGN_OR_RAISE(codecs[2], util::Codec::Create(Compression::ZSTD));
+
   ARROW_ASSIGN_OR_RAISE(auto batches, Batches());
 
+  // Emit a separate file for each (batch, codec) pair
   for (const auto& batch : batches) {
     RETURN_NOT_OK(batch->ValidateFull());
-    ARROW_ASSIGN_OR_RAISE(auto buf, SerializeRecordBatch(batch, is_stream_format));
-    ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name()));
-    std::cerr << sample_fn.ToString() << std::endl;
-    ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString()));
-    RETURN_NOT_OK(file->Write(buf));
-    RETURN_NOT_OK(file->Close());
+    for (const auto& codec : codecs) {
+      IpcWriteOptions options = IpcWriteOptions::Defaults();
+      options.codec = codec;
+      ARROW_ASSIGN_OR_RAISE(auto buf,
+                            SerializeRecordBatch(batch, options, is_stream_format));
+      ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name()));
+      std::cerr << sample_fn.ToString() << std::endl;
+      ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString()));
+      RETURN_NOT_OK(file->Write(buf));
+      RETURN_NOT_OK(file->Close());
+    }
   }
   return Status::OK();
 }
@@ -157,7 +176,6 @@ int Main(int argc, char** argv) {
   return 0;
 }
 
-}  // namespace ipc
-}  // namespace arrow
+}  // namespace arrow::ipc
 
 int main(int argc, char** argv) { return arrow::ipc::Main(argc, argv); }
diff --git a/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc
index dd40ef0ab2..870f458670 100644
--- a/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc
+++ b/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc
@@ -41,7 +41,7 @@ using ::arrow::internal::PlatformFilename;
 Result<PlatformFilename> PrepareDirectory(const std::string& dir) {
   ARROW_ASSIGN_OR_RAISE(auto dir_fn, PlatformFilename::FromString(dir));
   RETURN_NOT_OK(::arrow::internal::CreateDir(dir_fn));
-  return std::move(dir_fn);
+  return dir_fn;
 }
 
 Result<std::shared_ptr<Buffer>> MakeSerializedBuffer(

Reply via email to