This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f518d6beb0 GH-38041: [C++][CI] Improve IPC fuzzing seed corpus (#43621)
f518d6beb0 is described below
commit f518d6beb0c70f00688d08a3e70deff0d3c24c86
Author: Antoine Pitrou <[email protected]>
AuthorDate: Thu Aug 15 10:41:08 2024 +0200
GH-38041: [C++][CI] Improve IPC fuzzing seed corpus (#43621)
1. Add fuzz seeds with newer datatypes such as Run-End Encoded and String
Views
2. Add fuzz seeds with buffer compression
3. Build seed corpus generation utilities even when fuzzing isn't enabled,
for convenience
* GitHub Issue: #38041
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/ipc/CMakeLists.txt | 7 +++-
cpp/src/arrow/ipc/generate_fuzz_corpus.cc | 44 +++++++++++++++++-------
cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc | 2 +-
3 files changed, 38 insertions(+), 15 deletions(-)
diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt
index 2fc9b145cc..9e0b1d723b 100644
--- a/cpp/src/arrow/ipc/CMakeLists.txt
+++ b/cpp/src/arrow/ipc/CMakeLists.txt
@@ -71,7 +71,12 @@ endif()
add_arrow_benchmark(read_write_benchmark PREFIX "arrow-ipc")
-if(ARROW_FUZZING)
+if(ARROW_FUZZING
+ OR (ARROW_BUILD_UTILITIES
+ AND ARROW_TESTING
+ AND ARROW_WITH_LZ4
+ AND ARROW_WITH_ZSTD
+ ))
add_executable(arrow-ipc-generate-fuzz-corpus generate_fuzz_corpus.cc)
target_link_libraries(arrow-ipc-generate-fuzz-corpus ${ARROW_UTIL_LIB}
${ARROW_TEST_LINK_LIBS})
diff --git a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc
index 682c352132..6ccf1155d1 100644
--- a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc
+++ b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc
@@ -33,11 +33,11 @@
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/testing/extension_type.h"
+#include "arrow/util/compression.h"
#include "arrow/util/io_util.h"
#include "arrow/util/key_value_metadata.h"
-namespace arrow {
-namespace ipc {
+namespace arrow::ipc {
using ::arrow::internal::CreateDir;
using ::arrow::internal::PlatformFilename;
@@ -88,6 +88,13 @@ Result<std::vector<std::shared_ptr<RecordBatch>>> Batches() {
batches.push_back(batch);
RETURN_NOT_OK(test::MakeFixedSizeListRecordBatch(&batch));
batches.push_back(batch);
+ RETURN_NOT_OK(test::MakeStringTypesRecordBatch(&batch));
+ batches.push_back(batch);
+ RETURN_NOT_OK(test::MakeUuid(&batch));
+ batches.push_back(batch);
+ RETURN_NOT_OK(test::MakeRunEndEncoded(&batch));
+ batches.push_back(batch);
+
ARROW_ASSIGN_OR_RAISE(batch, MakeExtensionBatch());
batches.push_back(batch);
ARROW_ASSIGN_OR_RAISE(batch, MakeMapBatch());
@@ -97,13 +104,14 @@ Result<std::vector<std::shared_ptr<RecordBatch>>> Batches() {
}
Result<std::shared_ptr<Buffer>> SerializeRecordBatch(
- const std::shared_ptr<RecordBatch>& batch, bool is_stream_format) {
+ const std::shared_ptr<RecordBatch>& batch, const IpcWriteOptions& options,
+ bool is_stream_format) {
ARROW_ASSIGN_OR_RAISE(auto sink, io::BufferOutputStream::Create(1024));
std::shared_ptr<RecordBatchWriter> writer;
if (is_stream_format) {
- ARROW_ASSIGN_OR_RAISE(writer, MakeStreamWriter(sink, batch->schema()));
+ ARROW_ASSIGN_OR_RAISE(writer, MakeStreamWriter(sink, batch->schema(), options));
} else {
- ARROW_ASSIGN_OR_RAISE(writer, MakeFileWriter(sink, batch->schema()));
+ ARROW_ASSIGN_OR_RAISE(writer, MakeFileWriter(sink, batch->schema(), options));
}
RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
RETURN_NOT_OK(writer->Close());
@@ -119,16 +127,27 @@ Status DoMain(bool is_stream_format, const std::string& out_dir) {
return "batch-" + std::to_string(sample_num++);
};
+ // codec 0 is uncompressed
+ std::vector<std::shared_ptr<util::Codec>> codecs(3, nullptr);
+ ARROW_ASSIGN_OR_RAISE(codecs[1], util::Codec::Create(Compression::LZ4_FRAME));
+ ARROW_ASSIGN_OR_RAISE(codecs[2], util::Codec::Create(Compression::ZSTD));
+
ARROW_ASSIGN_OR_RAISE(auto batches, Batches());
+ // Emit a separate file for each (batch, codec) pair
for (const auto& batch : batches) {
RETURN_NOT_OK(batch->ValidateFull());
- ARROW_ASSIGN_OR_RAISE(auto buf, SerializeRecordBatch(batch, is_stream_format));
- ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name()));
- std::cerr << sample_fn.ToString() << std::endl;
- ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString()));
- RETURN_NOT_OK(file->Write(buf));
- RETURN_NOT_OK(file->Close());
+ for (const auto& codec : codecs) {
+ IpcWriteOptions options = IpcWriteOptions::Defaults();
+ options.codec = codec;
+ ARROW_ASSIGN_OR_RAISE(auto buf,
+ SerializeRecordBatch(batch, options, is_stream_format));
+ ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name()));
+ std::cerr << sample_fn.ToString() << std::endl;
+ ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString()));
+ RETURN_NOT_OK(file->Write(buf));
+ RETURN_NOT_OK(file->Close());
+ }
}
return Status::OK();
}
@@ -157,7 +176,6 @@ int Main(int argc, char** argv) {
return 0;
}
-} // namespace ipc
-} // namespace arrow
+} // namespace arrow::ipc
int main(int argc, char** argv) { return arrow::ipc::Main(argc, argv); }
diff --git a/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc
index dd40ef0ab2..870f458670 100644
--- a/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc
+++ b/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc
@@ -41,7 +41,7 @@ using ::arrow::internal::PlatformFilename;
Result<PlatformFilename> PrepareDirectory(const std::string& dir) {
ARROW_ASSIGN_OR_RAISE(auto dir_fn, PlatformFilename::FromString(dir));
RETURN_NOT_OK(::arrow::internal::CreateDir(dir_fn));
- return std::move(dir_fn);
+ return dir_fn;
}
Result<std::shared_ptr<Buffer>> MakeSerializedBuffer(