This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new cb61dfe217 GH-48691: [C++][Parquet] Write serializer may crash if the
value buffer is empty (#48692)
cb61dfe217 is described below
commit cb61dfe217872f64d0e7839eb34ca9bcb37f2f84
Author: Rex(Hui) An <[email protected]>
AuthorDate: Tue Jan 27 17:44:07 2026 +0800
GH-48691: [C++][Parquet] Write serializer may crash if the value buffer is
empty (#48692)
### Rationale for this change
WriteArrowSerialize could unconditionally read values from the Arrow array
even for null rows. Since it's possible the caller could provide a zero-sized
dummy buffer for all-null arrays, this caused an ASAN heap-buffer-overflow.
### What changes are included in this PR?
Check early that the array is not all null values before serializing it.
### Are these changes tested?
Added tests.
### Are there any user-facing changes?
No
* GitHub Issue: #48691
Authored-by: rexan <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 29 +++++++++++++++++++++++
cpp/src/parquet/column_writer.cc | 7 +++++-
2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index edb59d9de3..29cc5678e4 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -5889,5 +5889,34 @@ TEST(TestArrowReadWrite, OperationsOnClosedWriter) {
ASSERT_RAISES(Invalid, writer->WriteTable(*table, 1));
}
+TEST(TestArrowReadWrite, AllNulls) {
+ auto schema = ::arrow::schema({::arrow::field("all_nulls",
::arrow::int8())});
+
+ constexpr int64_t length = 3;
+ ASSERT_OK_AND_ASSIGN(auto null_bitmap, ::arrow::AllocateEmptyBitmap(length));
+ auto array_data = ::arrow::ArrayData::Make(
+ ::arrow::int8(), length, {null_bitmap, /*values=*/nullptr},
/*null_count=*/length);
+ auto array = ::arrow::MakeArray(array_data);
+ auto record_batch = ::arrow::RecordBatch::Make(schema, length, {array});
+
+ auto sink = CreateOutputStream();
+ ASSERT_OK_AND_ASSIGN(auto writer, parquet::arrow::FileWriter::Open(
+ *schema,
::arrow::default_memory_pool(), sink,
+ parquet::default_writer_properties(),
+
parquet::default_arrow_writer_properties()));
+ ASSERT_OK(writer->WriteRecordBatch(*record_batch));
+ ASSERT_OK(writer->Close());
+ ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
+
+ std::shared_ptr<::arrow::Table> read_table;
+ ASSERT_OK_AND_ASSIGN(auto reader,
+
parquet::arrow::OpenFile(std::make_shared<BufferReader>(buffer),
+
::arrow::default_memory_pool()));
+ ASSERT_OK(reader->ReadTable(&read_table));
+ auto expected_table = ::arrow::Table::Make(
+ schema, {::arrow::ArrayFromJSON(::arrow::int8(), R"([null, null,
null])")});
+ ASSERT_TRUE(expected_table->Equals(*read_table));
+}
+
} // namespace arrow
} // namespace parquet
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 20b8cc98ca..797d435e73 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -2099,7 +2099,12 @@ Status
TypedColumnWriterImpl<ParquetType>::WriteArrowSerialize(
PARQUET_THROW_NOT_OK(ctx->GetScratchData<ParquetCType>(array.length(),
&buffer));
SerializeFunctor<ParquetType, ArrowType> functor;
- RETURN_NOT_OK(functor.Serialize(checked_cast<const ArrayType&>(array), ctx,
buffer));
+ // The value buffer could be empty if all values are nulls.
+ // The output buffer will then remain uninitialized, but that's ok since
+ // null value slots are not written in Parquet.
+ if (array.null_count() != array.length()) {
+ RETURN_NOT_OK(functor.Serialize(checked_cast<const ArrayType&>(array),
ctx, buffer));
+ }
bool no_nulls =
this->descr()->schema_node()->is_required() || (array.null_count() == 0);
if (!maybe_parent_nulls && no_nulls) {