This is an automated email from the ASF dual-hosted git repository.
emkornfield pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new fcf9dd6aa5 GH-49385: Clarify empty schema contract on stream_reader
(#49386)
fcf9dd6aa5 is described below
commit fcf9dd6aa50731c2b7d83e495ecbc04b002968d6
Author: emkornfield <[email protected]>
AuthorDate: Wed Mar 4 11:14:40 2026 -0800
GH-49385: Clarify empty schema contract on stream_reader (#49386)
### Rationale for this change
StreamReader inherently does not support empty schemas. Guard this case
with an exception.
### What changes are included in this PR?
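Added validation of the Parquet reader passed to the `StreamReader` constructor: a reader whose schema has no columns is now rejected with a `ParquetException`.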
### Are these changes tested?
Yes, a unit test was added (`StreamReaderEmptySchema.ThrowsOnConstruction`).
### Are there any user-facing changes?
One potentially debatable change: the constructor for this class can
now throw. However, it was never marked `noexcept`.
**This PR contains a "Critical Fix".**
* GitHub Issue: #49385
---
cpp/src/parquet/stream_reader.cc | 3 +++
cpp/src/parquet/stream_reader.h | 1 +
cpp/src/parquet/stream_reader_test.cc | 18 ++++++++++++++++++
3 files changed, 22 insertions(+)
diff --git a/cpp/src/parquet/stream_reader.cc b/cpp/src/parquet/stream_reader.cc
index d3353aa334..6a8dfa8f63 100644
--- a/cpp/src/parquet/stream_reader.cc
+++ b/cpp/src/parquet/stream_reader.cc
@@ -50,6 +50,9 @@ StreamReader::StreamReader(std::unique_ptr<ParquetFileReader>
reader)
auto schema = file_metadata_->schema();
auto group_node = schema->group_node();
+ if (schema->num_columns() == 0) {
+ throw ParquetException("StreamReader does not support empty schemas.");
+ }
nodes_.resize(schema->num_columns());
for (auto i = 0; i < schema->num_columns(); ++i) {
diff --git a/cpp/src/parquet/stream_reader.h b/cpp/src/parquet/stream_reader.h
index a7dadac92c..a5f6e534d8 100644
--- a/cpp/src/parquet/stream_reader.h
+++ b/cpp/src/parquet/stream_reader.h
@@ -65,6 +65,7 @@ class PARQUET_EXPORT StreamReader {
// assigned afterwards.
StreamReader() = default;
+ /// Reader must have at least one field defined in its schema.
explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);
~StreamReader() = default;
diff --git a/cpp/src/parquet/stream_reader_test.cc
b/cpp/src/parquet/stream_reader_test.cc
index 04140f6ad0..8db21fb9e8 100644
--- a/cpp/src/parquet/stream_reader_test.cc
+++ b/cpp/src/parquet/stream_reader_test.cc
@@ -24,8 +24,10 @@
#include <memory>
#include "arrow/io/file.h"
+#include "arrow/io/memory.h"
#include "arrow/util/decimal.h"
#include "parquet/exception.h"
+#include "parquet/file_writer.h"
#include "parquet/test_util.h"
namespace parquet {
@@ -251,6 +253,22 @@ TEST_F(TestStreamReader, DefaultConstructed) {
EXPECT_EQ(0, os.SkipRows(100));
}
+TEST(StreamReaderEmptySchema, ThrowsOnConstruction) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer_os,
::arrow::io::BufferOutputStream::Create());
+ // Root group node built from an empty NodeVector, i.e. a schema with no fields.
+ auto empty_schema = std::static_pointer_cast<schema::GroupNode>(
+ schema::GroupNode::Make("schema", Repetition::REQUIRED,
schema::NodeVector{}));
+ // Open a writer on the in-memory stream and close it without writing any rows.
+ auto file_writer = ParquetFileWriter::Open(buffer_os, empty_schema);
+ file_writer->Close();
+ // Re-open the bytes just written through an in-memory reader.
+ PARQUET_ASSIGN_OR_THROW(auto buffer, buffer_os->Finish());
+ auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(buffer);
+ auto file_reader = ParquetFileReader::Open(buffer_reader);
+ // The StreamReader constructor is expected to reject the empty schema.
+ EXPECT_THROW(StreamReader{std::move(file_reader)}, ParquetException);
+}
+
TEST_F(TestStreamReader, TypeChecking) {
bool b;
std::string s;