This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new e4f31462db GH-41317: [C++] Fix crash on invalid Parquet file (#41366)
e4f31462db is described below
commit e4f31462dbd668c3bcb6ce96442f3c1632c4d8c8
Author: Even Rouault <[email protected]>
AuthorDate: Tue Apr 30 06:38:40 2024 +0200
GH-41317: [C++] Fix crash on invalid Parquet file (#41366)
### Rationale for this change
Fixes the crash detailed in #41317 in TableBatchReader::ReadNext() on a
corrupted Parquet file
### What changes are included in this PR?
Add a validation that all read columns have the same size
### Are these changes tested?
I've tested on the reproducer I provided in #41317 that it now triggers a
clean error:
```
Traceback (most recent call last):
File "test.py", line 3, in <module>
[_ for _ in parquet_file.iter_batches()]
File "test.py", line 3, in <listcomp>
[_ for _ in parquet_file.iter_batches()]
File "pyarrow/_parquet.pyx", line 1587, in iter_batches
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: columns do not have the same size
```
I'm not sure if/how unit tests for corrupted datasets should be added
### Are there any user-facing changes?
No
**This PR contains a "Critical Fix".**
* GitHub Issue: #41317
Authored-by: Even Rouault <[email protected]>
Signed-off-by: mwish <[email protected]>
---
cpp/src/arrow/table.cc | 2 ++
cpp/src/arrow/table.h | 2 ++
cpp/src/parquet/arrow/reader.cc | 10 ++++++++++
3 files changed, 14 insertions(+)
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 967e78f6b4..5dc5e4c1a9 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -619,6 +619,7 @@ TableBatchReader::TableBatchReader(const Table& table)
for (int i = 0; i < table.num_columns(); ++i) {
column_data_[i] = table.column(i).get();
}
+ DCHECK(table_.Validate().ok());
}
TableBatchReader::TableBatchReader(std::shared_ptr<Table> table)
@@ -632,6 +633,7 @@ TableBatchReader::TableBatchReader(std::shared_ptr<Table> table)
for (int i = 0; i < owned_table_->num_columns(); ++i) {
column_data_[i] = owned_table_->column(i).get();
}
+ DCHECK(table_.Validate().ok());
}
std::shared_ptr<Schema> TableBatchReader::schema() const { return table_.schema(); }
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index a7508430c1..79675fa92b 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -241,6 +241,8 @@ class ARROW_EXPORT Table {
///
/// The conversion is zero-copy: each record batch is a view over a slice
/// of the table's columns.
+///
+/// The table is expected to be valid prior to using it with the batch reader.
class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
public:
/// \brief Construct a TableBatchReader for the given table
diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index d6ad7c25bc..285e2a5973 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -1043,6 +1043,16 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
}
}
+ // Check all columns have the same row size
+ if (!columns.empty()) {
+ int64_t row_size = columns[0]->length();
+ for (size_t i = 1; i < columns.size(); ++i) {
+ if (columns[i]->length() != row_size) {
return ::arrow::Status::Invalid("columns do not have the same size");
+ }
+ }
+ }
+
auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
auto table_reader =
std::make_shared<::arrow::TableBatchReader>(*table);