This is an automated email from the ASF dual-hosted git repository.

maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new e4f31462db GH-41317: [C++] Fix crash on invalid Parquet file (#41366)
e4f31462db is described below

commit e4f31462dbd668c3bcb6ce96442f3c1632c4d8c8
Author: Even Rouault <[email protected]>
AuthorDate: Tue Apr 30 06:38:40 2024 +0200

    GH-41317: [C++] Fix crash on invalid Parquet file (#41366)
    
    ### Rationale for this change
    
    Fixes the crash detailed in #41317 in TableBatchReader::ReadNext() on a 
corrupted Parquet file
    
    ### What changes are included in this PR?
    
    Add validation that all read columns have the same size.
    
    ### Are these changes tested?
    
    I've tested on the reproducer I provided in #41317 that it now triggers a 
clean error:
    ```
    Traceback (most recent call last):
      File "test.py", line 3, in <module>
        [_ for _ in parquet_file.iter_batches()]
      File "test.py", line 3, in <listcomp>
        [_ for _ in parquet_file.iter_batches()]
      File "pyarrow/_parquet.pyx", line 1587, in iter_batches
      File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
    pyarrow.lib.ArrowInvalid: columns do not have the same size
    ```
    I'm not sure if/how unit tests for corrupted datasets should be added.
    
    ### Are there any user-facing changes?
    
    No
    
    **This PR contains a "Critical Fix".**
    
    * GitHub Issue: #41317
    
    Authored-by: Even Rouault <[email protected]>
    Signed-off-by: mwish <[email protected]>
---
 cpp/src/arrow/table.cc          |  2 ++
 cpp/src/arrow/table.h           |  2 ++
 cpp/src/parquet/arrow/reader.cc | 10 ++++++++++
 3 files changed, 14 insertions(+)

diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 967e78f6b4..5dc5e4c1a9 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -619,6 +619,7 @@ TableBatchReader::TableBatchReader(const Table& table)
   for (int i = 0; i < table.num_columns(); ++i) {
     column_data_[i] = table.column(i).get();
   }
+  DCHECK(table_.Validate().ok());
 }
 
 TableBatchReader::TableBatchReader(std::shared_ptr<Table> table)
@@ -632,6 +633,7 @@ TableBatchReader::TableBatchReader(std::shared_ptr<Table> 
table)
   for (int i = 0; i < owned_table_->num_columns(); ++i) {
     column_data_[i] = owned_table_->column(i).get();
   }
+  DCHECK(table_.Validate().ok());
 }
 
 std::shared_ptr<Schema> TableBatchReader::schema() const { return 
table_.schema(); }
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index a7508430c1..79675fa92b 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -241,6 +241,8 @@ class ARROW_EXPORT Table {
 ///
 /// The conversion is zero-copy: each record batch is a view over a slice
 /// of the table's columns.
+///
+/// The table is expected to be valid prior to using it with the batch reader.
 class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
  public:
   /// \brief Construct a TableBatchReader for the given table
diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index d6ad7c25bc..285e2a5973 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -1043,6 +1043,16 @@ Status FileReaderImpl::GetRecordBatchReader(const 
std::vector<int>& row_groups,
           }
         }
 
+        // Check all columns has same row-size
+        if (!columns.empty()) {
+          int64_t row_size = columns[0]->length();
+          for (size_t i = 1; i < columns.size(); ++i) {
+            if (columns[i]->length() != row_size) {
+              return ::arrow::Status::Invalid("columns do not have the same 
size");
+            }
+          }
+        }
+
         auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
         auto table_reader = 
std::make_shared<::arrow::TableBatchReader>(*table);
 

Reply via email to