mapleFU opened a new issue, #35519:
URL: https://github.com/apache/arrow/issues/35519

   ### Describe the bug, including details regarding any error messages, 
version, and platform.
   
   When closing `RowGroupSerializer` in Parquet, the program segfault on:
   
   ```
   CheckRowsWritten
   - RowGroupSerializer::num_rows
   -- RowGroupSerializer::Close
   --- FileSerializer::Close
   ---- ~FileSerializer
   ``` 
   
   And after some analyze, I found that the bug is due to exception handling. 
When calling `FileSerializer::AppendRowGroup`, the `RowGroupSerializer::Close` 
is called. So, it goes to loop:
   
   ```c++
     void Close() override {
       if (!closed_) {
         closed_ = true;
         CheckRowsWritten();
   
         for (size_t i = 0; i < column_writers_.size(); i++) {
           if (column_writers_[i]) {
             total_bytes_written_ += column_writers_[i]->Close();
             total_compressed_bytes_written_ +=
                 column_writers_[i]->total_compressed_bytes_written();
             column_writers_[i].reset();
           }
         }
   
         column_writers_.clear();
   
         // Ensures all columns have been written
         metadata_->set_num_rows(num_rows_);
         metadata_->Finish(total_bytes_written_, row_group_ordinal_);
       }
     }
   ```
   
   When calling `column_writers_[i]->Close()` in `buffered mode`, exception is 
thrown. So, assume there are 3 columns, and throw exception on the second one, 
`column_writers_` would be:
   
   ```
   nullptr
   non-null
   non-null
   ```
   
   Thats ok, when exception propagated, it will call `FileSerializer::Close`:
   
   ```c++
     void Close() override {
       if (is_open_) {
         // If any functions here raise an exception, we set is_open_ to be 
false
         // so that this does not get called again (possibly causing segfault)
         is_open_ = false;
         if (row_group_writer_) {
           num_rows_ += row_group_writer_->num_rows();
           row_group_writer_->Close();
         }
         row_group_writer_.reset();
   
         WritePageIndex();
   
         // Write magic bytes and metadata
         auto file_encryption_properties = 
properties_->file_encryption_properties();
   
         if (file_encryption_properties == nullptr) {  // Non encrypted file.
           file_metadata_ = metadata_->Finish(key_value_metadata_);
           WriteFileMetaData(*file_metadata_, sink_.get());
         } else {  // Encrypted file
           CloseEncryptedFile(file_encryption_properties);
         }
       }
     }
   ```
   
   The code looks good, however, this time, `row_group_writer_ != nullptr`, so 
`row_group_writer_->num_rows()` is called:
   
   ```c++
     int64_t num_rows() const override {
       CheckRowsWritten();
       // CheckRowsWritten ensures num_rows_ is set correctly
       return num_rows_;
     }
   
     void CheckRowsWritten() const {
       // verify when only one column is written at a time
       if (!buffered_row_group_ && column_writers_.size() > 0 && 
column_writers_[0]) {
         int64_t current_col_rows = column_writers_[0]->rows_written();
         if (num_rows_ == 0) {
           num_rows_ = current_col_rows;
         } else if (num_rows_ != current_col_rows) {
           ThrowRowsMisMatchError(next_column_index_, current_col_rows, 
num_rows_);
         }
       } else if (buffered_row_group_ &&
                  column_writers_.size() > 0) {  // when buffered_row_group = 
true
         int64_t current_col_rows = column_writers_[0]->rows_written();
         for (int i = 1; i < static_cast<int>(column_writers_.size()); i++) {
           int64_t current_col_rows_i = column_writers_[i]->rows_written();
           if (current_col_rows != current_col_rows_i) {
             ThrowRowsMisMatchError(i, current_col_rows_i, current_col_rows);
           }
         }
         num_rows_ = current_col_rows;
       }
     }
   ```
   
   This time, it will call `column_writers_[i]->...`, which is a nullptr call, 
causing segfault.
   
   ### Component(s)
   
   C++, Parquet


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to