This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-15.0.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 0969975dda352059eb5f830c5368db4144ebf8ed
Author: Weston Pace <[email protected]>
AuthorDate: Mon Feb 26 08:53:35 2024 -0800

    GH-40068:  [C++] Possible data race when reading metadata of a parquet file 
(#40111)
    
    ### Rationale for this change
    
    The `ParquetFileFragment` will cache the parquet metadata when loading it.  
The `metadata()` method accesses this metadata (a shared_ptr) but does not grab 
the lock used to set that shared_ptr.  It's possible then that we are reading a 
shared_ptr at the same time some other thread is setting the shared_ptr which 
is technically (I think) undefined behavior.
    
    ### What changes are included in this PR?
    
    Guard access to the metadata by grabbing the mutex first
    
    ### Are these changes tested?
    
    Existing tests should regress this change
    
    ### Are there any user-facing changes?
    
    No
    * Closes: #40068
    
    Authored-by: Weston Pace <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/dataset/file_parquet.cc | 5 +++++
 cpp/src/arrow/dataset/file_parquet.h  | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/file_parquet.cc 
b/cpp/src/arrow/dataset/file_parquet.cc
index 140917a2e6..c17ba89be7 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -779,6 +779,11 @@ ParquetFileFragment::ParquetFileFragment(FileSource source,
       parquet_format_(checked_cast<ParquetFileFormat&>(*format_)),
       row_groups_(std::move(row_groups)) {}
 
+std::shared_ptr<parquet::FileMetaData> ParquetFileFragment::metadata() {
+  auto lock = physical_schema_mutex_.Lock();
+  return metadata_;
+}
+
 Status ParquetFileFragment::EnsureCompleteMetadata(parquet::arrow::FileReader* 
reader) {
   auto lock = physical_schema_mutex_.Lock();
   if (metadata_ != nullptr) {
diff --git a/cpp/src/arrow/dataset/file_parquet.h 
b/cpp/src/arrow/dataset/file_parquet.h
index 5141f36385..63d8fd7292 100644
--- a/cpp/src/arrow/dataset/file_parquet.h
+++ b/cpp/src/arrow/dataset/file_parquet.h
@@ -165,7 +165,7 @@ class ARROW_DS_EXPORT ParquetFileFragment : public 
FileFragment {
   }
 
   /// \brief Return the FileMetaData associated with this fragment.
-  const std::shared_ptr<parquet::FileMetaData>& metadata() const { return 
metadata_; }
+  std::shared_ptr<parquet::FileMetaData> metadata();
 
   /// \brief Ensure this fragment's FileMetaData is in memory.
   Status EnsureCompleteMetadata(parquet::arrow::FileReader* reader = NULLPTR);

Reply via email to