This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 419ca89dbb GH-36628: [Python][Parquet] Fail when instantiating 
internal Parquet metadata classes (#45549)
419ca89dbb is described below

commit 419ca89dbb3df12dd9b3f1caae736d489bc2de2a
Author: Tien Nguyen <[email protected]>
AuthorDate: Mon Mar 10 11:38:16 2025 -0400

    GH-36628: [Python][Parquet] Fail when instantiating internal Parquet 
metadata classes (#45549)
    
    ### Rationale for this change
    As described in https://github.com/apache/arrow/issues/36628, we want to 
raise an exception instead of showing segfaults when users try to instantiate 
some internal Parquet metadata classes.
    
    ### What changes are included in this PR?
    This PR is a carry-on of https://github.com/apache/arrow/pull/44245. I 
modified a few things and covered more test cases. This PR includes these 
changes:
    * Raise an exception when users try to instantiate `Statistics`, 
`ParquetLogicalType`, `ColumnChunkMetaData`, `RowGroupMetaData`, `FileMetaData`
    
    ### Are these changes tested?
    Yes, added unit tests
    
    ### Are there any user-facing changes?
    Yes, after this change, users can't instantiate these classes anymore using 
the regular `__init__` workflow
    * GitHub Issue: #36628
    
    Authored-by: Tim Nguyen <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 python/pyarrow/_dataset_parquet.pyx           |  4 +--
 python/pyarrow/_parquet.pxd                   |  9 ++++++
 python/pyarrow/_parquet.pyx                   | 40 ++++++++++++++++++---------
 python/pyarrow/tests/parquet/test_metadata.py | 20 ++++++++++++++
 4 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/python/pyarrow/_dataset_parquet.pyx 
b/python/pyarrow/_dataset_parquet.pyx
index 863c928591..45db755d90 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -164,7 +164,7 @@ cdef class ParquetFileFormat(FileFormat):
             metadata = deref(
                 deref(parquet_file_writer).parquet_writer()).metadata()
         if metadata:
-            parquet_metadata = FileMetaData()
+            parquet_metadata = FileMetaData.__new__(FileMetaData)
             parquet_metadata.init(metadata)
             parquet_metadata.set_file_path(os.path.relpath(path, base_dir))
 
@@ -390,7 +390,7 @@ cdef class ParquetFileFragment(FileFragment):
     @property
     def metadata(self):
         self.ensure_complete_metadata()
-        cdef FileMetaData metadata = FileMetaData()
+        cdef FileMetaData metadata = FileMetaData.__new__(FileMetaData)
         metadata.init(self.parquet_file_fragment.metadata())
         return metadata
 
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index c17c3b70d7..1e3c89e4e7 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -631,6 +631,15 @@ cdef class RowGroupMetaData(_Weakrefable):
         CRowGroupMetaData* metadata
         FileMetaData parent
 
+    cdef inline init(self, FileMetaData parent, int index):
+        if index < 0 or index >= parent.num_row_groups:
+            raise IndexError('{0} out of bounds'.format(index))
+        self.up_metadata = parent._metadata.RowGroup(index)
+        self.metadata = self.up_metadata.get()
+        self.parent = parent
+        self.index = index
+
+
 cdef class ColumnChunkMetaData(_Weakrefable):
     cdef:
         unique_ptr[CColumnChunkMetaData] up_metadata
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 2fb1e41641..ecc4e57091 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -52,6 +52,9 @@ _MAX_ROW_GROUP_SIZE = 64*1024*1024
 cdef class Statistics(_Weakrefable):
     """Statistics for a single column in a single row group."""
 
+    def __init__(self):
+        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor 
directly")
+
     def __cinit__(self):
         pass
 
@@ -217,6 +220,10 @@ cdef class Statistics(_Weakrefable):
 
 cdef class ParquetLogicalType(_Weakrefable):
     """Logical type of parquet type."""
+
+    def __init__(self):
+        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor 
directly")
+
     cdef:
         shared_ptr[const CParquetLogicalType] type
 
@@ -253,7 +260,7 @@ cdef class ParquetLogicalType(_Weakrefable):
 
 
 cdef wrap_logical_type(const shared_ptr[const CParquetLogicalType]& type):
-    cdef ParquetLogicalType out = ParquetLogicalType()
+    cdef ParquetLogicalType out = 
ParquetLogicalType.__new__(ParquetLogicalType)
     out.init(type)
     return out
 
@@ -315,6 +322,9 @@ cdef _box_flba(ParquetFLBA val, uint32_t len):
 cdef class ColumnChunkMetaData(_Weakrefable):
     """Column metadata for a single row group."""
 
+    def __init__(self):
+        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor 
directly")
+
     def __cinit__(self):
         pass
 
@@ -436,7 +446,7 @@ cdef class ColumnChunkMetaData(_Weakrefable):
         """Statistics for column chunk (:class:`Statistics`)."""
         if not self.metadata.is_stats_set():
             return None
-        statistics = Statistics()
+        cdef Statistics statistics = Statistics.__new__(Statistics)
         statistics.init(self.metadata.statistics(), self)
         return statistics
 
@@ -739,13 +749,11 @@ cdef class SortingColumn:
 cdef class RowGroupMetaData(_Weakrefable):
     """Metadata for a single row group."""
 
-    def __cinit__(self, FileMetaData parent, int index):
-        if index < 0 or index >= parent.num_row_groups:
-            raise IndexError('{0} out of bounds'.format(index))
-        self.up_metadata = parent._metadata.RowGroup(index)
-        self.metadata = self.up_metadata.get()
-        self.parent = parent
-        self.index = index
+    def __init__(self):
+        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor 
directly")
+
+    def __cinit__(self):
+        pass
 
     def __reduce__(self):
         return RowGroupMetaData, (self.parent, self.index)
@@ -787,7 +795,7 @@ cdef class RowGroupMetaData(_Weakrefable):
         """
         if i < 0 or i >= self.num_columns:
             raise IndexError('{0} out of bounds'.format(i))
-        chunk = ColumnChunkMetaData()
+        cdef ColumnChunkMetaData chunk = 
ColumnChunkMetaData.__new__(ColumnChunkMetaData)
         chunk.init(self, i)
         return chunk
 
@@ -866,6 +874,9 @@ def _reconstruct_filemetadata(Buffer serialized):
 cdef class FileMetaData(_Weakrefable):
     """Parquet metadata for a single file."""
 
+    def __init__(self):
+        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor 
directly")
+
     def __cinit__(self):
         pass
 
@@ -1027,7 +1038,9 @@ cdef class FileMetaData(_Weakrefable):
         -------
         row_group_metadata : RowGroupMetaData
         """
-        return RowGroupMetaData(self, i)
+        cdef RowGroupMetaData row_group = 
RowGroupMetaData.__new__(RowGroupMetaData)
+        row_group.init(self, i)
+        return row_group
 
     def set_file_path(self, path):
         """
@@ -1516,7 +1529,8 @@ cdef class ParquetReader(_Weakrefable):
         # Set up metadata
         with nogil:
             c_metadata = builder.raw_reader().metadata()
-        self._metadata = result = FileMetaData()
+        cdef FileMetaData result = FileMetaData.__new__(FileMetaData)
+        self._metadata = result
         result.init(c_metadata)
 
         if read_dictionary is not None:
@@ -2259,7 +2273,7 @@ cdef class ParquetWriter(_Weakrefable):
         with nogil:
             metadata = self.writer.get().metadata()
         if metadata:
-            result = FileMetaData()
+            result = FileMetaData.__new__(FileMetaData)
             result.init(metadata)
             return result
         raise RuntimeError(
diff --git a/python/pyarrow/tests/parquet/test_metadata.py 
b/python/pyarrow/tests/parquet/test_metadata.py
index 14ce9bbfcd..cf17f830f2 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -794,3 +794,23 @@ def 
test_column_chunk_key_value_metadata(parquet_test_datadir):
     assert key_value_metadata1 == {b'foo': b'bar', b'thisiskeywithoutvalue': 
b''}
     key_value_metadata2 = metadata.row_group(0).column(1).metadata
     assert key_value_metadata2 is None
+
+
+def test_internal_class_instantiation():
+    def msg(c):
+        return f"Do not call {c}'s constructor directly"
+
+    with pytest.raises(TypeError, match=msg("Statistics")):
+        pq.Statistics()
+
+    with pytest.raises(TypeError, match=msg("ParquetLogicalType")):
+        pq.ParquetLogicalType()
+
+    with pytest.raises(TypeError, match=msg("ColumnChunkMetaData")):
+        pq.ColumnChunkMetaData()
+
+    with pytest.raises(TypeError, match=msg("RowGroupMetaData")):
+        pq.RowGroupMetaData()
+
+    with pytest.raises(TypeError, match=msg("FileMetaData")):
+        pq.FileMetaData()

Reply via email to