This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 419ca89dbb GH-36628: [Python][Parquet] Fail when instantiating
internal Parquet metadata classes (#45549)
419ca89dbb is described below
commit 419ca89dbb3df12dd9b3f1caae736d489bc2de2a
Author: Tien Nguyen <[email protected]>
AuthorDate: Mon Mar 10 11:38:16 2025 -0400
GH-36628: [Python][Parquet] Fail when instantiating internal Parquet
metadata classes (#45549)
### Rationale for this change
As described in https://github.com/apache/arrow/issues/36628, we want to
raise an exception instead of showing segfaults when users try to instantiate
some internal Parquet metadata classes.
### What changes are included in this PR?
This PR is a carry-on of https://github.com/apache/arrow/pull/44245. I
modified a few things and covered more test cases. This PR includes these
changes:
* Raise an exception when users try to instantiate `Statistics`,
`ParquetLogicalType`, `ColumnChunkMetaData`, `RowGroupMetaData`, `FileMetaData`
### Are these changes tested?
Yes, added unit tests
### Are there any user-facing changes?
Yes, after this change, users can't instantiate these classes anymore using
the regular `__init__` workflow
* GitHub Issue: #36628
Authored-by: Tim Nguyen <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/_dataset_parquet.pyx | 4 +--
python/pyarrow/_parquet.pxd | 9 ++++++
python/pyarrow/_parquet.pyx | 40 ++++++++++++++++++---------
python/pyarrow/tests/parquet/test_metadata.py | 20 ++++++++++++++
4 files changed, 58 insertions(+), 15 deletions(-)
diff --git a/python/pyarrow/_dataset_parquet.pyx
b/python/pyarrow/_dataset_parquet.pyx
index 863c928591..45db755d90 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -164,7 +164,7 @@ cdef class ParquetFileFormat(FileFormat):
metadata = deref(
deref(parquet_file_writer).parquet_writer()).metadata()
if metadata:
- parquet_metadata = FileMetaData()
+ parquet_metadata = FileMetaData.__new__(FileMetaData)
parquet_metadata.init(metadata)
parquet_metadata.set_file_path(os.path.relpath(path, base_dir))
@@ -390,7 +390,7 @@ cdef class ParquetFileFragment(FileFragment):
@property
def metadata(self):
self.ensure_complete_metadata()
- cdef FileMetaData metadata = FileMetaData()
+ cdef FileMetaData metadata = FileMetaData.__new__(FileMetaData)
metadata.init(self.parquet_file_fragment.metadata())
return metadata
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index c17c3b70d7..1e3c89e4e7 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -631,6 +631,15 @@ cdef class RowGroupMetaData(_Weakrefable):
CRowGroupMetaData* metadata
FileMetaData parent
+ cdef inline init(self, FileMetaData parent, int index):
+ if index < 0 or index >= parent.num_row_groups:
+ raise IndexError('{0} out of bounds'.format(index))
+ self.up_metadata = parent._metadata.RowGroup(index)
+ self.metadata = self.up_metadata.get()
+ self.parent = parent
+ self.index = index
+
+
cdef class ColumnChunkMetaData(_Weakrefable):
cdef:
unique_ptr[CColumnChunkMetaData] up_metadata
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 2fb1e41641..ecc4e57091 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -52,6 +52,9 @@ _MAX_ROW_GROUP_SIZE = 64*1024*1024
cdef class Statistics(_Weakrefable):
"""Statistics for a single column in a single row group."""
+ def __init__(self):
+ raise TypeError(f"Do not call {self.__class__.__name__}'s constructor
directly")
+
def __cinit__(self):
pass
@@ -217,6 +220,10 @@ cdef class Statistics(_Weakrefable):
cdef class ParquetLogicalType(_Weakrefable):
"""Logical type of parquet type."""
+
+ def __init__(self):
+ raise TypeError(f"Do not call {self.__class__.__name__}'s constructor
directly")
+
cdef:
shared_ptr[const CParquetLogicalType] type
@@ -253,7 +260,7 @@ cdef class ParquetLogicalType(_Weakrefable):
cdef wrap_logical_type(const shared_ptr[const CParquetLogicalType]& type):
- cdef ParquetLogicalType out = ParquetLogicalType()
+ cdef ParquetLogicalType out =
ParquetLogicalType.__new__(ParquetLogicalType)
out.init(type)
return out
@@ -315,6 +322,9 @@ cdef _box_flba(ParquetFLBA val, uint32_t len):
cdef class ColumnChunkMetaData(_Weakrefable):
"""Column metadata for a single row group."""
+ def __init__(self):
+ raise TypeError(f"Do not call {self.__class__.__name__}'s constructor
directly")
+
def __cinit__(self):
pass
@@ -436,7 +446,7 @@ cdef class ColumnChunkMetaData(_Weakrefable):
"""Statistics for column chunk (:class:`Statistics`)."""
if not self.metadata.is_stats_set():
return None
- statistics = Statistics()
+ cdef Statistics statistics = Statistics.__new__(Statistics)
statistics.init(self.metadata.statistics(), self)
return statistics
@@ -739,13 +749,11 @@ cdef class SortingColumn:
cdef class RowGroupMetaData(_Weakrefable):
"""Metadata for a single row group."""
- def __cinit__(self, FileMetaData parent, int index):
- if index < 0 or index >= parent.num_row_groups:
- raise IndexError('{0} out of bounds'.format(index))
- self.up_metadata = parent._metadata.RowGroup(index)
- self.metadata = self.up_metadata.get()
- self.parent = parent
- self.index = index
+ def __init__(self):
+ raise TypeError(f"Do not call {self.__class__.__name__}'s constructor
directly")
+
+ def __cinit__(self):
+ pass
def __reduce__(self):
return RowGroupMetaData, (self.parent, self.index)
@@ -787,7 +795,7 @@ cdef class RowGroupMetaData(_Weakrefable):
"""
if i < 0 or i >= self.num_columns:
raise IndexError('{0} out of bounds'.format(i))
- chunk = ColumnChunkMetaData()
+ cdef ColumnChunkMetaData chunk =
ColumnChunkMetaData.__new__(ColumnChunkMetaData)
chunk.init(self, i)
return chunk
@@ -866,6 +874,9 @@ def _reconstruct_filemetadata(Buffer serialized):
cdef class FileMetaData(_Weakrefable):
"""Parquet metadata for a single file."""
+ def __init__(self):
+ raise TypeError(f"Do not call {self.__class__.__name__}'s constructor
directly")
+
def __cinit__(self):
pass
@@ -1027,7 +1038,9 @@ cdef class FileMetaData(_Weakrefable):
-------
row_group_metadata : RowGroupMetaData
"""
- return RowGroupMetaData(self, i)
+ cdef RowGroupMetaData row_group =
RowGroupMetaData.__new__(RowGroupMetaData)
+ row_group.init(self, i)
+ return row_group
def set_file_path(self, path):
"""
@@ -1516,7 +1529,8 @@ cdef class ParquetReader(_Weakrefable):
# Set up metadata
with nogil:
c_metadata = builder.raw_reader().metadata()
- self._metadata = result = FileMetaData()
+ cdef FileMetaData result = FileMetaData.__new__(FileMetaData)
+ self._metadata = result
result.init(c_metadata)
if read_dictionary is not None:
@@ -2259,7 +2273,7 @@ cdef class ParquetWriter(_Weakrefable):
with nogil:
metadata = self.writer.get().metadata()
if metadata:
- result = FileMetaData()
+ result = FileMetaData.__new__(FileMetaData)
result.init(metadata)
return result
raise RuntimeError(
diff --git a/python/pyarrow/tests/parquet/test_metadata.py
b/python/pyarrow/tests/parquet/test_metadata.py
index 14ce9bbfcd..cf17f830f2 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -794,3 +794,23 @@ def
test_column_chunk_key_value_metadata(parquet_test_datadir):
assert key_value_metadata1 == {b'foo': b'bar', b'thisiskeywithoutvalue':
b''}
key_value_metadata2 = metadata.row_group(0).column(1).metadata
assert key_value_metadata2 is None
+
+
+def test_internal_class_instantiation():
+ def msg(c):
+ return f"Do not call {c}'s constructor directly"
+
+ with pytest.raises(TypeError, match=msg("Statistics")):
+ pq.Statistics()
+
+ with pytest.raises(TypeError, match=msg("ParquetLogicalType")):
+ pq.ParquetLogicalType()
+
+ with pytest.raises(TypeError, match=msg("ColumnChunkMetaData")):
+ pq.ColumnChunkMetaData()
+
+ with pytest.raises(TypeError, match=msg("RowGroupMetaData")):
+ pq.RowGroupMetaData()
+
+ with pytest.raises(TypeError, match=msg("FileMetaData")):
+ pq.FileMetaData()