This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 400a6d9fec GH-49927: [Python][Parquet] Expose bloom_filter_offset and
bloom_filter_length to Python in column chunk metadata (#49926)
400a6d9fec is described below
commit 400a6d9fecacaf514798ef103952b86ff90659de
Author: Haziq Hakimi Mazlisham <[email protected]>
AuthorDate: Mon May 11 18:19:04 2026 +0800
GH-49927: [Python][Parquet] Expose bloom_filter_offset and
bloom_filter_length to Python in column chunk metadata (#49926)
### Rationale for this change
The `ColumnChunkMetaData.to_dict()` method omits `bloom_filter_offset` and
`bloom_filter_length` even when a bloom filter is written to the Parquet file.
Users cannot programmatically verify bloom filter presence via the Python
metadata API without resorting to file size comparison.
### What changes are included in this PR?
1. `python/pyarrow/includes/libparquet.pxd`: Declare
`bloom_filter_offset()` and `bloom_filter_length()` (both optional[int64_t]) on
`CColumnChunkMetaData`. This is to expose the existing C++ methods to Cython.
2. `python/pyarrow/_parquet.pyx`: Add `bloom_filter_offset` and
`bloom_filter_length` properties to `ColumnChunkMetaData` (returns int when
set, None otherwise). Add both fields to `to_dict()` and `__repr__`.
3. `python/pyarrow/tests/parquet/test_metadata.py`: Add
`test_bloom_filter_offset_in_metadata` verifying that columns with a bloom
filter expose non-None integer values and that `to_dict()` contains the keys,
while columns without a bloom filter return None.
### Are these changes tested?
Yes. `test_bloom_filter_offset_in_metadata` in test_metadata.py covers:
- Column with bloom filter: bloom_filter_offset and bloom_filter_length are
non-None integers
- Column without bloom filter: both return None
- Both keys present in to_dict() output
<img width="863" height="215" alt="image"
src="https://github.com/user-attachments/assets/d465d6bd-55d1-4c5f-9f11-6a60b3bf1cbe"
/>
Here is a closer look at the logic output:
<img width="464" height="424" alt="image"
src="https://github.com/user-attachments/assets/6e4810f2-c1c0-41ea-b559-00f99d42e2c4"
/>
output:
```python
col_a bloom_filter_offset: 10699
col_a bloom_filter_length: 1040
col_b bloom_filter_offset: None
col_b bloom_filter_length: None
col_a to_dict(): {'file_offset': 0, 'file_path': '', 'physical_type':
'BYTE_ARRAY', 'num_values': 1000, 'path_in_schema': 'a', 'is_stats_set': True,
'statistics': {'has_min_max': True, 'min': 'id_0', 'max': 'id_999',
'null_count': 0, 'distinct_count': None, 'num_values': 1000, 'physical_type':
'BYTE_ARRAY'}, 'geo_statistics': None, 'compression': 'SNAPPY', 'encodings':
('PLAIN', 'RLE', 'RLE_DICTIONARY'), 'has_dictionary_page': True,
'dictionary_page_offset': 4, 'data_page_offset': 403 [...]
```
* GitHub Issue: #49927
Lead-authored-by: Haziq Hakimi Mazlisham
<[email protected]>
Co-authored-by: Raúl Cumplido <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
docs/source/python/parquet.rst | 2 ++
python/pyarrow/_parquet.pyx | 24 ++++++++++++++++--
python/pyarrow/includes/libparquet.pxd | 2 ++
python/pyarrow/tests/parquet/test_metadata.py | 35 +++++++++++++++++++++++++++
4 files changed, 61 insertions(+), 2 deletions(-)
diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst
index 2c42d97f98..2c6ed0d12d 100644
--- a/docs/source/python/parquet.rst
+++ b/docs/source/python/parquet.rst
@@ -332,6 +332,8 @@ such as the row groups and column chunk metadata and statistics:
data_page_offset: 36
total_compressed_size: 106
total_uncompressed_size: 102
+ bloom_filter_offset: None
+ bloom_filter_length: None
Data Type Handling
------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 1448d90ec5..2358a961eb 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -475,7 +475,9 @@ cdef class ColumnChunkMetaData(_Weakrefable):
dictionary_page_offset: {self.dictionary_page_offset}
data_page_offset: {self.data_page_offset}
total_compressed_size: {self.total_compressed_size}
- total_uncompressed_size: {self.total_uncompressed_size}"""
+ total_uncompressed_size: {self.total_uncompressed_size}
+ bloom_filter_offset: {self.bloom_filter_offset}
+ bloom_filter_length: {self.bloom_filter_length}"""
def to_dict(self):
"""
@@ -507,7 +509,9 @@ cdef class ColumnChunkMetaData(_Weakrefable):
dictionary_page_offset=self.dictionary_page_offset,
data_page_offset=self.data_page_offset,
total_compressed_size=self.total_compressed_size,
- total_uncompressed_size=self.total_uncompressed_size
+ total_uncompressed_size=self.total_uncompressed_size,
+ bloom_filter_offset=self.bloom_filter_offset,
+ bloom_filter_length=self.bloom_filter_length,
)
return d
@@ -645,6 +649,22 @@ cdef class ColumnChunkMetaData(_Weakrefable):
"""Uncompressed size in bytes (int)."""
return self.metadata.total_uncompressed_size()
+ @property
+ def bloom_filter_offset(self):
+ """Offset of bloom filter relative to beginning of the file (int or None)."""
+ cdef optional[int64_t] offset = self.metadata.bloom_filter_offset()
+ if offset.has_value():
+ return offset.value()
+ return None
+
+ @property
+ def bloom_filter_length(self):
+ """Length of bloom filter in bytes (int or None)."""
+ cdef optional[int64_t] length = self.metadata.bloom_filter_length()
+ if length.has_value():
+ return length.value()
+ return None
+
@property
def has_offset_index(self):
"""Whether the column chunk has an offset index"""
diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd
index bbbac67c02..a834bd5dfa 100644
--- a/python/pyarrow/includes/libparquet.pxd
+++ b/python/pyarrow/includes/libparquet.pxd
@@ -370,6 +370,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
int64_t index_page_offset() const
int64_t total_compressed_size() const
int64_t total_uncompressed_size() const
+ optional[int64_t] bloom_filter_offset() const
+ optional[int64_t] bloom_filter_length() const
unique_ptr[CColumnCryptoMetaData] crypto_metadata() const
optional[ParquetIndexLocation] GetColumnIndexLocation() const
optional[ParquetIndexLocation] GetOffsetIndexLocation() const
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index 148bfebaa6..857024da22 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -796,6 +796,41 @@ def test_column_chunk_key_value_metadata(parquet_test_datadir):
assert key_value_metadata2 is None
+def test_bloom_filter_offset_in_metadata():
+ # ColumnChunkMetaData.to_dict() when a bloom filter is written.
+ table = pa.table({"a": [f"id_{i}" for i in range(1000)],
+ "b": list(range(1000))})
+
+ buf = pa.BufferOutputStream()
+ pq.write_table(
+ table,
+ buf,
+ bloom_filter_options={"a": {"ndv": 1000}} # apply bloom filter on col a
+ )
+ metadata = pq.read_metadata(pa.BufferReader(buf.getvalue()))
+
+ col_a = metadata.row_group(0).column(0) # bloom filter written
+ col_b = metadata.row_group(0).column(1) # no bloom filter
+
+ assert col_a.bloom_filter_offset is not None
+ assert isinstance(col_a.bloom_filter_offset, int)
+ assert col_a.bloom_filter_length is not None
+ assert isinstance(col_a.bloom_filter_length, int)
+
+ assert col_b.bloom_filter_offset is None
+ assert col_b.bloom_filter_length is None
+
+ d = col_a.to_dict()
+ assert "bloom_filter_offset" in d
+ assert "bloom_filter_length" in d
+ assert d["bloom_filter_offset"] == col_a.bloom_filter_offset
+ assert d["bloom_filter_length"] == col_a.bloom_filter_length
+
+ d_no_bloom = col_b.to_dict()
+ assert d_no_bloom["bloom_filter_offset"] is None
+ assert d_no_bloom["bloom_filter_length"] is None
+
+
def test_internal_class_instantiation():
def msg(c):
return f"Do not call {c}'s constructor directly"