This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 56d1ec1142 GH-39780: [Python][Parquet] Support hashing for
FileMetaData and ParquetSchema (#39781)
56d1ec1142 is described below
commit 56d1ec1142cd95f1c2d35805396262713fa5b29f
Author: Miles <[email protected]>
AuthorDate: Mon Feb 12 14:29:34 2024 +0100
GH-39780: [Python][Parquet] Support hashing for FileMetaData and
ParquetSchema (#39781)
I think the hash, especially for `FileMetaData` could be better, maybe just
use return of `__repr__`, even though that won't include row group info?
### Rationale for this change
Helpful for dependent projects.
### What changes are included in this PR?
Impl `__hash__` for `ParquetSchema` and `FileMetaData`
### Are these changes tested?
Yes
### Are there any user-facing changes?
Supports hashing metadata:
```python
In [1]: import pyarrow.parquet as pq
In [2]: f = pq.ParquetFile('test.parquet')
In [3]: hash(f.metadata)
Out[3]: 4816453453708427907
In [4]: hash(f.metadata.schema)
Out[4]: 2300988959078172540
```
* Closes: #39780
Authored-by: Miles Granger <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/_parquet.pyx | 10 ++++++++++
python/pyarrow/tests/parquet/test_metadata.py | 26 ++++++++++++++++++++++++++
2 files changed, 36 insertions(+)
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 0b68524565..7bc68a288a 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -849,6 +849,13 @@ cdef class FileMetaData(_Weakrefable):
cdef Buffer buffer = sink.getvalue()
return _reconstruct_filemetadata, (buffer,)
+ def __hash__(self):
+ return hash((self.schema,
+ self.num_rows,
+ self.num_row_groups,
+ self.format_version,
+ self.serialized_size))
+
def __repr__(self):
return """{0}
created_by: {1}
@@ -1071,6 +1078,9 @@ cdef class ParquetSchema(_Weakrefable):
def __getitem__(self, i):
return self.column(i)
+ def __hash__(self):
+ return hash(self.schema.ToString())
+
@property
def names(self):
"""Name of each field (list of str)."""
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index 73284d2e53..bf186bd923 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -499,6 +499,32 @@ def test_multi_dataset_metadata(tempdir):
assert md['serialized_size'] > 0
+def test_metadata_hashing(tempdir):
+ path1 = str(tempdir / "metadata1")
+ schema1 = pa.schema([("a", "int64"), ("b", "float64")])
+ pq.write_metadata(schema1, path1)
+ parquet_meta1 = pq.read_metadata(path1)
+
+ # Same as 1, just different path
+ path2 = str(tempdir / "metadata2")
+ schema2 = pa.schema([("a", "int64"), ("b", "float64")])
+ pq.write_metadata(schema2, path2)
+ parquet_meta2 = pq.read_metadata(path2)
+
+ # different schema
+ path3 = str(tempdir / "metadata3")
+ schema3 = pa.schema([("a", "int64"), ("b", "float32")])
+ pq.write_metadata(schema3, path3)
+ parquet_meta3 = pq.read_metadata(path3)
+
+ # Deterministic
+ assert hash(parquet_meta1) == hash(parquet_meta1) # equal w/ same instance
+ assert hash(parquet_meta1) == hash(parquet_meta2)  # equal w/ different instance
+
+ # Not the same as other metadata with different schema
+ assert hash(parquet_meta1) != hash(parquet_meta3)
+
+
@pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
def test_write_metadata(tempdir):
path = str(tempdir / "metadata")