This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 56d1ec1142 GH-39780: [Python][Parquet] Support hashing for FileMetaData and ParquetSchema (#39781)
56d1ec1142 is described below

commit 56d1ec1142cd95f1c2d35805396262713fa5b29f
Author: Miles <[email protected]>
AuthorDate: Mon Feb 12 14:29:34 2024 +0100

    GH-39780: [Python][Parquet] Support hashing for FileMetaData and ParquetSchema (#39781)
    
    I think the hash, especially for `FileMetaData` could be better, maybe just use return of `__repr__`, even though that won't include row group info?
    
    ### Rationale for this change
    
    Helpful for dependent projects.
    
    ### What changes are included in this PR?
    
    Impl `__hash__` for `ParquetSchema` and `FileMetaData`
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    Supports hashing metadata:
    
    ```python
    In [1]: import pyarrow.parquet as pq
    
    In [2]: f = pq.ParquetFile('test.parquet')
    
    In [3]: hash(f.metadata)
    Out[3]: 4816453453708427907
    
    In [4]: hash(f.metadata.schema)
    Out[4]: 2300988959078172540
    ```
    * Closes: #39780
    
    Authored-by: Miles Granger <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 python/pyarrow/_parquet.pyx                   | 10 ++++++++++
 python/pyarrow/tests/parquet/test_metadata.py | 26 ++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 0b68524565..7bc68a288a 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -849,6 +849,13 @@ cdef class FileMetaData(_Weakrefable):
         cdef Buffer buffer = sink.getvalue()
         return _reconstruct_filemetadata, (buffer,)
 
+    def __hash__(self):
+        return hash((self.schema,
+                     self.num_rows,
+                     self.num_row_groups,
+                     self.format_version,
+                     self.serialized_size))
+
     def __repr__(self):
         return """{0}
   created_by: {1}
@@ -1071,6 +1078,9 @@ cdef class ParquetSchema(_Weakrefable):
     def __getitem__(self, i):
         return self.column(i)
 
+    def __hash__(self):
+        return hash(self.schema.ToString())
+
     @property
     def names(self):
         """Name of each field (list of str)."""
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index 73284d2e53..bf186bd923 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -499,6 +499,32 @@ def test_multi_dataset_metadata(tempdir):
     assert md['serialized_size'] > 0
 
 
+def test_metadata_hashing(tempdir):
+    path1 = str(tempdir / "metadata1")
+    schema1 = pa.schema([("a", "int64"), ("b", "float64")])
+    pq.write_metadata(schema1, path1)
+    parquet_meta1 = pq.read_metadata(path1)
+
+    # Same as 1, just different path
+    path2 = str(tempdir / "metadata2")
+    schema2 = pa.schema([("a", "int64"), ("b", "float64")])
+    pq.write_metadata(schema2, path2)
+    parquet_meta2 = pq.read_metadata(path2)
+
+    # different schema
+    path3 = str(tempdir / "metadata3")
+    schema3 = pa.schema([("a", "int64"), ("b", "float32")])
+    pq.write_metadata(schema3, path3)
+    parquet_meta3 = pq.read_metadata(path3)
+
+    # Deterministic
+    assert hash(parquet_meta1) == hash(parquet_meta1)  # equal w/ same instance
+    assert hash(parquet_meta1) == hash(parquet_meta2)  # equal w/ different instance
+
+    # Not the same as other metadata with different schema
+    assert hash(parquet_meta1) != hash(parquet_meta3)
+
+
 @pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
 def test_write_metadata(tempdir):
     path = str(tempdir / "metadata")

Reply via email to