This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 42ed37e3fc ARROW-16719: [Python] Add path/URI + filesystem handling to 
parquet.read_metadata (#13629)
42ed37e3fc is described below

commit 42ed37e3fc84465f365531e611f1bf632b599e7b
Author: Kshiteej K <[email protected]>
AuthorDate: Wed Aug 17 16:51:40 2022 +0530

    ARROW-16719: [Python] Add path/URI + filesystem handling to 
parquet.read_metadata (#13629)
    
    Add `filesystem` support to `pq.read_metadata` and `pq.read_schema`.
    
    Lead-authored-by: kshitij12345 <[email protected]>
    Co-authored-by: Kshiteej K <[email protected]>
    Co-authored-by: Joris Van den Bossche <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 python/pyarrow/parquet/core.py                | 39 ++++++++++++++++++++++-----
 python/pyarrow/tests/parquet/test_metadata.py | 37 +++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 4e2c273953..d3a1451dcb 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -18,6 +18,7 @@
 
 from collections import defaultdict
 from concurrent import futures
+from contextlib import nullcontext
 from functools import partial, reduce
 
 import sys
@@ -3389,7 +3390,8 @@ def write_metadata(schema, where, 
metadata_collector=None, **kwargs):
         metadata.write_metadata_file(where)
 
 
-def read_metadata(where, memory_map=False, decryption_properties=None):
+def read_metadata(where, memory_map=False, decryption_properties=None,
+                  filesystem=None):
     """
     Read FileMetaData from footer of a single Parquet file.
 
@@ -3400,6 +3402,10 @@ def read_metadata(where, memory_map=False, 
decryption_properties=None):
         Create memory map when the source is a file path.
     decryption_properties : FileDecryptionProperties, default None
         Decryption properties for reading encrypted Parquet files.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred based on path.
+        Path will try to be found in the local on-disk filesystem otherwise
+        it will be parsed as an URI to determine the filesystem.
 
     Returns
     -------
@@ -3422,11 +3428,19 @@ def read_metadata(where, memory_map=False, 
decryption_properties=None):
       format_version: 2.6
       serialized_size: ...
     """
-    return ParquetFile(where, memory_map=memory_map,
-                       decryption_properties=decryption_properties).metadata
+    filesystem, where = _resolve_filesystem_and_path(where, filesystem)
+    file_ctx = nullcontext()
+    if filesystem is not None:
+        file_ctx = where = filesystem.open_input_file(where)
+
+    with file_ctx:
+        file = ParquetFile(where, memory_map=memory_map,
+                           decryption_properties=decryption_properties)
+        return file.metadata
 
 
-def read_schema(where, memory_map=False, decryption_properties=None):
+def read_schema(where, memory_map=False, decryption_properties=None,
+                filesystem=None):
     """
     Read effective Arrow schema from Parquet file metadata.
 
@@ -3437,6 +3451,10 @@ def read_schema(where, memory_map=False, 
decryption_properties=None):
         Create memory map when the source is a file path.
     decryption_properties : FileDecryptionProperties, default None
         Decryption properties for reading encrypted Parquet files.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred based on path.
+        Path will try to be found in the local on-disk filesystem otherwise
+        it will be parsed as an URI to determine the filesystem.
 
     Returns
     -------
@@ -3454,9 +3472,16 @@ def read_schema(where, memory_map=False, 
decryption_properties=None):
     n_legs: int64
     animal: string
     """
-    return ParquetFile(
-        where, memory_map=memory_map,
-        decryption_properties=decryption_properties).schema.to_arrow_schema()
+    filesystem, where = _resolve_filesystem_and_path(where, filesystem)
+    file_ctx = nullcontext()
+    if filesystem is not None:
+        file_ctx = where = filesystem.open_input_file(where)
+
+    with file_ctx:
+        file = ParquetFile(
+            where, memory_map=memory_map,
+            decryption_properties=decryption_properties)
+        return file.schema.to_arrow_schema()
 
 
 # re-export everything
diff --git a/python/pyarrow/tests/parquet/test_metadata.py 
b/python/pyarrow/tests/parquet/test_metadata.py
index b36ea60658..e4c9a757fc 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -24,6 +24,8 @@ import pytest
 
 import pyarrow as pa
 from pyarrow.tests.parquet.common import _check_roundtrip, make_sample_file
+from pyarrow.fs import LocalFileSystem
+from pyarrow.tests import util
 
 try:
     import pyarrow.parquet as pq
@@ -533,6 +535,41 @@ def test_metadata_exceeds_message_size():
     metadata = pq.read_metadata(pa.BufferReader(buf))
 
 
+def test_metadata_schema_filesystem(tempdir):
+    table = pa.table({"a": [1, 2, 3]})
+
+    # URI writing to local file.
+    fname = "data.parquet"
+    file_path = str(tempdir / fname)
+    file_uri = 'file:///' + file_path
+
+    pq.write_table(table, file_path)
+
+    # Get expected `metadata` from path.
+    metadata = pq.read_metadata(tempdir / fname)
+    schema = table.schema
+
+    assert pq.read_metadata(file_uri).equals(metadata)
+    assert pq.read_metadata(
+        file_path, filesystem=LocalFileSystem()).equals(metadata)
+    assert pq.read_metadata(
+        fname, filesystem=f'file:///{tempdir}').equals(metadata)
+
+    assert pq.read_schema(file_uri).equals(schema)
+    assert pq.read_schema(
+        file_path, filesystem=LocalFileSystem()).equals(schema)
+    assert pq.read_schema(
+        fname, filesystem=f'file:///{tempdir}').equals(schema)
+
+    with util.change_cwd(tempdir):
+        # Pass `filesystem` arg
+        assert pq.read_metadata(
+            fname, filesystem=LocalFileSystem()).equals(metadata)
+
+        assert pq.read_schema(
+            fname, filesystem=LocalFileSystem()).equals(schema)
+
+
 def test_metadata_equals():
     table = pa.table({"a": [1, 2, 3]})
     with pa.BufferOutputStream() as out:

Reply via email to