(arrow) branch main updated: GH-48254: [Python][Parquet] Support extension types in read_schema (#48255)

raulcd Mon, 25 May 2026 07:46:24 -0700

This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new cd1811be25 GH-48254: [Python][Parquet] Support extension types in 
read_schema (#48255)
cd1811be25 is described below

commit cd1811be25ce2c485ad00f3be0bf0ac44ea7b423
Author: Kuinox <[email protected]>
AuthorDate: Mon May 25 16:46:07 2026 +0200

    GH-48254: [Python][Parquet] Support extension types in read_schema (#48255)
    
    ### Rationale for this change
    
      pq.read_schema drops extension types (UUID comes back as 
fixed_size_binary[16]), while ParquetFile.schema_arrow and read_table preserve 
them. Schema inspection via metadata should match table/extension behavior.
    
      ### What changes are included in this PR?
    
      - Plumb arrow_extensions_enabled into read_schema and return schema_arrow 
when enabled so extension types are preserved.
      - Add regression test ensuring UUID extension types are retained by 
read_schema and downgraded to binary(16) when extensions are disabled.
    
      ### Are these changes tested?
    
      - Yes: added unit test test_read_schema_uuid_extension_type
    
      ### Are there any user-facing changes?
    
      - Behavior improvement: read_schema now preserves extension types (e.g., 
UUID) when extensions are enabled; no API break
    
    Notes:
    - I don't know whether returning the column types as 
extension<arrow.uuid> instead of fixed_size_binary[16] is considered a 
breaking change.
    - This PR's patch was AI-generated, but I personally reviewed it; the 
scope is small, and it looks fine to me.
    * GitHub Issue: #48254
    
    Authored-by: Nicolas Vandeginste <[email protected]>
    Signed-off-by: Raúl Cumplido <[email protected]>
---
 python/pyarrow/parquet/core.py                | 34 +++++++++++++++++++--------
 python/pyarrow/tests/parquet/test_metadata.py | 22 +++++++++++++++++
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 080bfa55c2..ff880fdcf5 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -265,9 +265,9 @@ class ParquetFile:
     page_checksum_verification : bool, default False
         If True, verify the checksum for each page read from the file.
     arrow_extensions_enabled : bool, default True
-        If True, read Parquet logical types as Arrow extension types where 
possible,
-        (e.g., read JSON as the canonical `arrow.json` extension type or UUID 
as
-        the canonical `arrow.uuid` extension type).
+        If True, read Parquet logical types as Arrow extension types where
+        possible (e.g., read JSON as the canonical `arrow.json` extension type
+        or UUID as the canonical `arrow.uuid` extension type).
 
     Examples
     --------
@@ -2372,7 +2372,7 @@ def write_metadata(schema, where, 
metadata_collector=None, filesystem=None,
 
 
 def read_metadata(where, memory_map=False, decryption_properties=None,
-                  filesystem=None):
+                  filesystem=None, arrow_extensions_enabled=True):
     """
     Read FileMetaData from footer of a single Parquet file.
 
@@ -2387,6 +2387,10 @@ def read_metadata(where, memory_map=False, 
decryption_properties=None,
         If nothing passed, will be inferred based on path.
         Path will try to be found in the local on-disk filesystem otherwise
         it will be parsed as an URI to determine the filesystem.
+    arrow_extensions_enabled : bool, default True
+        If True, read Parquet logical types as Arrow extension types where
+        possible (e.g. UUID as the canonical `arrow.uuid` extension type).
+        If False, use the underlying storage types instead.
 
     Returns
     -------
@@ -2416,13 +2420,17 @@ def read_metadata(where, memory_map=False, 
decryption_properties=None,
         file_ctx = where = filesystem.open_input_file(where)
 
     with file_ctx:
-        file = ParquetFile(where, memory_map=memory_map,
-                           decryption_properties=decryption_properties)
+        file = ParquetFile(
+            where,
+            memory_map=memory_map,
+            decryption_properties=decryption_properties,
+            arrow_extensions_enabled=arrow_extensions_enabled,
+        )
         return file.metadata
 
 
 def read_schema(where, memory_map=False, decryption_properties=None,
-                filesystem=None):
+                filesystem=None, arrow_extensions_enabled=True):
     """
     Read effective Arrow schema from Parquet file metadata.
 
@@ -2437,6 +2445,9 @@ def read_schema(where, memory_map=False, 
decryption_properties=None,
         If nothing passed, will be inferred based on path.
         Path will try to be found in the local on-disk filesystem otherwise
         it will be parsed as an URI to determine the filesystem.
+    arrow_extensions_enabled : bool, default True
+        If True, read Parquet logical types as Arrow extension types where
+        possible (e.g., UUID as the canonical `arrow.uuid` extension type).
 
     Returns
     -------
@@ -2462,9 +2473,12 @@ def read_schema(where, memory_map=False, 
decryption_properties=None,
 
     with file_ctx:
         file = ParquetFile(
-            where, memory_map=memory_map,
-            decryption_properties=decryption_properties)
-        return file.schema.to_arrow_schema()
+            where,
+            memory_map=memory_map,
+            decryption_properties=decryption_properties,
+            arrow_extensions_enabled=arrow_extensions_enabled,
+        )
+        return file.schema_arrow
 
 
 __all__ = (
diff --git a/python/pyarrow/tests/parquet/test_metadata.py 
b/python/pyarrow/tests/parquet/test_metadata.py
index 857024da22..665c9a0e8e 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -849,3 +849,25 @@ def test_internal_class_instantiation():
 
     with pytest.raises(TypeError, match=msg("FileMetaData")):
         pq.FileMetaData()
+
+
+def test_read_schema_uuid_extension_type(tmp_path):
+    # These are the raw 16-byte payloads for
+    # UUID("e460f970-8351-474e-ac7f-a4673e4ba8cb").bytes and
+    # UUID("1e741495-eed5-43ea-9bd7-73dc91424baf").bytes.
+    data = [
+        b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb',
+        b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf',
+        None,
+    ]
+    table = pa.table([pa.array(data, type=pa.uuid())], names=["ext"])
+
+    file_path = tmp_path / "uuid.parquet"
+    file_path_str = str(file_path)
+    pq.write_table(table, file_path_str, store_schema=False)
+
+    schema_default = pq.read_schema(file_path_str)
+    assert schema_default.field("ext").type == pa.uuid()
+
+    schema_disabled = pq.read_schema(file_path_str, 
arrow_extensions_enabled=False)
+    assert schema_disabled.field("ext").type == pa.binary(16)

(arrow) branch main updated: GH-48254: [Python][Parquet] Support extension types in read_schema (#48255)

Reply via email to