This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new cd1811be25 GH-48254: [Python][Parquet] Support extension types in
read_schema (#48255)
cd1811be25 is described below
commit cd1811be25ce2c485ad00f3be0bf0ac44ea7b423
Author: Kuinox <[email protected]>
AuthorDate: Mon May 25 16:46:07 2026 +0200
GH-48254: [Python][Parquet] Support extension types in read_schema (#48255)
### Rationale for this change
pq.read_schema drops extension types (UUID comes back as
fixed_size_binary[16]), while ParquetFile.schema_arrow and read_table preserve
them. Schema inspection via metadata should match table/extension behavior.
### What changes are included in this PR?
- Plumb arrow_extensions_enabled into read_metadata and read_schema, and have
read_schema return ParquetFile.schema_arrow so extension types are preserved
when extensions are enabled.
- Add regression test ensuring UUID extension types are retained by
read_schema and downgraded to binary(16) when extensions are disabled.
### Are these changes tested?
- Yes: added unit test test_read_schema_uuid_extension_type
### Are there any user-facing changes?
- Behavior improvement: read_schema now preserves extension types (e.g.,
UUID) when extensions are enabled; no API break
Notes:
- I don't know whether returning column types as extension<arrow.uuid>
instead of fixed_size_binary[16] is considered a breaking change.
- This PR patch was AI generated, but I personally reviewed it, the scope
is small, and it looks fine to me.
* GitHub Issue: #48254
Authored-by: Nicolas Vandeginste <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
python/pyarrow/parquet/core.py | 34 +++++++++++++++++++--------
python/pyarrow/tests/parquet/test_metadata.py | 22 +++++++++++++++++
2 files changed, 46 insertions(+), 10 deletions(-)
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 080bfa55c2..ff880fdcf5 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -265,9 +265,9 @@ class ParquetFile:
page_checksum_verification : bool, default False
If True, verify the checksum for each page read from the file.
arrow_extensions_enabled : bool, default True
- If True, read Parquet logical types as Arrow extension types where possible,
- (e.g., read JSON as the canonical `arrow.json` extension type or UUID as
- the canonical `arrow.uuid` extension type).
+ If True, read Parquet logical types as Arrow extension types where
+ possible (e.g., read JSON as the canonical `arrow.json` extension type
+ or UUID as the canonical `arrow.uuid` extension type).
Examples
--------
@@ -2372,7 +2372,7 @@ def write_metadata(schema, where,
metadata_collector=None, filesystem=None,
def read_metadata(where, memory_map=False, decryption_properties=None,
- filesystem=None):
+ filesystem=None, arrow_extensions_enabled=True):
"""
Read FileMetaData from footer of a single Parquet file.
@@ -2387,6 +2387,10 @@ def read_metadata(where, memory_map=False,
decryption_properties=None,
If nothing passed, will be inferred based on path.
Path will try to be found in the local on-disk filesystem otherwise
it will be parsed as an URI to determine the filesystem.
+ arrow_extensions_enabled : bool, default True
+ If True, read Parquet logical types as Arrow extension types where
+ possible (e.g. UUID as the canonical `arrow.uuid` extension type).
+ If False, use the underlying storage types instead.
Returns
-------
@@ -2416,13 +2420,17 @@ def read_metadata(where, memory_map=False,
decryption_properties=None,
file_ctx = where = filesystem.open_input_file(where)
with file_ctx:
- file = ParquetFile(where, memory_map=memory_map,
- decryption_properties=decryption_properties)
+ file = ParquetFile(
+ where,
+ memory_map=memory_map,
+ decryption_properties=decryption_properties,
+ arrow_extensions_enabled=arrow_extensions_enabled,
+ )
return file.metadata
def read_schema(where, memory_map=False, decryption_properties=None,
- filesystem=None):
+ filesystem=None, arrow_extensions_enabled=True):
"""
Read effective Arrow schema from Parquet file metadata.
@@ -2437,6 +2445,9 @@ def read_schema(where, memory_map=False,
decryption_properties=None,
If nothing passed, will be inferred based on path.
Path will try to be found in the local on-disk filesystem otherwise
it will be parsed as an URI to determine the filesystem.
+ arrow_extensions_enabled : bool, default True
+ If True, read Parquet logical types as Arrow extension types where
+ possible (e.g., UUID as the canonical `arrow.uuid` extension type).
Returns
-------
@@ -2462,9 +2473,12 @@ def read_schema(where, memory_map=False,
decryption_properties=None,
with file_ctx:
file = ParquetFile(
- where, memory_map=memory_map,
- decryption_properties=decryption_properties)
- return file.schema.to_arrow_schema()
+ where,
+ memory_map=memory_map,
+ decryption_properties=decryption_properties,
+ arrow_extensions_enabled=arrow_extensions_enabled,
+ )
+ return file.schema_arrow
__all__ = (
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index 857024da22..665c9a0e8e 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -849,3 +849,25 @@ def test_internal_class_instantiation():
with pytest.raises(TypeError, match=msg("FileMetaData")):
pq.FileMetaData()
+
+
+def test_read_schema_uuid_extension_type(tmp_path):
+ # These are the raw 16-byte payloads for
+ # UUID("e460f970-8351-474e-ac7f-a4673e4ba8cb").bytes and
+ # UUID("1e741495-eed5-43ea-9bd7-73dc91424baf").bytes.
+ data = [
+ b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb',
+ b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf',
+ None,
+ ]
+ table = pa.table([pa.array(data, type=pa.uuid())], names=["ext"])
+
+ file_path = tmp_path / "uuid.parquet"
+ file_path_str = str(file_path)
+ pq.write_table(table, file_path_str, store_schema=False)
+
+ schema_default = pq.read_schema(file_path_str)
+ assert schema_default.field("ext").type == pa.uuid()
+
+ schema_disabled = pq.read_schema(file_path_str, arrow_extensions_enabled=False)
+ assert schema_disabled.field("ext").type == pa.binary(16)