This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 33438bdb Add missing ORC iceberg.required attribute (#2789)
33438bdb is described below
commit 33438bdb1e78f8122ac2d321b11157ab3ef69334
Author: Yuya Ebihara <[email protected]>
AuthorDate: Mon Dec 1 13:45:16 2025 +0900
Add missing ORC iceberg.required attribute (#2789)
# Rationale for this change
The Iceberg spec expects an `iceberg.required` attribute in addition to
`iceberg.id`:
> The column IDs must be stored in ORC type attributes using the key
`iceberg.id`, and `iceberg.required` to store "true" if the Iceberg
column is required, otherwise it will be optional.
https://iceberg.apache.org/spec/#orc
Fixes #2526
## Are these changes tested?
Yes
## Are there any user-facing changes?
<!-- In the case of user-facing changes, please add the changelog label.
-->
---
pyiceberg/io/pyarrow.py | 3 +++
tests/io/test_pyarrow.py | 42 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index ad4c761d..d98e3fa7 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -196,6 +196,7 @@ ICEBERG_SCHEMA = b"iceberg.schema"
PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
# ORC field ID key for Iceberg field IDs in ORC metadata
ORC_FIELD_ID_KEY = b"iceberg.id"
+ORC_FIELD_REQUIRED_KEY = b"iceberg.required"
PYARROW_FIELD_DOC_KEY = b"doc"
LIST_ELEMENT_NAME = "element"
MAP_KEY_NAME = "key"
@@ -717,6 +718,8 @@ class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType]):
else:
# Default to Parquet for backward compatibility
metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
+ if self._file_format == FileFormat.ORC:
+ metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower()
return pa.field(
name=field.name,
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index 3977dc21..869e60f4 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -3840,8 +3840,46 @@ def test_orc_schema_conversion_with_field_ids() -> None:
id_field_no_ids = arrow_schema_no_ids.field(0)
name_field_no_ids = arrow_schema_no_ids.field(1)
- assert not id_field_no_ids.metadata
- assert not name_field_no_ids.metadata
+ assert ORC_FIELD_ID_KEY not in id_field_no_ids.metadata
+ assert ORC_FIELD_ID_KEY not in name_field_no_ids.metadata
+ assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata
+ assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata
+
+
+def test_orc_schema_conversion_with_required_attribute() -> None:
+ """
+ Test that schema_to_pyarrow correctly adds ORC iceberg.required attribute.
+ To run just this test:
+ pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute
+ """
+ from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow
+ from pyiceberg.manifest import FileFormat
+ from pyiceberg.schema import Schema
+ from pyiceberg.types import IntegerType, StringType
+
+ # Define schema
+ schema = Schema(
+ NestedField(1, "id", IntegerType(), required=True),
+ NestedField(2, "name", StringType(), required=False),
+ )
+
+ # Test 1: Specify Parquet format
+ arrow_schema_default = schema_to_pyarrow(schema, file_format=FileFormat.PARQUET)
+
+ id_field = arrow_schema_default.field(0)
+ name_field = arrow_schema_default.field(1)
+
+ assert ORC_FIELD_REQUIRED_KEY not in id_field.metadata
+ assert ORC_FIELD_REQUIRED_KEY not in name_field.metadata
+
+ # Test 2: Specify ORC format
+ arrow_schema_orc = schema_to_pyarrow(schema, file_format=FileFormat.ORC)
+
+ id_field_orc = arrow_schema_orc.field(0)
+ name_field_orc = arrow_schema_orc.field(1)
+
+ assert id_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"true"
+ assert name_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"false"
def test_orc_batching_behavior_documentation(tmp_path: Path) -> None: