This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 33438bdb Add missing ORC iceberg.required attribute (#2789)
33438bdb is described below

commit 33438bdb1e78f8122ac2d321b11157ab3ef69334
Author: Yuya Ebihara <[email protected]>
AuthorDate: Mon Dec 1 13:45:16 2025 +0900

    Add missing ORC iceberg.required attribute (#2789)
    
    # Rationale for this change
    
    Iceberg spec expects `iceberg.required` attribute in addition to
    `iceberg.id`:
    > The column IDs must be stored in ORC type attributes using the key
    `iceberg.id`, and `iceberg.required` to store "true" if the Iceberg
    column is required, otherwise it will be optional.
    
    https://iceberg.apache.org/spec/#orc
    
    Fixes #2526
    
    
    ## Are these changes tested?
    
    Yes
    
    ## Are there any user-facing changes?
    
    <!-- In the case of user-facing changes, please add the changelog label.
    -->
---
 pyiceberg/io/pyarrow.py  |  3 +++
 tests/io/test_pyarrow.py | 42 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index ad4c761d..d98e3fa7 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -196,6 +196,7 @@ ICEBERG_SCHEMA = b"iceberg.schema"
 PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
 # ORC field ID key for Iceberg field IDs in ORC metadata
 ORC_FIELD_ID_KEY = b"iceberg.id"
+ORC_FIELD_REQUIRED_KEY = b"iceberg.required"
 PYARROW_FIELD_DOC_KEY = b"doc"
 LIST_ELEMENT_NAME = "element"
 MAP_KEY_NAME = "key"
@@ -717,6 +718,8 @@ class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType]):
             else:
                 # Default to Parquet for backward compatibility
                 metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
+        if self._file_format == FileFormat.ORC:
+            metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower()
 
         return pa.field(
             name=field.name,
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index 3977dc21..869e60f4 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -3840,8 +3840,46 @@ def test_orc_schema_conversion_with_field_ids() -> None:
     id_field_no_ids = arrow_schema_no_ids.field(0)
     name_field_no_ids = arrow_schema_no_ids.field(1)
 
-    assert not id_field_no_ids.metadata
-    assert not name_field_no_ids.metadata
+    assert ORC_FIELD_ID_KEY not in id_field_no_ids.metadata
+    assert ORC_FIELD_ID_KEY not in name_field_no_ids.metadata
+    assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata
+    assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata
+
+
+def test_orc_schema_conversion_with_required_attribute() -> None:
+    """
+    Test that schema_to_pyarrow correctly adds ORC iceberg.required attribute.
+    To run just this test:
+        pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute
+    """
+    from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow
+    from pyiceberg.manifest import FileFormat
+    from pyiceberg.schema import Schema
+    from pyiceberg.types import IntegerType, StringType
+
+    # Define schema
+    schema = Schema(
+        NestedField(1, "id", IntegerType(), required=True),
+        NestedField(2, "name", StringType(), required=False),
+    )
+
+    # Test 1: Specify Parquet format
+    arrow_schema_default = schema_to_pyarrow(schema, file_format=FileFormat.PARQUET)
+
+    id_field = arrow_schema_default.field(0)
+    name_field = arrow_schema_default.field(1)
+
+    assert ORC_FIELD_REQUIRED_KEY not in id_field.metadata
+    assert ORC_FIELD_REQUIRED_KEY not in name_field.metadata
+
+    # Test 2: Specify ORC format
+    arrow_schema_orc = schema_to_pyarrow(schema, file_format=FileFormat.ORC)
+
+    id_field_orc = arrow_schema_orc.field(0)
+    name_field_orc = arrow_schema_orc.field(1)
+
+    assert id_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"true"
+    assert name_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"false"
 
 
 def test_orc_batching_behavior_documentation(tmp_path: Path) -> None:

Reply via email to