Re: [PR] Implement ParquetFormatModel and update write_file to use the format API [iceberg-python]

via GitHub Tue, 19 May 2026 10:34:04 -0700


rambleraptor commented on code in PR #3381:
URL: https://github.com/apache/iceberg-python/pull/3381#discussion_r3268278204



##########
tests/io/test_format_writers.py:
##########
@@ -0,0 +1,155 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Parametrized format writer tests, modeled after Java's 
BaseFormatModelTests."""
+
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pytest
+
+from pyiceberg.io.fileformat import FileFormatFactory, FileFormatModel
+from pyiceberg.io.pyarrow import PyArrowFileIO
+from pyiceberg.manifest import FileFormat
+from pyiceberg.schema import Schema
+from pyiceberg.types import LongType, NestedField
+
+
+@pytest.fixture(params=FileFormatFactory.available_formats(), ids=lambda f: f.name.lower())
+def format_model(request: pytest.FixtureRequest) -> FileFormatModel:
+    return FileFormatFactory.get(request.param)
+
+
+@pytest.fixture
+def simple_table() -> pa.Table:
+    return pa.table(
+        {
+            "foo": ["a", "b", "c"],
+            "bar": pa.array([1, 2, 3], type=pa.int32()),
+            "baz": [True, False, True],
+        }
+    )
+
+
+def test_parquet_registered() -> None:
+    """ParquetFormatModel is registered in the factory."""
+    model = FileFormatFactory.get(FileFormat.PARQUET)
+    assert model.format == FileFormat.PARQUET
+    assert model.file_extension() == "parquet"
+
+
+def test_round_trip(format_model: FileFormatModel, table_schema_simple: 
Schema, simple_table: pa.Table, tmp_path: Path) -> None:
+    """Write a table and read it back, to verify equality and record count."""
+    file_path = str(tmp_path / f"test.{format_model.file_extension()}")
+    writer = format_model.create_writer(PyArrowFileIO().new_output(file_path), 
table_schema_simple, {})
+    writer.write(simple_table)
+    statistics = writer.close()
+
+    result = ds.dataset(file_path).to_table()
+    assert result.equals(simple_table)
+    assert statistics.record_count == 3
+
+
+def test_statistics_record_count(format_model: FileFormatModel, 
table_schema_simple: Schema, tmp_path: Path) -> None:
+    """close() returns DataFileStatistics with correct record count."""
+    table = pa.table(

Review Comment:
   Why recreate a different table here?



##########
pyiceberg/io/pyarrow.py:
##########
@@ -1884,6 +1886,7 @@ def _to_requested_schema(
     include_field_ids: bool = False,
     projected_missing_fields: dict[int, Any] = EMPTY_DICT,
     allow_timestamp_tz_mismatch: bool = False,
+    file_format: FileFormat = FileFormat.PARQUET,

Review Comment:
   Same thing here: I'm not wild about the default value.



##########
tests/io/test_format_writers.py:
##########
@@ -0,0 +1,155 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Parametrized format writer tests, modeled after Java's 
BaseFormatModelTests."""
+
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pytest
+
+from pyiceberg.io.fileformat import FileFormatFactory, FileFormatModel
+from pyiceberg.io.pyarrow import PyArrowFileIO
+from pyiceberg.manifest import FileFormat
+from pyiceberg.schema import Schema
+from pyiceberg.types import LongType, NestedField
+
+
+@pytest.fixture(params=FileFormatFactory.available_formats(), ids=lambda f: f.name.lower())
+def format_model(request: pytest.FixtureRequest) -> FileFormatModel:
+    return FileFormatFactory.get(request.param)
+
+
+@pytest.fixture
+def simple_table() -> pa.Table:

Review Comment:
   We've got a few tables in tests/conftest.py. Any reason not to use those?



##########
pyiceberg/io/pyarrow.py:
##########
@@ -1915,6 +1920,7 @@ def __init__(
         include_field_ids: bool = False,
         projected_missing_fields: dict[int, Any] = EMPTY_DICT,
         allow_timestamp_tz_mismatch: bool = False,
+        file_format: FileFormat = FileFormat.PARQUET,

Review Comment:
   I'm not wild about making PARQUET the default value (I don't think we should 
have default values...), but that's a light opinion.



##########
pyiceberg/io/pyarrow.py:
##########
@@ -1981,9 +1988,12 @@ def _construct_field(self, field: NestedField, 
arrow_type: pa.DataType) -> pa.Fi
         if field.doc:
             metadata[PYARROW_FIELD_DOC_KEY] = field.doc
         if self._include_field_ids:
-            # For projection visitor, we don't know the file format, so 
default to Parquet
-            # This is used for schema conversion during reads, not writes
-            metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
+            if self._file_format == FileFormat.ORC:

Review Comment:
   Ideally, we'd have a FileFormat API method called `add_metadata_for_field` 
(not opinionated on name).
   
   Part of the hope for the FileFormat API was to avoid these kinds of switch 
statements based on the format.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Implement ParquetFormatModel and update write_file to use the format API [iceberg-python]

Reply via email to