This is an automated email from the ASF dual-hosted git repository.
kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new bad9cda1 fix: Cast smaller integer types to int32/int64 on write for
Spark compatibility (#2799)
bad9cda1 is described below
commit bad9cda1027068cf9720e3bb9f0b24d4279b13e8
Author: Somasundaram Sekar <[email protected]>
AuthorDate: Sun Jan 25 19:36:38 2026 +0100
fix: Cast smaller integer types to int32/int64 on write for Spark
compatibility (#2799)
## Summary
- Fixes #2791: Writing smaller integer types (uint8, int8, int16,
uint16) to Iceberg IntegerType columns now correctly casts to
int32/int64
- PyIceberg was preserving original Arrow types in Parquet files,
causing Spark to fail with `Unsupported logical type: UINT_8`
- Added integer type widening logic in
`ArrowProjectionVisitor._cast_if_needed()` following the same pattern as
existing timestamp handling
- Only widening conversions are allowed (e.g., uint8 → int32, int32 →
int64); narrowing conversions continue to be rejected via `promote()`
## Test plan
- [x] All 3041 unit tests pass
- [x] Lint passes
- [x] New parameterized test covers: uint8, int8, int16, uint16 → int32
and uint32, int32 → int64
- [x] Existing `test_projection_filter_add_column_demote` still works
(narrowing rejection)
- [x] Manual verification: uint8 data written to IntegerType column
produces int32 in Parquet file
Closes #2791
Co-authored-by: Somasundaram Sekar <[email protected]>
---
pyiceberg/io/pyarrow.py | 9 +++++++++
tests/io/test_pyarrow.py | 32 ++++++++++++++++++++++++++++++++
2 files changed, 41 insertions(+)
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 55ecc7ac..d07510d4 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1910,6 +1910,15 @@ class
ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, pa.Array | None]
elif target_type.unit == "us" and values.type.unit in
{"s", "ms", "us"}:
return values.cast(target_type)
raise ValueError(f"Unsupported schema projection from
{values.type} to {target_type}")
+ elif isinstance(field.field_type, (IntegerType, LongType)):
+ # Cast smaller integer types to target type for
cross-platform compatibility
+ # Only allow widening conversions (smaller bit width to
larger)
+ # Narrowing conversions fall through to promote() handling
below
+ if pa.types.is_integer(values.type):
+ source_width = values.type.bit_width
+ target_width = target_type.bit_width
+ if source_width < target_width:
+ return values.cast(target_type)
if field.field_type != file_field.field_type:
target_schema = schema_to_pyarrow(
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index f1ed109d..e815ea9d 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -2752,6 +2752,38 @@ def
test__to_requested_schema_timestamps_without_downcast_raises_exception(
assert "Unsupported schema projection from timestamp[ns] to timestamp[us]"
in str(exc_info.value)
+@pytest.mark.parametrize(
+ "arrow_type,iceberg_type,expected_arrow_type",
+ [
+ (pa.uint8(), IntegerType(), pa.int32()),
+ (pa.int8(), IntegerType(), pa.int32()),
+ (pa.int16(), IntegerType(), pa.int32()),
+ (pa.uint16(), IntegerType(), pa.int32()),
+ (pa.uint32(), LongType(), pa.int64()),
+ (pa.int32(), LongType(), pa.int64()),
+ ],
+)
+def test__to_requested_schema_integer_promotion(
+ arrow_type: pa.DataType,
+ iceberg_type: PrimitiveType,
+ expected_arrow_type: pa.DataType,
+) -> None:
+ """Test that smaller integer types are cast to target Iceberg type during
write."""
+ requested_schema = Schema(NestedField(1, "col", iceberg_type,
required=False))
+ file_schema = requested_schema
+
+ arrow_schema = pa.schema([pa.field("col", arrow_type)])
+ data = pa.array([1, 2, 3, None], type=arrow_type)
+ batch = pa.RecordBatch.from_arrays([data], schema=arrow_schema)
+
+ result = _to_requested_schema(
+ requested_schema, file_schema, batch,
downcast_ns_timestamp_to_us=False, include_field_ids=False
+ )
+
+ assert result.schema[0].type == expected_arrow_type
+ assert result.column(0).to_pylist() == [1, 2, 3, None]
+
+
def test_pyarrow_file_io_fs_by_scheme_cache() -> None:
# It's better to set up multi-region minio servers for an integration test
once `endpoint_url` argument becomes available for `resolve_s3_region`
# Refer to: https://github.com/apache/arrow/issues/43713