GayathriSrividya commented on code in PR #3444:
URL: https://github.com/apache/iceberg-python/pull/3444#discussion_r3331504474
##########
tests/io/test_pyarrow.py:
##########
@@ -1301,6 +1302,47 @@ def test_projection_concat_files(schema_int: Schema,
file_int: str) -> None:
assert repr(result_table.schema) == "id: int32"
+def test_arrow_scan_to_table_with_mixed_dictionary_and_plain_strings() -> None:
+ schema = Schema(NestedField(1, "foo", StringType(), required=False))
+ scan = ArrowScan(
+ table_metadata=TableMetadataV2(
+ location="file://a/b/",
+ last_column_id=1,
+ format_version=2,
+ schemas=[schema],
+ partition_specs=[PartitionSpec()],
+ ),
+ io=PyArrowFileIO(),
+ projected_schema=schema,
+ row_filter=AlwaysTrue(),
+ )
+ values = pa.array(["a"], type=pa.string())
+ batches = iter([pa.record_batch([values], names=["foo"]),
pa.record_batch([values.dictionary_encode()], names=["foo"])])
+
+ with patch.object(scan, "to_record_batches", return_value=batches):
+ assert scan.to_table([]).to_pydict() == {"foo": ["a", "a"]}
+
+
+def test_pyarrow_table_ensure_non_dictionary_types_nested() -> None:
+ dictionary_values = pa.array(["a"]).dictionary_encode()
+ table = pa.table(
+ {
+ "struct": pa.StructArray.from_arrays([dictionary_values],
names=["value"]),
+ "list": pa.ListArray.from_arrays(pa.array([0, 1]),
dictionary_values),
+ }
+ )
+
+ normalized_table = _pyarrow_table_ensure_non_dictionary_types(table)
+
+ assert normalized_table.schema == pa.schema(
Review Comment:
Great point. I updated the nested normalization test to validate schema
equality with metadata checks using
`normalized_table.schema.equals(expected_schema, check_metadata=True)`.
I also added field-id metadata on input and expected schemas (including nested
fields) and added nullable/non-nullable variations so we verify field
reconstruction does not drop IDs or alter nullability semantics.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]