Fokko commented on code in PR #6437:
URL: https://github.com/apache/iceberg/pull/6437#discussion_r1053657871


##########
python/tests/io/test_pyarrow.py:
##########
@@ -572,3 +581,388 @@ def test_always_true_to_pyarrow(bound_reference: BoundReference[str]) -> None:
 
 def test_always_false_to_pyarrow(bound_reference: BoundReference[str]) -> None:
     assert repr(expression_to_pyarrow(AlwaysFalse())) == "<pyarrow.compute.Expression false>"
+
+
+@pytest.fixture
+def schema_int() -> Schema:
+    return Schema(NestedField(1, "id", IntegerType()), schema_id=1)
+
+
+@pytest.fixture
+def schema_str() -> Schema:
+    return Schema(NestedField(2, "data", StringType()), schema_id=1)
+
+
+@pytest.fixture
+def schema_long() -> Schema:
+    return Schema(NestedField(3, "id", LongType()), schema_id=1)
+
+
+@pytest.fixture
+def table_int(schema_int: Schema, tmpdir: str) -> str:
+    pyarrow_schema = pa.schema(schema_to_pyarrow(schema_int), metadata={"iceberg.schema": schema_int.json()})
+
+    target_file = f"file:{tmpdir}/a.parquet"
+
+    with pq.ParquetWriter(target_file, pyarrow_schema) as writer:
+        writer.write_table(pa.Table.from_arrays([pa.array([0, 1, 2])], schema=pyarrow_schema))
+
+    return target_file
+
+
+@pytest.fixture
+def table_str(schema_str: Schema, tmpdir: str) -> str:
+    pyarrow_schema = pa.schema(schema_to_pyarrow(schema_str), metadata={"iceberg.schema": schema_str.json()})
+
+    target_file = f"file:{tmpdir}/b.parquet"
+
+    with pq.ParquetWriter(target_file, pyarrow_schema) as writer:
+        writer.write_table(pa.Table.from_arrays([pa.array(["0", "1", "2"])], schema=pyarrow_schema))
+
+    return target_file
+
+
+@pytest.fixture
+def table_long(schema_long: Schema, tmpdir: str) -> str:
+    pyarrow_schema = pa.schema(schema_to_pyarrow(schema_long), metadata={"iceberg.schema": schema_long.json()})
+
+    target_file = f"file:{tmpdir}/c.parquet"
+
+    with pq.ParquetWriter(target_file, pyarrow_schema) as writer:
+        writer.write_table(pa.Table.from_arrays([pa.array([0, 1, 2])], schema=pyarrow_schema))
+
+    return target_file
+
+
+def test_projection_add_column(schema_int: Schema, table_int: str) -> None:
+    schema = Schema(
+        # All new IDs
+        NestedField(10, "id", IntegerType(), required=False),
+        NestedField(20, "list", ListType(21, IntegerType(), element_required=False), required=False),
+        NestedField(
+            30,
+            "map",
+            MapType(key_id=31, key_type=IntegerType(), value_id=32, value_type=StringType(), value_required=False),
+            required=False,
+        ),
+        NestedField(40, "location", StructType(NestedField(41, "lat", DoubleType()), NestedField(42, "lon", DoubleType()))),
+    )
+    result_table = project_table(
+        [
+            FileScanTask(
+                DataFile(file_path=table_int, file_format=FileFormat.PARQUET, partition={}, record_count=3, file_size_in_bytes=3)
+            )
+        ],
+        Table(
+            ("namespace", "table"),
+            metadata=TableMetadataV2(
+                location="file://a/b/c.parquet",
+                last_column_id=1,
+                format_version=2,
+                schemas=[schema],
+                partition_specs=[PartitionSpec()],
+            ),
+            metadata_location="file://a/b/c.parquet",
+            io=PyArrowFileIO(),
+        ),
+        AlwaysTrue(),
+        schema,
+        case_sensitive=True,
+    )
+
+    # Everything should be None
+    for col in result_table.columns:
+        for r in col:
+            assert r.as_py() is None
+
+    assert (
+        repr(result_table.schema)
+        == """id: int32
+list: list<item: int32>
+  child 0, item: int32
+map: map<int32, string>
+  child 0, entries: struct<key: int32 not null, value: string> not null
+      child 0, key: int32 not null
+      child 1, value: string
+location: struct<lat: double not null, lon: double not null> not null
+  child 0, lat: double not null
+  child 1, lon: double not null"""
+    )
+
+
+def test_projection_add_column_struct(schema_int: Schema, table_int: str) -> None:
+    schema = Schema(
+        # A new ID
+        NestedField(
+            2,
+            "other_id",
+            MapType(key_id=3, key_type=IntegerType(), value_id=4, value_type=StringType(), value_required=False),
+            required=False,
+        )
+    )
+    result_table = project_table(
+        [
+            FileScanTask(
+                DataFile(file_path=table_int, file_format=FileFormat.PARQUET, partition={}, record_count=3, file_size_in_bytes=3)
+            )
+        ],
+        Table(
+            ("namespace", "table"),
+            metadata=TableMetadataV2(
+                location="file://a/b/c.parquet",
+                last_column_id=1,
+                format_version=2,
+                schemas=[schema],
+                partition_specs=[PartitionSpec()],
+            ),
+            metadata_location="file://a/b/c.parquet",
+            io=PyArrowFileIO(),
+        ),
+        AlwaysTrue(),
+        schema,
+        case_sensitive=True,
+    )
+    # Everything should be None
+    for r in result_table.columns[0]:

Review Comment:
   Good suggestion, updated!


