Fokko commented on code in PR #5949:
URL: https://github.com/apache/iceberg/pull/5949#discussion_r996698684
##########
python/pyiceberg/io/pyarrow.py:
##########
@@ -239,3 +266,102 @@ def delete(self, location: Union[str, InputFile,
OutputFile]) -> None:
elif e.errno == 13 or "AWS Error [code 15]" in str(e):
raise PermissionError(f"Cannot delete file, access denied:
{location}") from e
raise # pragma: no cover - If some other kind of OSError, raise
the raw error
+
+
+def convert_iceberg_schema_to_pyarrow(schema: Schema) -> pa.schema:
+ return visit(schema, _ConvertToArrowSchema())
+
+
+class _ConvertToArrowSchema(SchemaVisitor[pa.DataType]):
+ def schema(self, _: Schema, struct_result: pa.StructType) -> pa.schema:
+ return pa.schema(list(struct_result))
+
+ def struct(self, _: StructType, field_results: List[pa.DataType]) ->
pa.DataType:
+ return pa.struct(field_results)
+
+ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
+ return pa.field(
+ name=field.name, type=field_result, nullable=not field.required,
metadata={"doc": field.doc} if field.doc else {}
+ )
+
+ def list(self, _: ListType, element_result: pa.DataType) -> pa.DataType:
+ return pa.list_(value_type=element_result)
+
+ def map(self, _: MapType, key_result: pa.DataType, value_result:
pa.DataType) -> pa.DataType:
+ return pa.map_(key_type=key_result, item_type=value_result)
Review Comment:
The default is nullable. This isn't covered in the docs, so I just gave it a try:
```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, ArrayType, IntegerType
data = [([0, None, 3], [4, 5, 6]),
([1, None, 6], [8, 2])]
schema = StructType([
StructField('arr_with_null', ArrayType(IntegerType(), True), True),
StructField('arr_without_null', ArrayType(IntegerType(), False), True)
])
rdd = spark.sparkContext.parallelize(data)
df = spark.createDataFrame(rdd, schema)
df.write.saveAsTable("fokko.lists")
df.show()
StructType([
StructField('arr_with_null', ArrayType(IntegerType(), True), True),
StructField('arr_without_null', ArrayType(IntegerType(), False), True)
])
+-------------+----------------+
|arr_with_null|arr_without_null|
+-------------+----------------+
| [0, null, 3]| [4, 5, 6]|
| [1, null, 6]| [8, 2]|
+-------------+----------------+
```
And PyArrow seems to be fine with it:
```python
In [3]: from pyiceberg.catalog import load_catalog
...:
...: cat = load_catalog('rest')
...:
...: tbl = cat.load_table(('nyc', 'lists'))
...:
...: scan = tbl.new_scan()
...:
...: ds = scan.dataset
...:
...: table = ds.to_table()
In [4]: table
Out[4]:
pyarrow.Table
arr_with_null: list<item: int32>
child 0, item: int32
arr_without_null: list<item: int32>
child 0, item: int32
----
arr_with_null: [[[0,null,3]],[[1,null,6]]]
arr_without_null: [[[4,5,6]],[[8,2]]]
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]