This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 05086675 Add support for categorical type (#693)
05086675 is described below
commit 05086675d267ee1df271a8ec631e83091b7be114
Author: Sung Yun <[email protected]>
AuthorDate: Tue May 7 16:18:48 2024 -0400
Add support for categorical type (#693)
---
pyiceberg/io/pyarrow.py | 10 ++++++++++
tests/integration/test_writes/test_writes.py | 24 ++++++++++++++++++++++++
tests/io/test_pyarrow_visitor.py | 14 ++++++++++++++
3 files changed, 48 insertions(+)
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 72b386d2..9216c37f 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -731,6 +731,16 @@ def _(obj: pa.MapType, visitor: PyArrowSchemaVisitor[T]) -> T:
return visitor.map(obj, key_result, value_result)
+@visit_pyarrow.register(pa.DictionaryType)
+def _(obj: pa.DictionaryType, visitor: PyArrowSchemaVisitor[T]) -> T:
+ # Parquet has no dictionary type. dictionary-encoding is handled
+ # as an encoding detail, not as a separate type.
+ # We will follow this approach in determining the Iceberg Type,
+ # as we only support parquet in PyIceberg for now.
+    logger.warning(f"Iceberg does not have a dictionary type. {type(obj)} will be inferred as {obj.value_type} on read.")
+ return visit_pyarrow(obj.value_type, visitor)
+
+
@visit_pyarrow.register(pa.DataType)
def _(obj: pa.DataType, visitor: PyArrowSchemaVisitor[T]) -> T:
if pa.types.is_nested(obj):
diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
index 8bebc53d..a4a93396 100644
--- a/tests/integration/test_writes/test_writes.py
+++ b/tests/integration/test_writes/test_writes.py
@@ -315,6 +315,30 @@ def test_python_writes_special_character_column_with_spark_reads(
assert spark_df.equals(pyiceberg_df)
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_python_writes_dictionary_encoded_column_with_spark_reads(
+ spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = "default.python_writes_dictionary_encoded_column_with_spark_reads"
+ TEST_DATA = {
+ 'id': [1, 2, 3, 1, 1],
+ 'name': ['AB', 'CD', 'EF', 'CD', 'EF'],
+ }
+ pa_schema = pa.schema([
+ pa.field('id', pa.dictionary(pa.int32(), pa.int32(), False)),
+ pa.field('name', pa.dictionary(pa.int32(), pa.string(), False)),
+ ])
+ arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema)
+
+    tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema)
+
+ tbl.overwrite(arrow_table)
+ spark_df = spark.sql(f"SELECT * FROM {identifier}").toPandas()
+ pyiceberg_df = tbl.scan().to_pandas()
+ assert spark_df.equals(pyiceberg_df)
+
+
@pytest.mark.integration
def test_write_bin_pack_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
identifier = "default.write_bin_pack_data_files"
diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py
index 5b55bd61..46ad331a 100644
--- a/tests/io/test_pyarrow_visitor.py
+++ b/tests/io/test_pyarrow_visitor.py
@@ -39,6 +39,7 @@ from pyiceberg.types import (
DoubleType,
FixedType,
FloatType,
+ IcebergType,
IntegerType,
ListType,
LongType,
@@ -280,6 +281,19 @@ def test_pyarrow_map_to_iceberg() -> None:
assert visit_pyarrow(pyarrow_map, _ConvertToIceberg()) == expected
+@pytest.mark.parametrize(
+ "value_type, expected_result",
+ [
+ (pa.string(), StringType()),
+ (pa.int32(), IntegerType()),
+ (pa.float64(), DoubleType()),
+ ],
+)
+def test_pyarrow_dictionary_encoded_type_to_iceberg(value_type: pa.DataType, expected_result: IcebergType) -> None:
+ pyarrow_dict = pa.dictionary(pa.int32(), value_type)
+ assert visit_pyarrow(pyarrow_dict, _ConvertToIceberg()) == expected_result
+
+
def test_round_schema_conversion_simple(table_schema_simple: Schema) -> None:
actual = str(pyarrow_to_schema(schema_to_pyarrow(table_schema_simple)))
expected = """table {