This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 05086675 Add support for categorical type (#693)
05086675 is described below
commit 05086675d267ee1df271a8ec631e83091b7be114
Author: Sung Yun <[email protected]>
AuthorDate: Tue May 7 16:18:48 2024 -0400
Add support for categorical type (#693)
---
pyiceberg/io/pyarrow.py | 10 ++++++++++
tests/integration/test_writes/test_writes.py | 24 ++++++++++++++++++++++++
tests/io/test_pyarrow_visitor.py | 14 ++++++++++++++
3 files changed, 48 insertions(+)
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 72b386d2..9216c37f 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -731,6 +731,16 @@ def _(obj: pa.MapType, visitor: PyArrowSchemaVisitor[T]) -> T:
return visitor.map(obj, key_result, value_result)
+@visit_pyarrow.register(pa.DictionaryType)
+def _(obj: pa.DictionaryType, visitor: PyArrowSchemaVisitor[T]) -> T:
+ # Parquet has no dictionary type. dictionary-encoding is handled
+ # as an encoding detail, not as a separate type.
+ # We will follow this approach in determining the Iceberg Type,
+ # as we only support parquet in PyIceberg for now.
+    logger.warning(f"Iceberg does not have a dictionary type. {type(obj)} will be inferred as {obj.value_type} on read.")
+ return visit_pyarrow(obj.value_type, visitor)
+
+
@visit_pyarrow.register(pa.DataType)
def _(obj: pa.DataType, visitor: PyArrowSchemaVisitor[T]) -> T:
if pa.types.is_nested(obj):
diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
index 8bebc53d..a4a93396 100644
--- a/tests/integration/test_writes/test_writes.py
+++ b/tests/integration/test_writes/test_writes.py
@@ -315,6 +315,30 @@ def test_python_writes_special_character_column_with_spark_reads(
assert spark_df.equals(pyiceberg_df)
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_python_writes_dictionary_encoded_column_with_spark_reads(
+ spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = "default.python_writes_dictionary_encoded_column_with_spark_reads"
+ TEST_DATA = {
+ 'id': [1, 2, 3, 1, 1],
+ 'name': ['AB', 'CD', 'EF', 'CD', 'EF'],
+ }
+ pa_schema = pa.schema([
+ pa.field('id', pa.dictionary(pa.int32(), pa.int32(), False)),
+ pa.field('name', pa.dictionary(pa.int32(), pa.string(), False)),
+ ])
+ arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema)
+
+    tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema)
+
+ tbl.overwrite(arrow_table)
+ spark_df = spark.sql(f"SELECT * FROM {identifier}").toPandas()
+ pyiceberg_df = tbl.scan().to_pandas()
+ assert spark_df.equals(pyiceberg_df)
+
+
@pytest.mark.integration
def test_write_bin_pack_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
identifier = "default.write_bin_pack_data_files"
diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py
index 5b55bd61..46ad331a 100644
--- a/tests/io/test_pyarrow_visitor.py
+++ b/tests/io/test_pyarrow_visitor.py
@@ -39,6 +39,7 @@ from pyiceberg.types import (
DoubleType,
FixedType,
FloatType,
+ IcebergType,
IntegerType,
ListType,
LongType,
@@ -280,6 +281,19 @@ def test_pyarrow_map_to_iceberg() -> None:
assert visit_pyarrow(pyarrow_map, _ConvertToIceberg()) == expected
+@pytest.mark.parametrize(
+ "value_type, expected_result",
+ [
+ (pa.string(), StringType()),
+ (pa.int32(), IntegerType()),
+ (pa.float64(), DoubleType()),
+ ],
+)
+def test_pyarrow_dictionary_encoded_type_to_iceberg(value_type: pa.DataType, expected_result: IcebergType) -> None:
+ pyarrow_dict = pa.dictionary(pa.int32(), value_type)
+ assert visit_pyarrow(pyarrow_dict, _ConvertToIceberg()) == expected_result
+
+
def test_round_schema_conversion_simple(table_schema_simple: Schema) -> None:
actual = str(pyarrow_to_schema(schema_to_pyarrow(table_schema_simple)))
expected = """table {