This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new d56ddddc Allow non-string typed values in table properties (#469)
d56ddddc is described below
commit d56ddddc4e81d9efcb1fbd57bd89ed99b424619d
Author: Kevin Liu <[email protected]>
AuthorDate: Thu Feb 29 12:49:07 2024 -0800
Allow non-string typed values in table properties (#469)
* property accept int
https://stackoverflow.com/questions/77304167/using-pydantic-to-change-int-to-string
https://docs.pydantic.dev/latest/concepts/validators/#field-validators
* add tests
* add integration tests
* pr feedback
* make validator reusable
* show key when none
---
pyiceberg/catalog/rest.py | 9 ++++++---
pyiceberg/table/metadata.py | 6 +++++-
pyiceberg/typedef.py | 2 +-
pyiceberg/types.py | 9 +++++++++
tests/catalog/test_base.py | 22 ++++++++++++++++++++--
tests/catalog/test_sql.py | 39 ++++++++++++++++++++++++++++++++++++++-
tests/integration/test_writes.py | 39 +++++++++++++++++++++++++++++++++++++--
tests/table/test_init.py | 29 ++++++++++++++++++++++++++++-
8 files changed, 144 insertions(+), 11 deletions(-)
diff --git a/pyiceberg/catalog/rest.py b/pyiceberg/catalog/rest.py
index e0e88ec4..2adeafb5 100644
--- a/pyiceberg/catalog/rest.py
+++ b/pyiceberg/catalog/rest.py
@@ -28,7 +28,7 @@ from typing import (
Union,
)
-from pydantic import Field, ValidationError
+from pydantic import Field, ValidationError, field_validator
from requests import HTTPError, Session
from tenacity import RetryCallState, retry, retry_if_exception_type,
stop_after_attempt
@@ -69,6 +69,7 @@ from pyiceberg.table import (
)
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder,
assign_fresh_sort_order_ids
from pyiceberg.typedef import EMPTY_DICT, UTF8, IcebergBaseModel
+from pyiceberg.types import transform_dict_value_to_str
if TYPE_CHECKING:
import pyarrow as pa
@@ -147,6 +148,8 @@ class CreateTableRequest(IcebergBaseModel):
write_order: Optional[SortOrder] = Field(alias="write-order")
stage_create: bool = Field(alias="stage-create", default=False)
properties: Properties = Field(default_factory=dict)
+    # validators
+    transform_properties_dict_value_to_str = field_validator('properties', mode='before')(transform_dict_value_to_str)
class RegisterTableRequest(IcebergBaseModel):
@@ -234,9 +237,9 @@ class RestCatalog(Catalog):
# Sets the client side and server side SSL cert verification, if
provided as properties.
if ssl_config := self.properties.get(SSL):
- if ssl_ca_bundle := ssl_config.get(CA_BUNDLE): # type: ignore
+ if ssl_ca_bundle := ssl_config.get(CA_BUNDLE):
session.verify = ssl_ca_bundle
- if ssl_client := ssl_config.get(CLIENT): # type: ignore
+ if ssl_client := ssl_config.get(CLIENT):
if all(k in ssl_client for k in (CERT, KEY)):
session.cert = (ssl_client[CERT], ssl_client[KEY])
elif ssl_client_cert := ssl_client.get(CERT):
diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py
index c7169151..931b0cfe 100644
--- a/pyiceberg/table/metadata.py
+++ b/pyiceberg/table/metadata.py
@@ -28,7 +28,7 @@ from typing import (
Union,
)
-from pydantic import Field, model_validator
+from pydantic import Field, field_validator, model_validator
from pydantic import ValidationError as PydanticValidationError
from typing_extensions import Annotated
@@ -49,6 +49,7 @@ from pyiceberg.typedef import (
IcebergRootModel,
Properties,
)
+from pyiceberg.types import transform_dict_value_to_str
from pyiceberg.utils.datetime import datetime_to_millis
CURRENT_SNAPSHOT_ID = "current-snapshot-id"
@@ -218,6 +219,9 @@ class TableMetadataCommonFields(IcebergBaseModel):
There is always a main branch reference pointing to the
current-snapshot-id even if the refs map is null."""
+    # validators
+    transform_properties_dict_value_to_str = field_validator('properties', mode='before')(transform_dict_value_to_str)
+
def snapshot_by_id(self, snapshot_id: int) -> Optional[Snapshot]:
"""Get the snapshot by snapshot_id."""
return next((snapshot for snapshot in self.snapshots if
snapshot.snapshot_id == snapshot_id), None)
diff --git a/pyiceberg/typedef.py b/pyiceberg/typedef.py
index 56a3d3c7..e57bf349 100644
--- a/pyiceberg/typedef.py
+++ b/pyiceberg/typedef.py
@@ -73,7 +73,7 @@ class KeyDefaultDict(Dict[K, V]):
Identifier = Tuple[str, ...]
-Properties = Dict[str, str]
+Properties = Dict[str, Any]
RecursiveDict = Dict[str, Union[str, "RecursiveDict"]]
# Represents the literal value
diff --git a/pyiceberg/types.py b/pyiceberg/types.py
index eb215121..746f03ea 100644
--- a/pyiceberg/types.py
+++ b/pyiceberg/types.py
@@ -37,6 +37,7 @@ from functools import cached_property
from typing import (
Any,
ClassVar,
+ Dict,
Literal,
Optional,
Tuple,
@@ -61,6 +62,14 @@ FIXED = "fixed"
FIXED_PARSER = ParseNumberFromBrackets(FIXED)
+def transform_dict_value_to_str(dict: Dict[str, Any]) -> Dict[str, str]:
+    """Transform all values in the dictionary to string. Raise an error if any value is None."""
+    for key, value in dict.items():
+        if value is None:
+            raise ValueError(f"None type is not a supported value in properties: {key}")
+    return {k: str(v) for k, v in dict.items()}
+
+
def _parse_decimal_type(decimal: Any) -> Tuple[int, int]:
if isinstance(decimal, str):
matches = DECIMAL_REGEX.search(decimal)
diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py
index 21c7ec15..c7d3f01f 100644
--- a/tests/catalog/test_base.py
+++ b/tests/catalog/test_base.py
@@ -26,6 +26,7 @@ from typing import (
import pyarrow as pa
import pytest
+from pydantic_core import ValidationError
from pytest_lazyfixture import lazy_fixture
from pyiceberg.catalog import (
@@ -255,13 +256,16 @@ NO_SUCH_NAMESPACE_ERROR = "Namespace does not exist:
\\('com', 'organization', '
NAMESPACE_NOT_EMPTY_ERROR = "Namespace is not empty: \\('com', 'organization',
'department'\\)"
-def given_catalog_has_a_table(catalog: InMemoryCatalog) -> Table:
+def given_catalog_has_a_table(
+ catalog: InMemoryCatalog,
+ properties: Properties = EMPTY_DICT,
+) -> Table:
return catalog.create_table(
identifier=TEST_TABLE_IDENTIFIER,
schema=TEST_TABLE_SCHEMA,
location=TEST_TABLE_LOCATION,
partition_spec=TEST_TABLE_PARTITION_SPEC,
- properties=TEST_TABLE_PROPERTIES,
+ properties=properties or TEST_TABLE_PROPERTIES,
)
@@ -661,3 +665,17 @@ def test_add_column_with_statement(catalog:
InMemoryCatalog) -> None:
def test_catalog_repr(catalog: InMemoryCatalog) -> None:
s = repr(catalog)
assert s == "test.in.memory.catalog (<class 'test_base.InMemoryCatalog'>)"
+
+
+def test_table_properties_int_value(catalog: InMemoryCatalog) -> None:
+ # table properties can be set to int, but still serialized to string
+ property_with_int = {"property_name": 42}
+ given_table = given_catalog_has_a_table(catalog,
properties=property_with_int)
+ assert isinstance(given_table.properties["property_name"], str)
+
+
+def test_table_properties_raise_for_none_value(catalog: InMemoryCatalog) ->
None:
+ property_with_none = {"property_name": None}
+ with pytest.raises(ValidationError) as exc_info:
+ _ = given_catalog_has_a_table(catalog, properties=property_with_none)
+ assert "None type is not a supported value in properties: property_name"
in str(exc_info.value)
diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py
index 9f4d4af4..0b869d68 100644
--- a/tests/catalog/test_sql.py
+++ b/tests/catalog/test_sql.py
@@ -21,6 +21,7 @@ from typing import Generator, List
import pyarrow as pa
import pytest
+from pydantic_core import ValidationError
from pytest_lazyfixture import lazy_fixture
from sqlalchemy.exc import ArgumentError, IntegrityError
@@ -640,7 +641,7 @@ def test_create_namespace_with_null_properties(catalog:
SqlCatalog, database_nam
catalog.create_namespace(namespace=database_name, properties={None:
"value"}) # type: ignore
with pytest.raises(IntegrityError):
- catalog.create_namespace(namespace=database_name, properties={"key":
None}) # type: ignore
+ catalog.create_namespace(namespace=database_name, properties={"key":
None})
@pytest.mark.parametrize(
@@ -915,3 +916,39 @@ def test_write_and_evolve(catalog: SqlCatalog,
format_version: int) -> None:
with txn.update_snapshot().fast_append() as snapshot_update:
for data_file in
_dataframe_to_data_files(table_metadata=txn.table_metadata,
df=pa_table_with_column, io=tbl.io):
snapshot_update.append_data_file(data_file)
+
+
[email protected](
+ 'catalog',
+ [
+ lazy_fixture('catalog_memory'),
+ lazy_fixture('catalog_sqlite'),
+ lazy_fixture('catalog_sqlite_without_rowcount'),
+ ],
+)
+def test_table_properties_int_value(catalog: SqlCatalog, table_schema_simple:
Schema, random_identifier: Identifier) -> None:
+ # table properties can be set to int, but still serialized to string
+ database_name, _table_name = random_identifier
+ catalog.create_namespace(database_name)
+ property_with_int = {"property_name": 42}
+ table = catalog.create_table(random_identifier, table_schema_simple,
properties=property_with_int)
+ assert isinstance(table.properties["property_name"], str)
+
+
[email protected](
+ 'catalog',
+ [
+ lazy_fixture('catalog_memory'),
+ lazy_fixture('catalog_sqlite'),
+ lazy_fixture('catalog_sqlite_without_rowcount'),
+ ],
+)
+def test_table_properties_raise_for_none_value(
+ catalog: SqlCatalog, table_schema_simple: Schema, random_identifier:
Identifier
+) -> None:
+ database_name, _table_name = random_identifier
+ catalog.create_namespace(database_name)
+ property_with_none = {"property_name": None}
+ with pytest.raises(ValidationError) as exc_info:
+ _ = catalog.create_table(random_identifier, table_schema_simple,
properties=property_with_none)
+ assert "None type is not a supported value in properties: property_name"
in str(exc_info.value)
diff --git a/tests/integration/test_writes.py b/tests/integration/test_writes.py
index 388e566b..1eeec2b3 100644
--- a/tests/integration/test_writes.py
+++ b/tests/integration/test_writes.py
@@ -28,6 +28,7 @@ import pyarrow.parquet as pq
import pytest
import pytz
from pyarrow.fs import S3FileSystem
+from pydantic_core import ValidationError
from pyspark.sql import SparkSession
from pytest_mock.plugin import MockerFixture
@@ -403,7 +404,7 @@ def test_data_files(spark: SparkSession, session_catalog:
Catalog, arrow_table_w
@pytest.mark.integration
[email protected]("format_version", ["1", "2"])
[email protected]("format_version", [1, 2])
@pytest.mark.parametrize(
"properties, expected_compression_name",
[
@@ -419,7 +420,7 @@ def test_write_parquet_compression_properties(
spark: SparkSession,
session_catalog: Catalog,
arrow_table_with_null: pa.Table,
- format_version: str,
+ format_version: int,
properties: Dict[str, Any],
expected_compression_name: str,
) -> None:
@@ -654,3 +655,37 @@ def test_write_and_evolve(session_catalog: Catalog,
format_version: int) -> None
with txn.update_snapshot().fast_append() as snapshot_update:
for data_file in
_dataframe_to_data_files(table_metadata=txn.table_metadata,
df=pa_table_with_column, io=tbl.io):
snapshot_update.append_data_file(data_file)
+
+
[email protected]
[email protected]("format_version", [1, 2])
+def test_table_properties_int_value(
+ session_catalog: Catalog,
+ arrow_table_with_null: pa.Table,
+ format_version: int,
+) -> None:
+ # table properties can be set to int, but still serialized to string
+ property_with_int = {"property_name": 42}
+ identifier = "default.test_table_properties_int_value"
+
+ tbl = _create_table(
+ session_catalog, identifier, {"format-version": format_version,
**property_with_int}, [arrow_table_with_null]
+ )
+ assert isinstance(tbl.properties["property_name"], str)
+
+
[email protected]
[email protected]("format_version", [1, 2])
+def test_table_properties_raise_for_none_value(
+ session_catalog: Catalog,
+ arrow_table_with_null: pa.Table,
+ format_version: int,
+) -> None:
+ property_with_none = {"property_name": None}
+ identifier = "default.test_table_properties_raise_for_none_value"
+
+ with pytest.raises(ValidationError) as exc_info:
+ _ = _create_table(
+ session_catalog, identifier, {"format-version": format_version,
**property_with_none}, [arrow_table_with_null]
+ )
+ assert "None type is not a supported value in properties: property_name"
in str(exc_info.value)
diff --git a/tests/table/test_init.py b/tests/table/test_init.py
index e6407b60..04efc5f4 100644
--- a/tests/table/test_init.py
+++ b/tests/table/test_init.py
@@ -17,10 +17,11 @@
# pylint:disable=redefined-outer-name
import uuid
from copy import copy
-from typing import Dict
+from typing import Any, Dict
import pyarrow as pa
import pytest
+from pydantic import ValidationError
from sortedcontainers import SortedList
from pyiceberg.catalog.noop import NoopCatalog
@@ -1081,3 +1082,29 @@ def
test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None:
with pytest.raises(ValueError, match=expected):
_check_schema(table_schema_simple, other_schema)
+
+
+def test_table_properties(example_table_metadata_v2: Dict[str, Any]) -> None:
+ # metadata properties are all strings
+ for k, v in example_table_metadata_v2["properties"].items():
+ assert isinstance(k, str)
+ assert isinstance(v, str)
+ metadata = TableMetadataV2(**example_table_metadata_v2)
+ for k, v in metadata.properties.items():
+ assert isinstance(k, str)
+ assert isinstance(v, str)
+
+ # property can be set to int, but still serialized as string
+ property_with_int = {"property_name": 42}
+ new_example_table_metadata_v2 = {**example_table_metadata_v2,
"properties": property_with_int}
+ assert
isinstance(new_example_table_metadata_v2["properties"]["property_name"], int)
+ new_metadata = TableMetadataV2(**new_example_table_metadata_v2)
+ assert isinstance(new_metadata.properties["property_name"], str)
+
+
+def test_table_properties_raise_for_none_value(example_table_metadata_v2:
Dict[str, Any]) -> None:
+ property_with_none = {"property_name": None}
+ example_table_metadata_v2 = {**example_table_metadata_v2, "properties":
property_with_none}
+ with pytest.raises(ValidationError) as exc_info:
+ TableMetadataV2(**example_table_metadata_v2)
+ assert "None type is not a supported value in properties: property_name"
in str(exc_info.value)