This is an automated email from the ASF dual-hosted git repository. fokko pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push: new effb8cb6 Hive: update hive storage descriptor after commit schema change (#2036) effb8cb6 is described below commit effb8cb6fac1a89744f694953d214790db641f1f Author: frankliee <frankz...@tencent.com> AuthorDate: Thu Jul 3 17:04:46 2025 +0800 Hive: update hive storage descriptor after commit schema change (#2036) <!-- Thanks for opening a pull request! --> <!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual GitHub issue id. --> <!-- Closes #${GITHUB_ISSUE_ID} --> # Rationale for this change Like the Iceberg Java implementation, PyIceberg should also update the Hive storage descriptor when it commits new table metadata, so that schema changes are reflected in the Hive metastore; see: https://github.com/apache/iceberg/blob/b504f9c51c6c0e0a5c0c5ff53f295e69b67d8e59/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java#L170 # Are these changes tested? Yes, a new test was added in tests/integration/test_writes/test_writes.py # Are there any user-facing changes? No <!-- In the case of user-facing changes, please add the changelog label. 
--> --- dev/hive/core-site.xml | 5 +++++ pyiceberg/catalog/hive.py | 6 ++++++ tests/integration/test_writes/test_writes.py | 24 ++++++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/dev/hive/core-site.xml b/dev/hive/core-site.xml index b77332b8..f5a9473b 100644 --- a/dev/hive/core-site.xml +++ b/dev/hive/core-site.xml @@ -50,4 +50,9 @@ <name>fs.s3a.path.style.access</name> <value>true</value> </property> + <property> + <name>hive.metastore.disallow.incompatible.col.type.changes</name> + <value>false</value> + </property> + </configuration> diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 09437dd1..cc9cd028 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -561,6 +561,12 @@ class HiveCatalog(MetastoreCatalog): previous_metadata_location=current_table.metadata_location, metadata_properties=updated_staged_table.properties, ) + # Update hive's schema and properties + hive_table.sd = _construct_hive_storage_descriptor( + updated_staged_table.schema(), + updated_staged_table.location(), + property_as_bool(updated_staged_table.properties, HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT), + ) open_client.alter_table_with_environment_context( dbname=database_name, tbl_name=table_name, diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index b66601f6..30a09686 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -1148,6 +1148,30 @@ def test_hive_catalog_storage_descriptor( assert spark.sql("SELECT * FROM hive.default.test_storage_descriptor").count() == 3 +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_hive_catalog_storage_descriptor_has_changed( + session_catalog_hive: HiveCatalog, + pa_schema: pa.Schema, + arrow_table_with_null: pa.Table, + spark: SparkSession, + format_version: int, +) -> None: + tbl = _create_table( + session_catalog_hive, 
"default.test_storage_descriptor", {"format-version": format_version}, [arrow_table_with_null] + ) + + with tbl.transaction() as tx: + with tx.update_schema() as schema: + schema.update_column("string_long", doc="this is string_long") + schema.update_column("binary", doc="this is binary") + + with session_catalog_hive._client as open_client: + hive_table = session_catalog_hive._get_hive_table(open_client, "default", "test_storage_descriptor") + assert "this is string_long" in str(hive_table.sd) + assert "this is binary" in str(hive_table.sd) + + @pytest.mark.integration @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_sanitize_character_partitioned(catalog: Catalog) -> None: