(iceberg-python) branch main updated: Hive: update hive storage descriptor after commit schema change (#2036)

fokko Thu, 03 Jul 2025 02:04:56 -0700

This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git



The following commit(s) were added to refs/heads/main by this push:
     new effb8cb6 Hive: update hive storage descriptor after commit schema change (#2036)
effb8cb6 is described below

commit effb8cb6fac1a89744f694953d214790db641f1f
Author: frankliee <frankz...@tencent.com>
AuthorDate: Thu Jul 3 17:04:46 2025 +0800

    Hive: update hive storage descriptor after commit schema change (#2036)
    
    <!--
    Thanks for opening a pull request!
    -->
    
    <!-- In the case this PR will resolve an issue, please replace
    ${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
    <!-- Closes #${GITHUB_ISSUE_ID} -->
    
    # Rationale for this change
    Like the Iceberg Java implementation, PyIceberg should also update the
    Hive storage descriptor after committing table metadata.
    
    See:
    
    https://github.com/apache/iceberg/blob/b504f9c51c6c0e0a5c0c5ff53f295e69b67d8e59/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java#L170
    
    # Are these changes tested?
    Yes, a new integration test was added.
    
    # Are there any user-facing changes?
    No
    
    
    <!-- In the case of user-facing changes, please add the changelog label.
    -->
---
 dev/hive/core-site.xml                       |  5 +++++
 pyiceberg/catalog/hive.py                    |  6 ++++++
 tests/integration/test_writes/test_writes.py | 24 ++++++++++++++++++++++++
 3 files changed, 35 insertions(+)

diff --git a/dev/hive/core-site.xml b/dev/hive/core-site.xml
index b77332b8..f5a9473b 100644
--- a/dev/hive/core-site.xml
+++ b/dev/hive/core-site.xml
@@ -50,4 +50,9 @@
       <name>fs.s3a.path.style.access</name>
       <value>true</value>
     </property>
+    <property>
+      <name>hive.metastore.disallow.incompatible.col.type.changes</name>
+      <value>false</value>
+    </property>
+
 </configuration>
diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py
index 09437dd1..cc9cd028 100644
--- a/pyiceberg/catalog/hive.py
+++ b/pyiceberg/catalog/hive.py
@@ -561,6 +561,12 @@ class HiveCatalog(MetastoreCatalog):
                         
previous_metadata_location=current_table.metadata_location,
                         metadata_properties=updated_staged_table.properties,
                     )
+                    # Update hive's schema and properties
+                    hive_table.sd = _construct_hive_storage_descriptor(
+                        updated_staged_table.schema(),
+                        updated_staged_table.location(),
+                        property_as_bool(updated_staged_table.properties, 
HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT),
+                    )
                     open_client.alter_table_with_environment_context(
                         dbname=database_name,
                         tbl_name=table_name,
diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
index b66601f6..30a09686 100644
--- a/tests/integration/test_writes/test_writes.py
+++ b/tests/integration/test_writes/test_writes.py
@@ -1148,6 +1148,30 @@ def test_hive_catalog_storage_descriptor(
     assert spark.sql("SELECT * FROM 
hive.default.test_storage_descriptor").count() == 3
 
 
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_hive_catalog_storage_descriptor_has_changed(
+    session_catalog_hive: HiveCatalog,
+    pa_schema: pa.Schema,
+    arrow_table_with_null: pa.Table,
+    spark: SparkSession,
+    format_version: int,
+) -> None:
+    tbl = _create_table(
+        session_catalog_hive, "default.test_storage_descriptor", 
{"format-version": format_version}, [arrow_table_with_null]
+    )
+
+    with tbl.transaction() as tx:
+        with tx.update_schema() as schema:
+            schema.update_column("string_long", doc="this is string_long")
+            schema.update_column("binary", doc="this is binary")
+
+    with session_catalog_hive._client as open_client:
+        hive_table = session_catalog_hive._get_hive_table(open_client, 
"default", "test_storage_descriptor")
+        assert "this is string_long" in str(hive_table.sd)
+        assert "this is binary" in str(hive_table.sd)
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("catalog", 
[pytest.lazy_fixture("session_catalog_hive"), 
pytest.lazy_fixture("session_catalog")])
 def test_sanitize_character_partitioned(catalog: Catalog) -> None:

(iceberg-python) branch main updated: Hive: update hive storage descriptor after commit schema change (#2036)

Reply via email to