This is an automated email from the ASF dual-hosted git repository.

sungwy pushed a commit to branch pyiceberg-0.7.x
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
commit 7c9e0636b030a8e70146c357cca564018af085bc
Author: Andre Luis Anastacio <[email protected]>
AuthorDate: Fri Aug 9 05:33:26 2024 -0300

    Fix `overwrite` when filtering complete files (#1023)
---
 pyiceberg/table/__init__.py                  |  4 ++-
 tests/integration/test_writes/test_writes.py | 41 +++++++++++++++++++++++++++-
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index 6ea78679..98bb8855 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -622,7 +622,9 @@ class Transaction:
                 filtered_df = df.filter(preserve_row_filter)
 
                 # Only rewrite if there are records being deleted
-                if len(df) != len(filtered_df):
+                if len(filtered_df) == 0:
+                    replaced_files.append((original_file.file, []))
+                elif len(df) != len(filtered_df):
                     replaced_files.append((
                         original_file.file,
                         list(
diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
index 8ea51f4b..f9ff7d10 100644
--- a/tests/integration/test_writes/test_writes.py
+++ b/tests/integration/test_writes/test_writes.py
@@ -38,12 +38,13 @@
 from pyiceberg.catalog.hive import HiveCatalog
 from pyiceberg.catalog.rest import RestCatalog
 from pyiceberg.catalog.sql import SqlCatalog
 from pyiceberg.exceptions import NoSuchTableError
+from pyiceberg.expressions import In
 from pyiceberg.io.pyarrow import _dataframe_to_data_files
 from pyiceberg.partitioning import PartitionField, PartitionSpec
 from pyiceberg.schema import Schema
 from pyiceberg.table import TableProperties
 from pyiceberg.transforms import IdentityTransform
-from pyiceberg.types import IntegerType, LongType, NestedField
+from pyiceberg.types import IntegerType, LongType, NestedField, StringType
 from utils import _create_table
 
@@ -1293,3 +1294,41 @@ def test_rest_catalog_with_empty_catalog_name_append_data(session_catalog: Catal
     )
     tbl = _create_table(test_catalog, identifier, data=[])
     tbl.append(arrow_table_with_null)
+
+
+@pytest.mark.integration
+def test_table_v1_with_null_nested_namespace(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.lower.table_v1_with_null_nested_namespace"
+    tbl = _create_table(session_catalog, identifier, {"format-version": "1"}, [arrow_table_with_null])
+    assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"
+    # TODO: Add session_catalog.table_exists check here when we integrate a REST catalog image
+    # that supports HEAD request on table endpoint
+
+    # assert session_catalog.table_exists(identifier)
+
+    # We expect no error here
+    session_catalog.drop_table(identifier)
+
+
+@pytest.mark.integration
+def test_overwrite_all_data_with_filter(session_catalog: Catalog) -> None:
+    schema = Schema(
+        NestedField(1, "id", StringType(), required=True),
+        NestedField(2, "name", StringType(), required=False),
+        identifier_field_ids=[1],
+    )
+
+    data = pa.Table.from_pylist(
+        [
+            {"id": "1", "name": "Amsterdam"},
+            {"id": "2", "name": "San Francisco"},
+            {"id": "3", "name": "Drachten"},
+        ],
+        schema=schema.as_arrow(),
+    )
+
+    identifier = "default.test_overwrite_all_data_with_filter"
+    tbl = _create_table(session_catalog, identifier, data=[data], schema=schema)
+    tbl.overwrite(data, In("id", ["1", "2", "3"]))
+
+    assert len(tbl.scan().to_arrow()) == 3
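
For readers following the change above: the new branch covers the case where the preserving row filter leaves zero rows in a data file, i.e. the overwrite/delete filter covers the complete file; such a file is now recorded with an empty list of replacement files instead of going through the rewrite path. Below is a minimal sketch of the user-facing scenario, mirroring test_overwrite_all_data_with_filter. It is illustrative only: the catalog name "default", the table "default.cities", and the assumption that the table already holds exactly the rows with ids "1", "2" and "3" are not part of this commit.

import pyarrow as pa

from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import In

# Assumed environment (illustrative): a catalog configured under the name
# "default" and an existing table "default.cities" with string columns
# "id" and "name", currently containing only rows with id "1", "2" or "3".
catalog = load_catalog("default")
tbl = catalog.load_table("default.cities")

data = pa.Table.from_pylist(
    [
        {"id": "1", "name": "Amsterdam"},
        {"id": "2", "name": "San Francisco"},
        {"id": "3", "name": "Drachten"},
    ],
    schema=tbl.schema().as_arrow(),
)

# The overwrite filter matches every row already in the table, so each existing
# data file filters down to zero surviving rows; with this fix those files are
# recorded for deletion with no replacement files rather than being rewritten.
tbl.overwrite(data, In("id", ["1", "2", "3"]))

assert len(tbl.scan().to_arrow()) == 3

The changed code sits in Transaction's rewrite path, which overwrite reaches when deleting the rows matched by the overwrite filter before appending the new data.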
