This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 8f2e7878 Fix `overwrite` when filtering complete files (#1023)
8f2e7878 is described below
commit 8f2e78784c89436f48b02a0a23b3a121a3b610c4
Author: Andre Luis Anastacio <[email protected]>
AuthorDate: Fri Aug 9 05:33:26 2024 -0300
Fix `overwrite` when filtering complete files (#1023)
---
pyiceberg/table/__init__.py | 4 +++-
tests/integration/test_writes/test_writes.py | 27 ++++++++++++++++++++++++++-
2 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index 662c43d5..700a8130 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -593,7 +593,9 @@ class Transaction:
filtered_df = df.filter(preserve_row_filter)
# Only rewrite if there are records being deleted
- if len(df) != len(filtered_df):
+ if len(filtered_df) == 0:
+ replaced_files.append((original_file.file, []))
+ elif len(df) != len(filtered_df):
replaced_files.append((
original_file.file,
list(
diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
index 9b4e2f9f..540e9bb3 100644
--- a/tests/integration/test_writes/test_writes.py
+++ b/tests/integration/test_writes/test_writes.py
@@ -38,12 +38,13 @@ from pyiceberg.catalog.hive import HiveCatalog
from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.exceptions import NoSuchTableError
+from pyiceberg.expressions import In
from pyiceberg.io.pyarrow import _dataframe_to_data_files
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table import TableProperties
from pyiceberg.transforms import IdentityTransform
-from pyiceberg.types import IntegerType, LongType, NestedField
+from pyiceberg.types import IntegerType, LongType, NestedField, StringType
from utils import _create_table
@@ -1309,3 +1310,27 @@ def test_table_v1_with_null_nested_namespace(session_catalog: Catalog, arrow_tab
# We expect no error here
session_catalog.drop_table(identifier)
+
+
+@pytest.mark.integration
+def test_overwrite_all_data_with_filter(session_catalog: Catalog) -> None:
+ schema = Schema(
+ NestedField(1, "id", StringType(), required=True),
+ NestedField(2, "name", StringType(), required=False),
+ identifier_field_ids=[1],
+ )
+
+ data = pa.Table.from_pylist(
+ [
+ {"id": "1", "name": "Amsterdam"},
+ {"id": "2", "name": "San Francisco"},
+ {"id": "3", "name": "Drachten"},
+ ],
+ schema=schema.as_arrow(),
+ )
+
+ identifier = "default.test_overwrite_all_data_with_filter"
+ tbl = _create_table(session_catalog, identifier, data=[data], schema=schema)
+ tbl.overwrite(data, In("id", ["1", "2", "3"]))
+
+ assert len(tbl.scan().to_arrow()) == 3