This is an automated email from the ASF dual-hosted git repository.

sungwy pushed a commit to branch pyiceberg-0.7.x
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
commit 7c9e0636b030a8e70146c357cca564018af085bc
Author: Andre Luis Anastacio <[email protected]>
AuthorDate: Fri Aug 9 05:33:26 2024 -0300

    Fix `overwrite` when filtering complete files (#1023)
---
 pyiceberg/table/__init__.py                  |  4 ++-
 tests/integration/test_writes/test_writes.py | 41 +++++++++++++++++++++++++++-
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index 6ea78679..98bb8855 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -622,7 +622,9 @@ class Transaction:
                 filtered_df = df.filter(preserve_row_filter)
 
                 # Only rewrite if there are records being deleted
-                if len(df) != len(filtered_df):
+                if len(filtered_df) == 0:
+                    replaced_files.append((original_file.file, []))
+                elif len(df) != len(filtered_df):
                     replaced_files.append((
                         original_file.file,
                         list(
diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
index 8ea51f4b..f9ff7d10 100644
--- a/tests/integration/test_writes/test_writes.py
+++ b/tests/integration/test_writes/test_writes.py
@@ -38,12 +38,13 @@
 from pyiceberg.catalog.hive import HiveCatalog
 from pyiceberg.catalog.rest import RestCatalog
 from pyiceberg.catalog.sql import SqlCatalog
 from pyiceberg.exceptions import NoSuchTableError
+from pyiceberg.expressions import In
 from pyiceberg.io.pyarrow import _dataframe_to_data_files
 from pyiceberg.partitioning import PartitionField, PartitionSpec
 from pyiceberg.schema import Schema
 from pyiceberg.table import TableProperties
 from pyiceberg.transforms import IdentityTransform
-from pyiceberg.types import IntegerType, LongType, NestedField
+from pyiceberg.types import IntegerType, LongType, NestedField, StringType
 from utils import _create_table
 
@@ -1293,3 +1294,41 @@ def test_rest_catalog_with_empty_catalog_name_append_data(session_catalog: Catal
     )
     tbl = _create_table(test_catalog, identifier, data=[])
     tbl.append(arrow_table_with_null)
+
+
+@pytest.mark.integration
+def test_table_v1_with_null_nested_namespace(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.lower.table_v1_with_null_nested_namespace"
+    tbl = _create_table(session_catalog, identifier, {"format-version": "1"}, [arrow_table_with_null])
+    assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"
+    # TODO: Add session_catalog.table_exists check here when we integrate a REST catalog image
+    # that supports HEAD request on table endpoint
+
+    # assert session_catalog.table_exists(identifier)
+
+    # We expect no error here
+    session_catalog.drop_table(identifier)
+
+
+@pytest.mark.integration
+def test_overwrite_all_data_with_filter(session_catalog: Catalog) -> None:
+    schema = Schema(
+        NestedField(1, "id", StringType(), required=True),
+        NestedField(2, "name", StringType(), required=False),
+        identifier_field_ids=[1],
+    )
+
+    data = pa.Table.from_pylist(
+        [
+            {"id": "1", "name": "Amsterdam"},
+            {"id": "2", "name": "San Francisco"},
+            {"id": "3", "name": "Drachten"},
+        ],
+        schema=schema.as_arrow(),
+    )
+
+    identifier = "default.test_overwrite_all_data_with_filter"
+    tbl = _create_table(session_catalog, identifier, data=[data], schema=schema)
+    tbl.overwrite(data, In("id", ["1", "2", "3"]))
+
+    assert len(tbl.scan().to_arrow()) == 3
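
For readers following the change above: the new branch covers the case where the preserving row filter leaves zero rows in a data file, i.e. the overwrite/delete filter covers the complete file; such a file is now recorded with an empty list of replacement files instead of going through the rewrite path. Below is a minimal sketch of the user-facing scenario, mirroring test_overwrite_all_data_with_filter. It is illustrative only: the catalog name "default", the table "default.cities", and the assumption that the table already holds exactly the rows with ids "1", "2" and "3" are not part of this commit.

import pyarrow as pa

from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import In

# Assumed environment (illustrative): a catalog configured under the name
# "default" and an existing table "default.cities" with string columns
# "id" and "name", currently containing only rows with id "1", "2" or "3".
catalog = load_catalog("default")
tbl = catalog.load_table("default.cities")

data = pa.Table.from_pylist(
    [
        {"id": "1", "name": "Amsterdam"},
        {"id": "2", "name": "San Francisco"},
        {"id": "3", "name": "Drachten"},
    ],
    schema=tbl.schema().as_arrow(),
)

# The overwrite filter matches every row already in the table, so each existing
# data file filters down to zero surviving rows; with this fix those files are
# recorded for deletion with no replacement files rather than being rewritten.
tbl.overwrite(data, In("id", ["1", "2", "3"]))

assert len(tbl.scan().to_arrow()) == 3

The changed code sits in Transaction's rewrite path, which overwrite reaches when deleting the rows matched by the overwrite filter before appending the new data.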
