This is an automated email from the ASF dual-hosted git repository.

sungwy pushed a commit to branch pyiceberg-0.7.x
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
commit d1d6168886b5099528ca6f0d8fcd1f9ca5c1bb9d
Author: Georg Grob <[email protected]>
AuthorDate: Tue Aug 6 17:12:08 2024 +0100

    Fix: accept empty arrays in struct field lookup (#997)

    * Fix: accept empty arrays in struct field lookup

    Fixes #992. Empty `pyarrow` arrays are considered falsy, which caused
    a `ResolveError` for required fields during scan operations.

    * Integration test: empty scan on non-nullable ordered string column

    This covers the issue reported in #992 where empty scan queries
    yielded a `ResolveError`. Specifically, this occurred under the
    following conditions:
    - a table with an ordered, non-nullable string column
    - a scan filtering for a non-existing value _within_ the range of
      the values in that particular column

    * Lint (add missing newline)
---
 dev/provision.py                | 10 ++++++++++
 pyiceberg/io/pyarrow.py         |  2 +-
 tests/integration/test_reads.py |  8 ++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/dev/provision.py b/dev/provision.py
index 6c8fe366..53360748 100644
--- a/dev/provision.py
+++ b/dev/provision.py
@@ -389,3 +389,13 @@ for catalog_name, catalog in catalogs.items():
             VALUES (4)
             """
         )
+
+    spark.sql(
+        f"""
+        CREATE OR REPLACE TABLE {catalog_name}.default.test_empty_scan_ordered_str (id string NOT NULL)
+        USING iceberg
+        TBLPROPERTIES ('format-version'='2')
+        """
+    )
+    spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id")
+    spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'")
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index f3b85eb4..f8e54081 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1590,7 +1590,7 @@ class ArrowAccessor(PartnerAccessor[pa.Array]):
         return partner

     def field_partner(self, partner_struct: Optional[pa.Array], field_id: int, _: str) -> Optional[pa.Array]:
-        if partner_struct:
+        if partner_struct is not None:
            # use the field name from the file schema
            try:
                name = self.file_schema.find_field(field_id).name
diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index 078abf40..078ec163 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -663,3 +663,11 @@ def test_hive_locking_with_retry(session_catalog_hive: HiveCatalog) -> None:
         table.transaction().set_properties(lock="xxx").commit_transaction()

     assert table.properties.get("lock") == "xxx"
+
+
[email protected]
[email protected]("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
+def test_empty_scan_ordered_str(catalog: Catalog) -> None:
+    table_empty_scan_ordered_str = catalog.load_table("default.test_empty_scan_ordered_str")
+    arrow_table = table_empty_scan_ordered_str.scan(EqualTo("id", "b")).to_arrow()
+    assert len(arrow_table) == 0
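
A minimal sketch (not part of the commit) of why the one-character-class change in `field_partner` matters. Per the commit message, empty `pyarrow` arrays are falsy, so a plain truthiness check cannot distinguish an absent partner array from a present-but-empty one. The `lookup` helper below is hypothetical and only stands in for the patched condition; it assumes `pyarrow` is installed.

    import pyarrow as pa

    # An empty array is a real, present value -- but it is falsy,
    # because bool() falls back to len(), which is 0.
    empty = pa.array([], type=pa.string())
    assert len(empty) == 0
    assert not empty            # falsy: this is what broke `if partner_struct:`
    assert empty is not None    # yet the partner array does exist

    # Hypothetical stand-in for the patched condition in field_partner:
    def lookup(partner_struct):
        if partner_struct is not None:   # the fixed check
            return "resolve field"       # empty arrays still reach the lookup
        return "missing partner"

    assert lookup(empty) == "resolve field"    # old `if partner_struct:` returned "missing partner" here
    assert lookup(None) == "missing partner"

With the old truthiness check, an empty scan result made the required field look absent, which is what surfaced as the `ResolveError` in #992.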
