This is an automated email from the ASF dual-hosted git repository.
sungwy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 57043885 Fix: accept empty arrays in struct field lookup (#997)
57043885 is described below
commit 570438850b73af3e72d78563280990ee462e7994
Author: Georg Grob <[email protected]>
AuthorDate: Tue Aug 6 17:12:08 2024 +0100
Fix: accept empty arrays in struct field lookup (#997)
* Fix: accept empty arrays in struct field lookup
Fixes #992.
Empty `pyarrow` arrays are considered falsy, which caused a `ResolveError`
for required fields during scan operations.
* Integration test: empty scan on non-nullable ordered string column
This covers the issue reported in #992 where empty scan queries yielded a
`ResolveError`. Specifically, this occurred under the following conditions:
- a table with an ordered, non-nullable string column
- a scan filtering for a non-existing value _within_ the range of the
values in that particular column
* Lint (add missing newline)
---
dev/provision.py | 10 ++++++++++
pyiceberg/io/pyarrow.py | 2 +-
tests/integration/test_reads.py | 8 ++++++++
3 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/dev/provision.py b/dev/provision.py
index 6c8fe366..53360748 100644
--- a/dev/provision.py
+++ b/dev/provision.py
@@ -389,3 +389,13 @@ for catalog_name, catalog in catalogs.items():
VALUES (4)
"""
)
+
+ spark.sql(
+ f"""
+          CREATE OR REPLACE TABLE {catalog_name}.default.test_empty_scan_ordered_str (id string NOT NULL)
+ USING iceberg
+ TBLPROPERTIES ('format-version'='2')
+ """
+ )
+    spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id")
+    spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'")
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 4175f5fe..aefe86ac 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1589,7 +1589,7 @@ class ArrowAccessor(PartnerAccessor[pa.Array]):
return partner
     def field_partner(self, partner_struct: Optional[pa.Array], field_id: int, _: str) -> Optional[pa.Array]:
- if partner_struct:
+ if partner_struct is not None:
# use the field name from the file schema
try:
name = self.file_schema.find_field(field_id).name
diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index 078abf40..078ec163 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -663,3 +663,11 @@ def test_hive_locking_with_retry(session_catalog_hive: HiveCatalog) -> None:
table.transaction().set_properties(lock="xxx").commit_transaction()
assert table.properties.get("lock") == "xxx"
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
+def test_empty_scan_ordered_str(catalog: Catalog) -> None:
+ table_empty_scan_ordered_str =
catalog.load_table("default.test_empty_scan_ordered_str")
+    arrow_table = table_empty_scan_ordered_str.scan(EqualTo("id", "b")).to_arrow()
+ assert len(arrow_table) == 0