(iceberg-python) branch main updated: Use a balanced tree instead of unbalanced one (#1830)

fokko Mon, 31 Mar 2025 05:25:00 -0700

This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git



The following commit(s) were added to refs/heads/main by this push:
     new bae62dfe Use a balanced tree instead of unbalanced one (#1830)
bae62dfe is described below

commit bae62dfee6bbd92d9e633e26053a506817c29dbb
Author: koenvo <[email protected]>
AuthorDate: Mon Mar 31 14:24:48 2025 +0200

    Use a balanced tree instead of unbalanced one (#1830)
    
    **Use a balanced tree instead of an unbalanced one to prevent recursion
    error in `create_match_filter`**
    
    <!-- Closes #1776 -->
    
    ## Rationale for this change
    
    In the `create_match_filter` function, the previous implementation used
    `functools.reduce(operator.or_, filters)` to combine expressions. This
    approach constructed a right-heavy, unbalanced tree, which could lead to
    a `RecursionError` when dealing with a large number of expressions
    (e.g., over 1,000).
    
    To address this, we've introduced the `_build_balanced_tree` function.
    This utility constructs a balanced binary tree of expressions, reducing
    the maximum depth to O(log n) and thereby preventing potential recursion
    errors. This makes expression construction more stable and scalable,
    especially when working with large datasets.
    
    ```python
    
    # Past behavior
    
    Or(*[A, B, C, D]) = Or(A, Or(B, Or(C, D))
    
    # New behavior
    Or(*[A, B, C, D]) = Or(Or(A, B), Or(C, D))
    
    ```
    
    ## Are these changes tested?
    
    Yes, existing tests cover the functionality of `Or`. Additional testing
    was done with large expression sets (e.g., 10,000 items) to ensure that
    balanced tree construction avoids recursion errors.
    
    ## Are there any user-facing changes?
    
    No, there are no user-facing changes. This is an internal implementation
    improvement that does not affect the public API.
    
    Closes #1759
    Closes #1785
    
    <!-- In the case of user-facing changes, please add the changelog label.
    -->
---
 pyiceberg/expressions/__init__.py     | 47 +++++++++++++++++++++++++++++++---
 pyiceberg/table/upsert_util.py        |  8 +++++-
 tests/expressions/test_expressions.py |  4 +--
 tests/expressions/test_visitors.py    | 48 +++++++++++++++++------------------
 tests/io/test_pyarrow_visitor.py      |  2 +-
 5 files changed, 78 insertions(+), 31 deletions(-)

diff --git a/pyiceberg/expressions/__init__.py 
b/pyiceberg/expressions/__init__.py
index 8b006a28..2adf898f 100644
--- a/pyiceberg/expressions/__init__.py
+++ b/pyiceberg/expressions/__init__.py
@@ -18,11 +18,13 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from functools import cached_property, reduce
+from functools import cached_property
 from typing import (
     Any,
+    Callable,
     Generic,
     Iterable,
+    Sequence,
     Set,
     Tuple,
     Type,
@@ -79,6 +81,45 @@ class BooleanExpression(ABC):
         return Or(self, other)
 
 
+def _build_balanced_tree(
+    operator_: Callable[[BooleanExpression, BooleanExpression], 
BooleanExpression], items: Sequence[BooleanExpression]
+) -> BooleanExpression:
+    """
+    Recursively constructs a balanced binary tree of BooleanExpressions using 
the provided binary operator.
+
+    This function is a safer and more scalable alternative to:
+        reduce(operator_, items)
+
+    Using `reduce` creates a deeply nested, unbalanced tree (e.g., 
operator_(a, operator_(b, operator_(c, ...)))),
+    which grows linearly with the number of items. This can lead to 
RecursionError exceptions in Python
+    when the number of expressions is large (e.g., >1000).
+
+    In contrast, this function builds a balanced binary tree with logarithmic 
depth (O(log n)),
+    helping avoid recursion issues and ensuring that expression trees remain 
stable, predictable,
+    and safe to traverse — especially in tools like PyIceberg that operate on 
large logical trees.
+
+    Parameters:
+        operator_ (Callable): A binary operator function (e.g., 
pyiceberg.expressions.Or, And) that takes two
+            BooleanExpressions and returns a combined BooleanExpression.
+        items (Sequence[BooleanExpression]): A sequence of BooleanExpression 
objects to combine.
+
+    Returns:
+        BooleanExpression: The balanced combination of all input 
BooleanExpressions.
+
+    Raises:
+        ValueError: If the input sequence is empty.
+    """
+    if not items:
+        raise ValueError("No expressions to combine")
+    if len(items) == 1:
+        return items[0]
+    mid = len(items) // 2
+
+    left = _build_balanced_tree(operator_, items[:mid])
+    right = _build_balanced_tree(operator_, items[mid:])
+    return operator_(left, right)
+
+
 class Term(Generic[L], ABC):
     """A simple expression that evaluates to a value."""
 
@@ -214,7 +255,7 @@ class And(BooleanExpression):
 
     def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: 
BooleanExpression) -> BooleanExpression:  # type: ignore
         if rest:
-            return reduce(And, (left, right, *rest))
+            return _build_balanced_tree(And, (left, right, *rest))
         if left is AlwaysFalse() or right is AlwaysFalse():
             return AlwaysFalse()
         elif left is AlwaysTrue():
@@ -257,7 +298,7 @@ class Or(BooleanExpression):
 
     def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: 
BooleanExpression) -> BooleanExpression:  # type: ignore
         if rest:
-            return reduce(Or, (left, right, *rest))
+            return _build_balanced_tree(Or, (left, right, *rest))
         if left is AlwaysTrue() or right is AlwaysTrue():
             return AlwaysTrue()
         elif left is AlwaysFalse():
diff --git a/pyiceberg/table/upsert_util.py b/pyiceberg/table/upsert_util.py
index d2bd48bc..e67f6c02 100644
--- a/pyiceberg/table/upsert_util.py
+++ b/pyiceberg/table/upsert_util.py
@@ -26,6 +26,7 @@ from pyiceberg.expressions import (
     BooleanExpression,
     EqualTo,
     In,
+    Or,
 )
 
 
@@ -39,7 +40,12 @@ def create_match_filter(df: pyarrow_table, join_cols: 
list[str]) -> BooleanExpre
             functools.reduce(operator.and_, [EqualTo(col, row[col]) for col in 
join_cols]) for row in unique_keys.to_pylist()
         ]
 
-        return AlwaysFalse() if len(filters) == 0 else 
functools.reduce(operator.or_, filters)
+        if len(filters) == 0:
+            return AlwaysFalse()
+        elif len(filters) == 1:
+            return filters[0]
+        else:
+            return Or(*filters)
 
 
 def has_duplicate_rows(df: pyarrow_table, join_cols: list[str]) -> bool:
diff --git a/tests/expressions/test_expressions.py 
b/tests/expressions/test_expressions.py
index 12d9ff95..858a9ff8 100644
--- a/tests/expressions/test_expressions.py
+++ b/tests/expressions/test_expressions.py
@@ -591,11 +591,11 @@ def test_negate(lhs: BooleanExpression, rhs: 
BooleanExpression) -> None:
     [
         (
             And(ExpressionA(), ExpressionB(), ExpressionA()),
-            And(And(ExpressionA(), ExpressionB()), ExpressionA()),
+            And(ExpressionA(), And(ExpressionB(), ExpressionA())),
         ),
         (
             Or(ExpressionA(), ExpressionB(), ExpressionA()),
-            Or(Or(ExpressionA(), ExpressionB()), ExpressionA()),
+            Or(ExpressionA(), Or(ExpressionB(), ExpressionA())),
         ),
         (Not(Not(ExpressionA())), ExpressionA()),
     ],
diff --git a/tests/expressions/test_visitors.py 
b/tests/expressions/test_visitors.py
index 94bfcf07..586ba9f5 100644
--- a/tests/expressions/test_visitors.py
+++ b/tests/expressions/test_visitors.py
@@ -230,14 +230,14 @@ def test_boolean_expression_visitor() -> None:
         "NOT",
         "OR",
         "EQUALTO",
-        "OR",
         "NOTEQUALTO",
         "OR",
+        "OR",
         "EQUALTO",
         "NOT",
-        "AND",
         "NOTEQUALTO",
         "AND",
+        "AND",
     ]
 
 
@@ -335,14 +335,14 @@ def 
test_always_false_or_always_true_expression_binding(table_schema_simple: Sch
                 ),
             ),
             And(
-                And(
-                    BoundIn(
-                        BoundReference(
-                            field=NestedField(field_id=1, name="foo", 
field_type=StringType(), required=False),
-                            accessor=Accessor(position=0, inner=None),
-                        ),
-                        {literal("bar"), literal("baz")},
+                BoundIn(
+                    BoundReference(
+                        field=NestedField(field_id=1, name="foo", 
field_type=StringType(), required=False),
+                        accessor=Accessor(position=0, inner=None),
                     ),
+                    {literal("bar"), literal("baz")},
+                ),
+                And(
                     BoundEqualTo[int](
                         BoundReference(
                             field=NestedField(field_id=2, name="bar", 
field_type=IntegerType(), required=True),
@@ -350,13 +350,13 @@ def 
test_always_false_or_always_true_expression_binding(table_schema_simple: Sch
                         ),
                         literal(1),
                     ),
-                ),
-                BoundEqualTo(
-                    BoundReference(
-                        field=NestedField(field_id=1, name="foo", 
field_type=StringType(), required=False),
-                        accessor=Accessor(position=0, inner=None),
+                    BoundEqualTo(
+                        BoundReference(
+                            field=NestedField(field_id=1, name="foo", 
field_type=StringType(), required=False),
+                            accessor=Accessor(position=0, inner=None),
+                        ),
+                        literal("baz"),
                     ),
-                    literal("baz"),
                 ),
             ),
         ),
@@ -408,28 +408,28 @@ def test_and_expression_binding(
                 ),
             ),
             Or(
+                BoundIn(
+                    BoundReference(
+                        field=NestedField(field_id=1, name="foo", 
field_type=StringType(), required=False),
+                        accessor=Accessor(position=0, inner=None),
+                    ),
+                    {literal("bar"), literal("baz")},
+                ),
                 Or(
                     BoundIn(
                         BoundReference(
                             field=NestedField(field_id=1, name="foo", 
field_type=StringType(), required=False),
                             accessor=Accessor(position=0, inner=None),
                         ),
-                        {literal("bar"), literal("baz")},
+                        {literal("bar")},
                     ),
                     BoundIn(
                         BoundReference(
                             field=NestedField(field_id=1, name="foo", 
field_type=StringType(), required=False),
                             accessor=Accessor(position=0, inner=None),
                         ),
-                        {literal("bar")},
-                    ),
-                ),
-                BoundIn(
-                    BoundReference(
-                        field=NestedField(field_id=1, name="foo", 
field_type=StringType(), required=False),
-                        accessor=Accessor(position=0, inner=None),
+                        {literal("baz")},
                     ),
-                    {literal("baz")},
                 ),
             ),
         ),
diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py
index 9f5aff3f..9d5772d0 100644
--- a/tests/io/test_pyarrow_visitor.py
+++ b/tests/io/test_pyarrow_visitor.py
@@ -836,5 +836,5 @@ def test_expression_to_complementary_pyarrow(
     # Notice an isNan predicate on a str column is automatically converted to 
always false and removed from Or and thus will not appear in the pc.expr.
     assert (
         repr(result)
-        == """<pyarrow.compute.Expression (((invert((((((string_field == 
"hello") and (float_field > 100)) or (is_nan(float_field) and (double_field == 
0))) or (float_field > 100)) and invert(is_null(double_field, 
{nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or 
is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>"""
+        == """<pyarrow.compute.Expression (((invert(((((string_field == 
"hello") and (float_field > 100)) or ((is_nan(float_field) and (double_field == 
0)) or (float_field > 100))) and invert(is_null(double_field, 
{nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or 
is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>"""
     )

(iceberg-python) branch main updated: Use a balanced tree instead of unbalanced one (#1830)

Reply via email to