Re: [PR] fix: upsert with null values in join columns [iceberg-python]

via GitHub Fri, 17 Oct 2025 23:48:04 -0700


kevinjqliu commented on code in PR #2429:
URL: https://github.com/apache/iceberg-python/pull/2429#discussion_r2373312577



##########
tests/table/test_upsert.py:
##########
@@ -440,6 +441,70 @@ def test_create_match_filter_single_condition() -> None:
     )
 
 
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        pytest.param(
+            [{"x": 1.0}, {"x": 2.0}, {"x": None}, {"x": 4.0}, {"x": 
float("nan")}],
+            Or(
+                left=IsNull(term=Reference(name="x")),
+                right=Or(
+                    left=IsNaN(term=Reference(name="x")),
+                    right=In(Reference(name="x"), {DoubleLiteral(1.0), 
DoubleLiteral(2.0), DoubleLiteral(4.0)}),
+                ),
+            ),
+            id="single-column",
+        ),
+        pytest.param(
+            [
+                {"x": 1.0, "y": 9.0},
+                {"x": 2.0, "y": None},
+                {"x": None, "y": 7.0},
+                {"x": 4.0, "y": float("nan")},
+                {"x": float("nan"), "y": 0.0},
+            ],
+            Or(
+                left=Or(
+                    left=And(
+                        left=EqualTo(term=Reference(name="x"), 
literal=DoubleLiteral(1.0)),
+                        right=EqualTo(term=Reference(name="y"), 
literal=DoubleLiteral(9.0)),
+                    ),
+                    right=And(
+                        left=EqualTo(term=Reference(name="x"), 
literal=DoubleLiteral(2.0)),
+                        right=IsNull(term=Reference(name="y")),
+                    ),
+                ),
+                right=Or(
+                    left=And(
+                        left=IsNull(term=Reference(name="x")),
+                        right=EqualTo(term=Reference(name="y"), 
literal=DoubleLiteral(7.0)),
+                    ),
+                    right=Or(
+                        left=And(
+                            left=EqualTo(term=Reference(name="x"), 
literal=DoubleLiteral(4.0)),
+                            right=IsNaN(term=Reference(name="y")),
+                        ),
+                        right=And(
+                            left=IsNaN(term=Reference(name="x")),
+                            right=EqualTo(term=Reference(name="y"), 
literal=DoubleLiteral(0.0)),
+                        ),
+                    ),
+                ),
+            ),
+            id="multi-column",
+        ),
+    ],
+)
+def test_create_match_filter_with_nulls(data: list[dict[str, Any]], expected: 
BooleanExpression) -> None:

Review Comment:
   nit: What is this test checking? The parameterized form makes it hard to 
read.



##########
pyiceberg/table/upsert_util.py:
##########
@@ -14,38 +14,61 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import functools
-import operator
+from math import isnan
+from typing import Any
 
 import pyarrow as pa
 from pyarrow import Table as pyarrow_table
 from pyarrow import compute as pc
 
 from pyiceberg.expressions import (
     AlwaysFalse,
+    And,
     BooleanExpression,
     EqualTo,
     In,
+    IsNaN,
+    IsNull,
     Or,
 )
 
 
 def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> 
BooleanExpression:
     unique_keys = df.select(join_cols).group_by(join_cols).aggregate([])
+    filters: list[BooleanExpression] = []
 
     if len(join_cols) == 1:
-        return In(join_cols[0], unique_keys[0].to_pylist())
+        column = join_cols[0]

Review Comment:
   Is there a way we can simplify the logic here?
   
   I think the primary issue is that the `In` operator cannot handle null 
values, is that right?



##########
tests/table/test_upsert.py:
##########
@@ -710,6 +775,46 @@ def test_upsert_with_nulls(catalog: Catalog) -> None:
         schema=schema,
     )
 
+    # upsert table with null value
+    data_with_null = pa.Table.from_pylist(

Review Comment:
   nit: Let's put this in its own test function so it's clear which case we're 
testing.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] fix: upsert with null values in join columns [iceberg-python]

Reply via email to