HonahX commented on code in PR #955:
URL: https://github.com/apache/iceberg-python/pull/955#discussion_r1690505567


##########
pyiceberg/io/pyarrow.py:
##########
@@ -576,11 +572,11 @@ def _convert_scalar(value: Any, iceberg_type: 
IcebergType) -> pa.scalar:
 
 
 class _ConvertToArrowExpression(BoundBooleanExpressionVisitor[pc.Expression]):
-    def visit_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> 
pc.Expression:
+    def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> pc.Expression:

Review Comment:
   Thanks for spotting this. I think these were missed in the last major 
typing fix of the visitors:
   
   - https://github.com/apache/iceberg/pull/6258#discussion_r1034007197
   - 
https://github.com/apache/iceberg/pull/6308/files#diff-49144c27eab7e03926b0310aa8b513dbeb9c8d2a0d33bacf8dedbd88b4680aac
   
   Given that this is in a concrete class, how about we just use `Any` to 
indicate that we accept general terms?
   ```suggestion
       def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> 
pc.Expression:
   ```
   
   This is also consistent with the types used in the other methods.



##########
pyiceberg/io/pyarrow.py:
##########
@@ -576,11 +572,11 @@ def _convert_scalar(value: Any, iceberg_type: 
IcebergType) -> pa.scalar:
 
 
 class _ConvertToArrowExpression(BoundBooleanExpressionVisitor[pc.Expression]):
-    def visit_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> 
pc.Expression:
+    def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> pc.Expression:
         pyarrow_literals = pa.array(literals, 
type=schema_to_pyarrow(term.ref().field.field_type))
         return pc.field(term.ref().field.name).isin(pyarrow_literals)
 
-    def visit_not_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) 
-> pc.Expression:
+    def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> 
pc.Expression:

Review Comment:
   ```suggestion
       def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> 
pc.Expression:
   ```



##########
pyiceberg/io/pyarrow.py:
##########
@@ -638,10 +634,152 @@ def visit_or(self, left_result: pc.Expression, 
right_result: pc.Expression) -> p
         return left_result | right_result
 
 
+class _NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[None]):
+    # BoundTerms which have either is_null or is_not_null appearing at least 
once in the boolean expr.
+    is_null_or_not_bound_terms: set[BoundTerm[Any]]
+    # The remaining BoundTerms appearing in the boolean expr.
+    null_unmentioned_bound_terms: set[BoundTerm[Any]]
+    # BoundTerms which have either is_nan or is_not_nan appearing at least 
once in the boolean expr.
+    is_nan_or_not_bound_terms: set[BoundTerm[Any]]
+    # The remaining BoundTerms appearing in the boolean expr.
+    nan_unmentioned_bound_terms: set[BoundTerm[Any]]
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.is_null_or_not_bound_terms = set()
+        self.null_unmentioned_bound_terms = set()
+        self.is_nan_or_not_bound_terms = set()
+        self.nan_unmentioned_bound_terms = set()
+
+    def _handle_explicit_is_null_or_not(self, term: BoundTerm[Any]) -> None:
+        """Handle the predicate case where either is_null or is_not_null is 
included."""
+        if term in self.null_unmentioned_bound_terms:
+            self.null_unmentioned_bound_terms.remove(term)
+        self.is_null_or_not_bound_terms.add(term)
+
+    def _handle_null_unmentioned(self, term: BoundTerm[Any]) -> None:
+        """Handle the predicate case where neither is_null or is_not_null is 
included."""
+        if term not in self.is_null_or_not_bound_terms:
+            self.null_unmentioned_bound_terms.add(term)
+
+    def _handle_explicit_is_nan_or_not(self, term: BoundTerm[Any]) -> None:
+        """Handle the predicate case where either is_nan or is_not_nan is 
included."""
+        if term in self.nan_unmentioned_bound_terms:
+            self.nan_unmentioned_bound_terms.remove(term)
+        self.is_nan_or_not_bound_terms.add(term)
+
+    def _handle_nan_unmentioned(self, term: BoundTerm[Any]) -> None:
+        """Handle the predicate case where neither is_nan or is_not_nan is 
included."""
+        if term not in self.is_nan_or_not_bound_terms:
+            self.nan_unmentioned_bound_terms.add(term)
+
+    def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> None:

Review Comment:
   ```suggestion
       def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> None:
   ```



##########
pyiceberg/io/pyarrow.py:
##########
@@ -638,10 +634,152 @@ def visit_or(self, left_result: pc.Expression, 
right_result: pc.Expression) -> p
         return left_result | right_result
 
 
+class _NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[None]):
+    # BoundTerms which have either is_null or is_not_null appearing at least 
once in the boolean expr.
+    is_null_or_not_bound_terms: set[BoundTerm[Any]]
+    # The remaining BoundTerms appearing in the boolean expr.
+    null_unmentioned_bound_terms: set[BoundTerm[Any]]
+    # BoundTerms which have either is_nan or is_not_nan appearing at least 
once in the boolean expr.
+    is_nan_or_not_bound_terms: set[BoundTerm[Any]]
+    # The remaining BoundTerms appearing in the boolean expr.
+    nan_unmentioned_bound_terms: set[BoundTerm[Any]]
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.is_null_or_not_bound_terms = set()
+        self.null_unmentioned_bound_terms = set()
+        self.is_nan_or_not_bound_terms = set()
+        self.nan_unmentioned_bound_terms = set()
+
+    def _handle_explicit_is_null_or_not(self, term: BoundTerm[Any]) -> None:
+        """Handle the predicate case where either is_null or is_not_null is 
included."""
+        if term in self.null_unmentioned_bound_terms:
+            self.null_unmentioned_bound_terms.remove(term)
+        self.is_null_or_not_bound_terms.add(term)
+
+    def _handle_null_unmentioned(self, term: BoundTerm[Any]) -> None:
+        """Handle the predicate case where neither is_null or is_not_null is 
included."""
+        if term not in self.is_null_or_not_bound_terms:
+            self.null_unmentioned_bound_terms.add(term)
+
+    def _handle_explicit_is_nan_or_not(self, term: BoundTerm[Any]) -> None:
+        """Handle the predicate case where either is_nan or is_not_nan is 
included."""
+        if term in self.nan_unmentioned_bound_terms:
+            self.nan_unmentioned_bound_terms.remove(term)
+        self.is_nan_or_not_bound_terms.add(term)
+
+    def _handle_nan_unmentioned(self, term: BoundTerm[Any]) -> None:
+        """Handle the predicate case where neither is_nan or is_not_nan is 
included."""
+        if term not in self.is_nan_or_not_bound_terms:
+            self.nan_unmentioned_bound_terms.add(term)
+
+    def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> None:
+        self._handle_null_unmentioned(term)
+        self._handle_nan_unmentioned(term)
+
+    def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> None:

Review Comment:
   ```suggestion
       def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> None:
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to