felipecrv commented on code in PR #43256:
URL: https://github.com/apache/arrow/pull/43256#discussion_r1685773084
##########
cpp/src/arrow/compute/expression.cc:
##########
@@ -1242,33 +1242,104 @@ struct Inequality {
/*insert_implicit_casts=*/false, &exec_context);
}
+ /// Simplify an is_in predicate against this inequality as a guarantee.
+ Result<Expression> SimplifyIsIn(Expression expr) {
+ const auto& guarantee = *this;
+ auto call = expr.call();
+ auto options = checked_pointer_cast<SetLookupOptions>(call->options);
+
+ auto value_set = options->value_set.make_array();
+ if (!value_set) return expr;
+ if (value_set->length() == 0) return literal(false);
+
+ if (!options->sorted_and_deduped) return expr;
+
+ // For now, only simplify when the guarantee is non-nullable.
+ if (guarantee.nullable) return expr;
+
+ auto compare = [&value_set, &guarantee](size_t i) ->
Result<Comparison::type> {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
value_set->GetScalar(i));
+ // Nulls compare greater than any non-null value.
+ if (!scalar->is_valid) {
+ return Comparison::GREATER;
+ }
+ ARROW_ASSIGN_OR_RAISE(Comparison::type cmp,
+ Comparison::Execute(scalar, guarantee.bound));
+ return cmp;
+ };
+
+ size_t lo = 0;
+ size_t hi = value_set->length();
+ while (lo + 1 < hi) {
+ size_t mid = (lo + hi) / 2;
+ ARROW_ASSIGN_OR_RAISE(Comparison::type cmp, compare(mid));
+ if (cmp & Comparison::LESS_EQUAL) {
+ lo = mid;
+ } else {
+ hi = mid;
+ }
+ }
+
+ ARROW_ASSIGN_OR_RAISE(Comparison::type cmp, compare(lo));
+ size_t pivot = lo + (cmp == Comparison::LESS ? 1 : 0);
+ bool found = cmp == Comparison::EQUAL;
+
+ std::shared_ptr<Array> simplified_value_set;
+ if (guarantee.cmp == Comparison::EQUAL) {
+ return literal(found);
+ }
+ if (guarantee.cmp == Comparison::LESS) {
+ simplified_value_set = value_set->Slice(0, pivot);
+ } else if (guarantee.cmp == Comparison::LESS_EQUAL) {
+ simplified_value_set = value_set->Slice(0, pivot + (found ? 1 : 0));
+ } else if (guarantee.cmp == Comparison::GREATER) {
+ simplified_value_set = value_set->Slice(pivot + (found ? 1 : 0));
+ } else if (guarantee.cmp == Comparison::GREATER_EQUAL) {
+ simplified_value_set = value_set->Slice(pivot);
+ } else {
+ // We should never reach here.
+ return expr;
+ }
Review Comment:
I believe you're assuming that `value_set` doesn't contain a null as well.
That needs to be (1) checked and (2) documented in this list of pre-conditions.
This is why I think it's probably better to sort and dedup `value_set`
instead of explaining and trusting that callers will do the right thing. When
nulls are involved "sorted" could mean they are in the beginning or in the end.
It's hard to define "sorted" as a pre-condition without getting into a lot of
details regarding ordering.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]