bkietz commented on code in PR #43256:
URL: https://github.com/apache/arrow/pull/43256#discussion_r1721995631


##########
cpp/src/arrow/compute/expression.cc:
##########
@@ -1148,6 +1150,71 @@ Result<Expression> Canonicalize(Expression expr, 
compute::ExecContext* exec_cont
 
 namespace {
 
+/// Context for expression simplification.
+struct SimplificationContext {
+  /// Mapping from `is_in` calls to simplified value sets.
+  ///
+  /// `is_in` predicates with large value sets can be expensive to bind, so we
+  /// accumulate simplifications in the context and defer binding until all
+  /// inequalities have been processed.
+  std::unordered_map<const Expression::Call*, std::shared_ptr<Array>> 
is_in_value_sets;
+};
+
+/// Preprocess an `is_in` predicate value set for simplification.
+/// \return the value set sorted with duplicate and null values removed
+Result<std::shared_ptr<Array>> PrepareIsInValueSet(std::shared_ptr<Array> 
value_set) {
+  ARROW_ASSIGN_OR_RAISE(value_set, Unique(std::move(value_set)));
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<Array> sort_indices,
+      SortIndices(value_set, SortOptions({}, NullPlacement::AtEnd)));
+  ARROW_ASSIGN_OR_RAISE(
+      value_set,
+      Take(*value_set, *sort_indices, TakeOptions(/*bounds_check=*/false)));
+  if (value_set->length() > 0 && value_set->IsNull(value_set->length() - 1)) {
+    // If the last one is null we know it's the only
+    // one because of the call to `Unique` above.
+    value_set = value_set->Slice(0, value_set->length() - 1);
+  }
+  return value_set;
+}
+
+/// Get the value set for an `is_in` predicate for simplification.
+///
+/// If the `is_in` predicate is being simplified, then we look up the value set
+/// from the simplification context. Otherwise, we return a prepared value set,
+/// memoized in the `is_in_call` kernel state.
+///
+/// \pre `is_in_call` is a call to the `is_in` function
+/// \return the value set to be simplified, guaranteed to be sorted with no
+///         duplicate or null values and cast to the given type
+Result<std::shared_ptr<Array>> GetIsInValueSetForSimplification(
+    const Expression::Call* is_in_call,
+    const TypeHolder& type,
+    SimplificationContext& context) {
+  DCHECK_EQ(is_in_call->function_name, "is_in");
+  std::shared_ptr<Array>& value_set = context.is_in_value_sets[is_in_call];
+  if (!value_set) {
+    // Simplification for `is_in` requires that the value set is preprocessed.
+    // We store the prepared value set in the kernel state to avoid repeated
+    // preprocessing across calls to `SimplifyWithGuarantee`.
+    auto state = checked_pointer_cast<internal::SetLookupStateBase>(
+        is_in_call->kernel_state);
+    if (!state->sorted_and_unique_value_set) {
+      auto options = 
checked_pointer_cast<SetLookupOptions>(is_in_call->options);
+      auto unprepared_value_set = options->value_set.make_array();
+      ARROW_ASSIGN_OR_RAISE(state->sorted_and_unique_value_set,
+                            PrepareIsInValueSet(unprepared_value_set));
+    }
+    if (!state->sorted_and_unique_value_set->type()->Equals(*type)) {
+      ARROW_ASSIGN_OR_RAISE(
+          state->sorted_and_unique_value_set,

Review Comment:
   This is a race condition; a kernel state may be shared across multiple 
threads its Expression was bound in the main thread and then used in a filter 
on multiple worker threads. In general it is not safe to mutate kernel state in 
any way unless you are constructing it as part of a new `Expression::Call` or 
otherwise know that only a single thread accesses this kernel state.
   
   Since this is a property which we expect to be a pure function of 
`value_set`, it would also be **possible** to do this with a memoizing accessor 
like `Result<Datum> SetLookupState::SortedAndUniqueValueSet() const;` (would 
need atomics).
   
   However, I don't see how this adds value over just sorting and unique-ing 
all value sets as part of `SetLookupState::Init()`. To avoid paying for sorting 
and unique-ing when the arrays are already sorted and uniqued, `bool 
SetLookupOptions::value_set_is_sorted_and_unique` could be added (but it'd be 
better if we could add `optional<bool> is_sorted` to 
[ArrayStatistics](https://github.com/apache/arrow/pull/43705))



##########
cpp/src/arrow/compute/expression_test.cc:
##########
@@ -1616,6 +1616,70 @@ TEST(Expression, 
SimplifyWithComparisonAndNullableCaveat) {
           true_unless_null(field_ref("i32"))));  // not satisfiable, will drop 
row group
 }
 
+TEST(Expression, SimplifyIsIn) {
+  auto is_in = [](Expression field, std::shared_ptr<DataType> value_set_type,
+                  std::string json_array) {
+    SetLookupOptions options{ArrayFromJSON(value_set_type, json_array),
+                             SetLookupOptions::MATCH};
+    return call("is_in", {field}, options);
+  };
+
+  Simplify{is_in(field_ref("i32"), int32(), "[]")}
+      .WithGuarantee(greater(field_ref("i32"), literal(2)))
+      .Expect(false);
+
+  Simplify{is_in(field_ref("i32"), int32(), "[null]")}
+      .WithGuarantee(greater(field_ref("i32"), literal(2)))
+      .Expect(false);

Review Comment:
   I think this is incorrect; the is_in call is simplified to false but if 
null_matching_behavior=MATCH then `i32 IS IN [null]` should produce a `true` in 
every slot where `i32` was null.



##########
cpp/src/arrow/compute/expression_test.cc:
##########
@@ -1616,6 +1616,70 @@ TEST(Expression, 
SimplifyWithComparisonAndNullableCaveat) {
           true_unless_null(field_ref("i32"))));  // not satisfiable, will drop 
row group
 }
 
+TEST(Expression, SimplifyIsIn) {
+  auto is_in = [](Expression field, std::shared_ptr<DataType> value_set_type,
+                  std::string json_array) {
+    SetLookupOptions options{ArrayFromJSON(value_set_type, json_array),
+                             SetLookupOptions::MATCH};
+    return call("is_in", {field}, options);
+  };
+
+  Simplify{is_in(field_ref("i32"), int32(), "[]")}
+      .WithGuarantee(greater(field_ref("i32"), literal(2)))
+      .Expect(false);
+
+  Simplify{is_in(field_ref("i32"), int32(), "[null]")}
+      .WithGuarantee(greater(field_ref("i32"), literal(2)))
+      .Expect(false);

Review Comment:
   I think this is subtle enough to benefit from testing evaluation against 
real data: for some input data (maybe random, just filter by the guarantee to 
produce an array which satisfies the guarantee), evaluate the original 
expression and also the simplified expression then assert that the two results 
are identical.



##########
cpp/src/arrow/compute/expression.cc:
##########
@@ -1242,8 +1309,168 @@ struct Inequality {
                             /*insert_implicit_casts=*/false, &exec_context);
   }
 
+  /// Simplify an `is_in` value set against an inequality guarantee.
+  ///
+  /// Simplifying an `is_in` predicate involves filtering out any values from
+  /// the value set that cannot possibly be found given the guarantee. For
+  /// example, if we have the predicate 'x is_in [1, 2, 3, 4]' and the 
guarantee
+  /// 'x > 2', then the simplified predicate 'x is_in [3, 4]' is equivalent.
+  /// This can be done efficiently if the value set is sorted and unique by
+  /// binary searching the inequality gound and slicing the value set
+  /// accordingly.
+  ///
+  /// \pre `guarantee` is non-nullable
+  /// \pre `guarantee.bound` is a scalar
+  /// \pre `guarantee.bound.type()->id() == value_set->type_id()`
+  /// \return a simplified value set, or a bool if the simplification of the 
value set
+  ///         means the whole is_in expr can become a boolean literal.
+  template <typename ArrowType>
+  static Result<std::variant<std::shared_ptr<Array>, bool>> 
SimplifyIsInValueSet(
+      const Inequality& guarantee, std::shared_ptr<Array> value_set) {
+    using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+
+    DCHECK(guarantee.bound.is_scalar());
+
+    if (value_set->length() == 0) return false;
+
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar_bound,
+                          guarantee.bound.scalar()->CastTo(value_set->type()));
+    auto bound = internal::UnboxScalar<ArrowType>::Unbox(*scalar_bound);
+    auto compare = [&bound, &value_set](size_t i) -> Comparison::type {
+      DCHECK(value_set->IsValid(i));
+      auto value = checked_pointer_cast<ArrayType>(value_set)->GetView(i);
+      return value == bound ? Comparison::EQUAL
+                            : value < bound ? Comparison::LESS
+                                            : Comparison::GREATER;
+    };
+
+    size_t lo = 0;
+    size_t hi = value_set->length();
+    while (lo + 1 < hi) {
+      size_t mid = (lo + hi) / 2;
+      Comparison::type cmp = compare(mid);
+      if (cmp & Comparison::LESS_EQUAL) {
+        lo = mid;
+      } else {
+        hi = mid;
+      }
+    }
+
+    Comparison::type cmp = compare(lo);
+    size_t pivot = lo + (cmp == Comparison::LESS ? 1 : 0);
+    bool found = cmp == Comparison::EQUAL;
+
+    switch (guarantee.cmp) {
+      case Comparison::EQUAL:
+        return found;
+      case Comparison::LESS:
+        value_set = value_set->Slice(0, pivot);
+        break;
+      case Comparison::LESS_EQUAL:
+        value_set = value_set->Slice(0, pivot + (found ? 1 : 0));

Review Comment:
   These will always slice null out of the value set, which is not correct for 
all null handling behaviors



##########
cpp/src/arrow/compute/expression_test.cc:
##########
@@ -1616,6 +1616,70 @@ TEST(Expression, 
SimplifyWithComparisonAndNullableCaveat) {
           true_unless_null(field_ref("i32"))));  // not satisfiable, will drop 
row group
 }
 
+TEST(Expression, SimplifyIsIn) {
+  auto is_in = [](Expression field, std::shared_ptr<DataType> value_set_type,
+                  std::string json_array) {
+    SetLookupOptions options{ArrayFromJSON(value_set_type, json_array),
+                             SetLookupOptions::MATCH};

Review Comment:
   The PR currently simplifies every `is_in` call but you only test for 
`null_matching_behavior=MATCH`. Please either restrict the simplification to 
`MATCH` or test for all null matching behaviors



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to