bkietz commented on code in PR #43256:
URL: https://github.com/apache/arrow/pull/43256#discussion_r1722197161
##########
cpp/src/arrow/compute/expression.cc:
##########
@@ -1148,6 +1150,71 @@ Result<Expression> Canonicalize(Expression expr,
compute::ExecContext* exec_cont
namespace {
+/// Context for expression simplification.
+struct SimplificationContext {
+ /// Mapping from `is_in` calls to simplified value sets.
+ ///
+ /// `is_in` predicates with large value sets can be expensive to bind, so we
+ /// accumulate simplifications in the context and defer binding until all
+ /// inequalities have been processed.
+ std::unordered_map<const Expression::Call*, std::shared_ptr<Array>>
is_in_value_sets;
+};
+
+/// Preprocess an `is_in` predicate value set for simplification.
+/// \return the value set sorted with duplicate and null values removed
+Result<std::shared_ptr<Array>> PrepareIsInValueSet(std::shared_ptr<Array>
value_set) {
+ ARROW_ASSIGN_OR_RAISE(value_set, Unique(std::move(value_set)));
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Array> sort_indices,
+ SortIndices(value_set, SortOptions({}, NullPlacement::AtEnd)));
+ ARROW_ASSIGN_OR_RAISE(
+ value_set,
+ Take(*value_set, *sort_indices, TakeOptions(/*bounds_check=*/false)));
+ if (value_set->length() > 0 && value_set->IsNull(value_set->length() - 1)) {
+ // If the last one is null we know it's the only
+ // one because of the call to `Unique` above.
+ value_set = value_set->Slice(0, value_set->length() - 1);
+ }
+ return value_set;
+}
+
+/// Get the value set for an `is_in` predicate for simplification.
+///
+/// If the `is_in` predicate is being simplified, then we look up the value set
+/// from the simplification context. Otherwise, we return a prepared value set,
+/// memoized in the `is_in_call` kernel state.
+///
+/// \pre `is_in_call` is a call to the `is_in` function
+/// \return the value set to be simplified, guaranteed to be sorted with no
+/// duplicate or null values and cast to the given type
+Result<std::shared_ptr<Array>> GetIsInValueSetForSimplification(
+ const Expression::Call* is_in_call,
+ const TypeHolder& type,
+ SimplificationContext& context) {
+ DCHECK_EQ(is_in_call->function_name, "is_in");
+ std::shared_ptr<Array>& value_set = context.is_in_value_sets[is_in_call];
+ if (!value_set) {
+ // Simplification for `is_in` requires that the value set is preprocessed.
+ // We store the prepared value set in the kernel state to avoid repeated
+ // preprocessing across calls to `SimplifyWithGuarantee`.
+ auto state = checked_pointer_cast<internal::SetLookupStateBase>(
+ is_in_call->kernel_state);
+ if (!state->sorted_and_unique_value_set) {
+ auto options =
checked_pointer_cast<SetLookupOptions>(is_in_call->options);
+ auto unprepared_value_set = options->value_set.make_array();
+ ARROW_ASSIGN_OR_RAISE(state->sorted_and_unique_value_set,
+ PrepareIsInValueSet(unprepared_value_set));
+ }
+ if (!state->sorted_and_unique_value_set->type()->Equals(*type)) {
+ ARROW_ASSIGN_OR_RAISE(
+ state->sorted_and_unique_value_set,
Review Comment:
It's true that `SetLookupOptions::value_set_is_sorted_and_unique` is a bit
odd since it's not used directly by the kernel, which is why I'd prefer that if
we add the optimization that we wait and do so through ArrayStatistics instead
since that doesn't need weird modification of the options for just one function
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]