bkietz commented on code in PR #36739:
URL: https://github.com/apache/arrow/pull/36739#discussion_r1330188729
##########
cpp/src/arrow/compute/kernels/scalar_set_lookup.cc:
##########
@@ -274,10 +280,12 @@ struct IndexInVisitor {
const auto& state = checked_cast<const
SetLookupState<NullType>&>(*ctx->state());
Review Comment:
Would you mind modifying this visit overload to:
```diff
- Status Visit(const DataType& type) {
- DCHECK_EQ(type.id(), Type::NA);
+ Status Visit(const DataType& type) { DCHECK(false) << "IndexIn " << type;
}
+ Status Visit(const NullType&) {
```
##########
cpp/src/arrow/compute/kernels/scalar_set_lookup.cc:
##########
@@ -379,49 +388,82 @@ Status ExecIndexIn(KernelContext* ctx, const ExecSpan&
batch, ExecResult* out) {
return IndexInVisitor(ctx, batch[0].array,
out->array_span_mutable()).Execute();
}
-// ----------------------------------------------------------------------
-
// IsIn writes the results into a preallocated boolean data bitmap
struct IsInVisitor {
KernelContext* ctx;
const ArraySpan& data;
ArraySpan* out;
+ uint8_t* out_boolean_bitmap;
+ uint8_t* out_null_bitmap;
IsInVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out)
- : ctx(ctx), data(data), out(out) {}
+ : ctx(ctx),
+ data(data),
+ out(out),
+ out_boolean_bitmap(out->buffers[1].data),
+ out_null_bitmap(out->buffers[0].data) {}
Status Visit(const DataType& type) {
Review Comment:
Same here
##########
cpp/src/arrow/compute/api_scalar.h:
##########
@@ -268,19 +268,45 @@ class ARROW_EXPORT ExtractRegexOptions : public
FunctionOptions {
/// Options for IsIn and IndexIn functions
class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
public:
- explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
+ enum NullMatchingBehavior { MATCH, SKIP, EMIT_NULL, INCONCLUSIVE };
+
+ explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH);
SetLookupOptions();
+
+ // DEPRECATED(will be removed after removing of skip_nulls)
+ explicit SetLookupOptions(Datum value_set, bool skip_nulls);
+
static constexpr char const kTypeName[] = "SetLookupOptions";
/// The set of values to look up input values into.
Datum value_set;
+
+ /// How to match null values.
+ ///
+ /// Match, any null in `value_set` is successfully matched in
Review Comment:
```suggestion
/// MATCH, any null in `value_set` is successfully matched in
```
##########
cpp/src/arrow/compute/api_scalar.cc:
##########
@@ -344,7 +367,8 @@ static auto kRoundToMultipleOptionsType =
GetFunctionOptionsType<RoundToMultiple
DataMember("round_mode", &RoundToMultipleOptions::round_mode));
static auto kSetLookupOptionsType = GetFunctionOptionsType<SetLookupOptions>(
DataMember("value_set", &SetLookupOptions::value_set),
- DataMember("skip_nulls", &SetLookupOptions::skip_nulls));
+ DataMember("skip_nulls", &SetLookupOptions::skip_nulls),
+ DataMember("null_matching_behavior",
&SetLookupOptions::null_matching_behavior));
Review Comment:
I think it'd be worthwhile to change this to only report/accept assignment
to null_matching_behavior:
```suggestion
CoercedDataMember("null_matching_behavior",
&SetLookupOptions::null_matching_behavior,
&SetLookupOptions::GetNullMatchingBehavior));
```
(This only affects how we deal with SetLookupOptions in reflected contexts
like stringification; we'll still be able to assign to `skip_nulls`.) This can
be enabled by putting the following into `reflection_internal.h`:
```c++
template <typename C, typename T>
struct CoercedDataMemberProperty {
using Class = C;
using Type = T;
constexpr const Type& get(const Class& obj) const { return
obj.*get_coerced_(); }
void set(Class* obj, Type value) const { (*obj).*ptr_for_set_ =
std::move(value); }
constexpr std::string_view name() const { return name_; }
std::string_view name_;
Type Class::*ptr_for_set_;
Type (Class::*get_coerced_)() const;
};
template <typename Class, typename Type>
constexpr CoercedDataMemberProperty<Class, Type>
CoercedDataMember(std::string_view name,
Type
Class::*ptr,
Type
(Class::*get)()
const) {
return {name, ptr, get};
}
```
##########
cpp/src/arrow/compute/api_scalar.h:
##########
@@ -268,19 +268,45 @@ class ARROW_EXPORT ExtractRegexOptions : public
FunctionOptions {
/// Options for IsIn and IndexIn functions
class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
public:
- explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
+ enum NullMatchingBehavior { MATCH, SKIP, EMIT_NULL, INCONCLUSIVE };
+
+ explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH);
SetLookupOptions();
+
+ // DEPRECATED(will be removed after removing of skip_nulls)
+ explicit SetLookupOptions(Datum value_set, bool skip_nulls);
+
static constexpr char const kTypeName[] = "SetLookupOptions";
/// The set of values to look up input values into.
Datum value_set;
+
+ /// How to match null values.
+ ///
+ /// Match, any null in `value_set` is successfully matched in
+ /// the input.
+ /// SKIP, any null in `value_set` is ignored and nulls in the input
+ /// produce null (IndexIn) or false (IsIn) values in the output.
+ /// EMIT_NULL, any null in `value_set` is ignored and nulls in the
+ /// input produce null (IndexIn and IsIn) values in the output.
+ /// INCONCLUSIVE, null values are regard as unknown values, which is
Review Comment:
```suggestion
/// INCONCLUSIVE, null values are regarded as unknown values, which is
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]