pitrou commented on code in PR #35750:
URL: https://github.com/apache/arrow/pull/35750#discussion_r1229706395
##########
cpp/src/arrow/compute/kernels/vector_selection_test.cc:
##########
@@ -42,99 +42,161 @@ using std::string_view;
namespace compute {
+namespace {
+
+template <typename T>
+Result<std::shared_ptr<Array>> REEncode(const T& array) {
+ ARROW_ASSIGN_OR_RAISE(auto datum, RunEndEncode(array));
+ return datum.make_array();
+}
+
+Result<std::shared_ptr<Array>> REEFromJson(const std::shared_ptr<DataType>&
ree_type,
Review Comment:
Nit: use same casing as existing helpers?
```suggestion
Result<std::shared_ptr<Array>> REEFromJSON(const std::shared_ptr<DataType>&
ree_type,
```
##########
cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc:
##########
@@ -180,14 +181,102 @@ Result<std::shared_ptr<ArrayData>> GetTakeIndicesImpl(
BufferVector{nullptr, out_buffer},
/*null_count=*/0);
}
+template <typename RunEndType>
+Result<std::shared_ptr<ArrayData>> GetTakeIndicesFromREEBitmapImpl(
+ const ArraySpan& filter, FilterOptions::NullSelectionBehavior
null_selection,
+ MemoryPool* memory_pool) {
+ using T = typename RunEndType::c_type;
+ const ArraySpan& filter_values = ::arrow::ree_util::ValuesArray(filter);
+ const int64_t filter_values_offset = filter_values.offset;
+ const uint8_t* filter_is_valid = filter_values.buffers[0].data;
+ const uint8_t* filter_selection = filter_values.buffers[1].data;
+ const bool filter_may_have_nulls = filter_values.MayHaveNulls();
+
+ // BinaryBitBlockCounter is not used here because a REE bitmap, if built
+ // correctly, is not going to have long continuous runs of 0s or 1s in the
+ // values array.
+
+ const ::arrow::ree_util::RunEndEncodedArraySpan<T> filter_span(filter);
+ auto it = filter_span.begin();
+ if (filter_may_have_nulls && null_selection == FilterOptions::EMIT_NULL) {
+ // Most complex case: the filter may have nulls and we don't drop them.
+ // The logic is ternary:
+ // - filter is null: emit null
+ // - filter is valid and true: emit index
+ // - filter is valid and false: don't emit anything
+
+ typename TypeTraits<RunEndType>::BuilderType builder(memory_pool);
+ for (; !it.is_end(filter_span); ++it) {
+ const int64_t position_with_offset = filter_values_offset +
it.index_into_array();
+ const bool is_null = !bit_util::GetBit(filter_is_valid,
position_with_offset);
+ if (is_null) {
+ RETURN_NOT_OK(builder.AppendNulls(it.run_length()));
+ } else {
+ const bool emit_run = bit_util::GetBit(filter_selection,
position_with_offset);
+ if (emit_run) {
+ const int64_t run_end = it.run_end();
+ RETURN_NOT_OK(builder.Reserve(run_end - it.logical_position()));
+ for (int64_t position = it.logical_position(); position < run_end;
position++) {
+ builder.UnsafeAppend(static_cast<T>(position));
+ }
+ }
+ }
+ }
+ std::shared_ptr<ArrayData> result;
+ RETURN_NOT_OK(builder.FinishInternal(&result));
+ return result;
+ }
+
+ // Other cases don't emit nulls and are therefore simpler.
+ TypedBufferBuilder<T> builder(memory_pool);
+
+ if (filter_may_have_nulls) {
+ DCHECK_EQ(null_selection, FilterOptions::DROP);
+ // The filter may have nulls, so we scan the validity bitmap and the filter
+ // data bitmap together.
+ for (; !it.is_end(filter_span); ++it) {
+ const int64_t position_with_offset = filter_values_offset +
it.index_into_array();
+ const bool emit_run = bit_util::GetBit(filter_is_valid,
position_with_offset) &&
+ bit_util::GetBit(filter_selection,
position_with_offset);
+ if (emit_run) {
+ const int64_t run_end = it.run_end();
+ RETURN_NOT_OK(builder.Reserve(run_end - it.logical_position()));
+ for (int64_t position = it.logical_position(); position < run_end;
position++) {
+ builder.UnsafeAppend(static_cast<T>(position));
+ }
+ }
+ }
+ } else {
+ // The filter has no nulls, so we need only look for true values
+ for (; !it.is_end(filter_span); ++it) {
+ const int64_t position_with_offset = filter_values_offset +
it.index_into_array();
+ const bool emit_run = bit_util::GetBit(filter_selection,
position_with_offset);
Review Comment:
By the way, given the definition of REE, the run values should alternate
between true and false. This might help make this loop faster (but might not).
It can be left for a subsequent PR or GH issue.
##########
cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:
##########
@@ -444,196 +518,244 @@ Status PrimitiveFilterExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult
// Optimized binary filter for the case where neither values nor filter have
// nulls
-template <typename Type>
+template <typename ArrowType>
Status BinaryFilterNonNullImpl(KernelContext* ctx, const ArraySpan& values,
const ArraySpan& filter, int64_t output_length,
FilterOptions::NullSelectionBehavior
null_selection,
ArrayData* out) {
- using offset_type = typename Type::offset_type;
- const auto filter_data = filter.buffers[1].data;
+ using offset_type = typename ArrowType::offset_type;
+ const bool is_ree_filter = filter.type->id() == Type::RUN_END_ENCODED;
BINARY_FILTER_SETUP_COMMON();
- RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
- filter_data, filter.offset, filter.length, [&](int64_t position, int64_t
length) {
- // Bulk-append raw data
- const offset_type run_data_bytes =
- (raw_offsets[position + length] - raw_offsets[position]);
- APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
- // Append offsets
- offset_type cur_offset = raw_offsets[position];
- for (int64_t i = 0; i < length; ++i) {
- offset_builder.UnsafeAppend(offset);
- offset += raw_offsets[i + position + 1] - cur_offset;
- cur_offset = raw_offsets[i + position + 1];
- }
- return Status::OK();
- }));
+ auto emit_segment = [&](int64_t position, int64_t length) {
+ // Bulk-append raw data
+ const offset_type run_data_bytes =
+ (raw_offsets[position + length] - raw_offsets[position]);
+ APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
+ // Append offsets
+ for (int64_t i = 0; i < length; ++i) {
+ offset_builder.UnsafeAppend(offset);
+ offset += raw_offsets[i + position + 1] - raw_offsets[i + position + 1];
+ }
+ return Status::OK();
+ };
+ if (is_ree_filter) {
+ Status status;
+ VisitPlainxREEFilterOutputSegments(
+ filter, true, null_selection,
Review Comment:
Also, shouldn't its value be false?
##########
cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:
##########
@@ -444,196 +518,244 @@ Status PrimitiveFilterExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult
// Optimized binary filter for the case where neither values nor filter have
// nulls
-template <typename Type>
+template <typename ArrowType>
Status BinaryFilterNonNullImpl(KernelContext* ctx, const ArraySpan& values,
const ArraySpan& filter, int64_t output_length,
FilterOptions::NullSelectionBehavior
null_selection,
ArrayData* out) {
- using offset_type = typename Type::offset_type;
- const auto filter_data = filter.buffers[1].data;
+ using offset_type = typename ArrowType::offset_type;
+ const bool is_ree_filter = filter.type->id() == Type::RUN_END_ENCODED;
BINARY_FILTER_SETUP_COMMON();
- RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
- filter_data, filter.offset, filter.length, [&](int64_t position, int64_t
length) {
- // Bulk-append raw data
- const offset_type run_data_bytes =
- (raw_offsets[position + length] - raw_offsets[position]);
- APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
- // Append offsets
- offset_type cur_offset = raw_offsets[position];
- for (int64_t i = 0; i < length; ++i) {
- offset_builder.UnsafeAppend(offset);
- offset += raw_offsets[i + position + 1] - cur_offset;
- cur_offset = raw_offsets[i + position + 1];
- }
- return Status::OK();
- }));
+ auto emit_segment = [&](int64_t position, int64_t length) {
+ // Bulk-append raw data
+ const offset_type run_data_bytes =
+ (raw_offsets[position + length] - raw_offsets[position]);
+ APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
+ // Append offsets
+ for (int64_t i = 0; i < length; ++i) {
+ offset_builder.UnsafeAppend(offset);
+ offset += raw_offsets[i + position + 1] - raw_offsets[i + position + 1];
+ }
+ return Status::OK();
+ };
+ if (is_ree_filter) {
+ Status status;
+ VisitPlainxREEFilterOutputSegments(
+ filter, true, null_selection,
+ [&status, emit_segment = std::move(emit_segment)](
+ int64_t position, int64_t segment_length, bool filter_valid) {
+ DCHECK(filter_valid);
+ status = emit_segment(position, segment_length);
+ return status.ok();
+ });
+ RETURN_NOT_OK(std::move(status));
+ } else {
+ const auto filter_data = filter.buffers[1].data;
+ RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
+ filter_data, filter.offset, filter.length, std::move(emit_segment)));
+ }
offset_builder.UnsafeAppend(offset);
out->length = output_length;
RETURN_NOT_OK(offset_builder.Finish(&out->buffers[1]));
return data_builder.Finish(&out->buffers[2]);
}
-template <typename Type>
+template <typename ArrowType>
Status BinaryFilterImpl(KernelContext* ctx, const ArraySpan& values,
const ArraySpan& filter, int64_t output_length,
FilterOptions::NullSelectionBehavior null_selection,
ArrayData* out) {
- using offset_type = typename Type::offset_type;
+ using offset_type = typename ArrowType::offset_type;
+
+ const bool is_ree_filter = filter.type->id() == Type::RUN_END_ENCODED;
- const auto filter_data = filter.buffers[1].data;
- const uint8_t* filter_is_valid = filter.buffers[0].data;
- const int64_t filter_offset = filter.offset;
+ BINARY_FILTER_SETUP_COMMON();
const uint8_t* values_is_valid = values.buffers[0].data;
const int64_t values_offset = values.offset;
+ const int64_t out_offset = out->offset;
uint8_t* out_is_valid = out->buffers[0]->mutable_data();
// Zero bits and then only have to set valid values to true
- bit_util::SetBitsTo(out_is_valid, 0, output_length, false);
-
- // We use 3 block counters for fast scanning of the filter
- //
- // * values_valid_counter: for values null/not-null
- // * filter_valid_counter: for filter null/not-null
- // * filter_counter: for filter true/false
- OptionalBitBlockCounter values_valid_counter(values_is_valid, values_offset,
- values.length);
- OptionalBitBlockCounter filter_valid_counter(filter_is_valid, filter_offset,
- filter.length);
- BitBlockCounter filter_counter(filter_data, filter_offset, filter.length);
-
- BINARY_FILTER_SETUP_COMMON();
+ bit_util::SetBitsTo(out_is_valid, out_offset, output_length, false);
int64_t in_position = 0;
int64_t out_position = 0;
- while (in_position < filter.length) {
- BitBlockCount filter_valid_block = filter_valid_counter.NextWord();
- BitBlockCount values_valid_block = values_valid_counter.NextWord();
- BitBlockCount filter_block = filter_counter.NextWord();
- if (filter_block.NoneSet() && null_selection == FilterOptions::DROP) {
- // For this exceedingly common case in low-selectivity filters we can
- // skip further analysis of the data and move on to the next block.
- in_position += filter_block.length;
- } else if (filter_valid_block.AllSet()) {
- // Simpler path: no filter values are null
- if (filter_block.AllSet()) {
- // Fastest path: filter values are all true and not null
- if (values_valid_block.AllSet()) {
- // The values aren't null either
- bit_util::SetBitsTo(out_is_valid, out_position, filter_block.length,
true);
-
- // Bulk-append raw data
- offset_type block_data_bytes =
- (raw_offsets[in_position + filter_block.length] -
raw_offsets[in_position]);
- APPEND_RAW_DATA(raw_data + raw_offsets[in_position],
block_data_bytes);
- // Append offsets
- for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
- offset_builder.UnsafeAppend(offset);
- offset += raw_offsets[in_position + 1] - raw_offsets[in_position];
- }
- out_position += filter_block.length;
- } else {
- // Some of the values in this block are null
- for (int64_t i = 0; i < filter_block.length;
- ++i, ++in_position, ++out_position) {
- offset_builder.UnsafeAppend(offset);
- if (bit_util::GetBit(values_is_valid, values_offset +
in_position)) {
- bit_util::SetBit(out_is_valid, out_position);
- APPEND_SINGLE_VALUE();
- }
+ if (is_ree_filter) {
+ auto emit_segment = [&](int64_t position, int64_t segment_length, bool
filter_valid) {
+ in_position = position;
+ if (filter_valid) {
+ // Filter values are all true and not null
+ // Some of the values in the block may be null
+ for (int64_t i = 0; i < segment_length; ++i, ++in_position,
++out_position) {
+ offset_builder.UnsafeAppend(offset);
+ if (bit_util::GetBit(values_is_valid, values_offset + in_position)) {
+ bit_util::SetBit(out_is_valid, out_offset + out_position);
+ APPEND_SINGLE_VALUE();
}
}
- } else { // !filter_block.AllSet()
- // Some of the filter values are false, but all not null
- if (values_valid_block.AllSet()) {
- // All the values are not-null, so we can skip null checking for
- // them
- for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
- if (bit_util::GetBit(filter_data, filter_offset + in_position)) {
+ } else {
+ offset_builder.UnsafeAppend(segment_length, offset);
+ out_position += segment_length;
+ }
+ return Status::OK();
+ };
+ Status status;
+ VisitPlainxREEFilterOutputSegments(
+ filter, true, null_selection,
+ [&status, emit_segment = std::move(emit_segment)](
+ int64_t position, int64_t segment_length, bool filter_valid) {
+ status = emit_segment(position, segment_length, filter_valid);
+ return status.ok();
+ });
+ RETURN_NOT_OK(std::move(status));
Review Comment:
The `std::move` is a bit pedantic here IMHO...
##########
cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:
##########
@@ -444,196 +518,244 @@ Status PrimitiveFilterExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult
// Optimized binary filter for the case where neither values nor filter have
// nulls
-template <typename Type>
+template <typename ArrowType>
Status BinaryFilterNonNullImpl(KernelContext* ctx, const ArraySpan& values,
const ArraySpan& filter, int64_t output_length,
FilterOptions::NullSelectionBehavior
null_selection,
ArrayData* out) {
- using offset_type = typename Type::offset_type;
- const auto filter_data = filter.buffers[1].data;
+ using offset_type = typename ArrowType::offset_type;
+ const bool is_ree_filter = filter.type->id() == Type::RUN_END_ENCODED;
BINARY_FILTER_SETUP_COMMON();
- RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
- filter_data, filter.offset, filter.length, [&](int64_t position, int64_t
length) {
- // Bulk-append raw data
- const offset_type run_data_bytes =
- (raw_offsets[position + length] - raw_offsets[position]);
- APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
- // Append offsets
- offset_type cur_offset = raw_offsets[position];
- for (int64_t i = 0; i < length; ++i) {
- offset_builder.UnsafeAppend(offset);
- offset += raw_offsets[i + position + 1] - cur_offset;
- cur_offset = raw_offsets[i + position + 1];
- }
- return Status::OK();
- }));
+ auto emit_segment = [&](int64_t position, int64_t length) {
+ // Bulk-append raw data
+ const offset_type run_data_bytes =
+ (raw_offsets[position + length] - raw_offsets[position]);
+ APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
+ // Append offsets
+ for (int64_t i = 0; i < length; ++i) {
+ offset_builder.UnsafeAppend(offset);
+ offset += raw_offsets[i + position + 1] - raw_offsets[i + position + 1];
+ }
+ return Status::OK();
+ };
+ if (is_ree_filter) {
+ Status status;
+ VisitPlainxREEFilterOutputSegments(
+ filter, true, null_selection,
+ [&status, emit_segment = std::move(emit_segment)](
+ int64_t position, int64_t segment_length, bool filter_valid) {
+ DCHECK(filter_valid);
+ status = emit_segment(position, segment_length);
+ return status.ok();
+ });
+ RETURN_NOT_OK(std::move(status));
+ } else {
+ const auto filter_data = filter.buffers[1].data;
+ RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
+ filter_data, filter.offset, filter.length, std::move(emit_segment)));
+ }
offset_builder.UnsafeAppend(offset);
out->length = output_length;
RETURN_NOT_OK(offset_builder.Finish(&out->buffers[1]));
return data_builder.Finish(&out->buffers[2]);
}
-template <typename Type>
+template <typename ArrowType>
Status BinaryFilterImpl(KernelContext* ctx, const ArraySpan& values,
const ArraySpan& filter, int64_t output_length,
FilterOptions::NullSelectionBehavior null_selection,
ArrayData* out) {
- using offset_type = typename Type::offset_type;
+ using offset_type = typename ArrowType::offset_type;
+
+ const bool is_ree_filter = filter.type->id() == Type::RUN_END_ENCODED;
- const auto filter_data = filter.buffers[1].data;
- const uint8_t* filter_is_valid = filter.buffers[0].data;
- const int64_t filter_offset = filter.offset;
+ BINARY_FILTER_SETUP_COMMON();
const uint8_t* values_is_valid = values.buffers[0].data;
const int64_t values_offset = values.offset;
+ const int64_t out_offset = out->offset;
uint8_t* out_is_valid = out->buffers[0]->mutable_data();
// Zero bits and then only have to set valid values to true
- bit_util::SetBitsTo(out_is_valid, 0, output_length, false);
-
- // We use 3 block counters for fast scanning of the filter
- //
- // * values_valid_counter: for values null/not-null
- // * filter_valid_counter: for filter null/not-null
- // * filter_counter: for filter true/false
- OptionalBitBlockCounter values_valid_counter(values_is_valid, values_offset,
- values.length);
- OptionalBitBlockCounter filter_valid_counter(filter_is_valid, filter_offset,
- filter.length);
- BitBlockCounter filter_counter(filter_data, filter_offset, filter.length);
-
- BINARY_FILTER_SETUP_COMMON();
+ bit_util::SetBitsTo(out_is_valid, out_offset, output_length, false);
int64_t in_position = 0;
int64_t out_position = 0;
- while (in_position < filter.length) {
- BitBlockCount filter_valid_block = filter_valid_counter.NextWord();
- BitBlockCount values_valid_block = values_valid_counter.NextWord();
- BitBlockCount filter_block = filter_counter.NextWord();
- if (filter_block.NoneSet() && null_selection == FilterOptions::DROP) {
- // For this exceedingly common case in low-selectivity filters we can
- // skip further analysis of the data and move on to the next block.
- in_position += filter_block.length;
- } else if (filter_valid_block.AllSet()) {
- // Simpler path: no filter values are null
- if (filter_block.AllSet()) {
- // Fastest path: filter values are all true and not null
- if (values_valid_block.AllSet()) {
- // The values aren't null either
- bit_util::SetBitsTo(out_is_valid, out_position, filter_block.length,
true);
-
- // Bulk-append raw data
- offset_type block_data_bytes =
- (raw_offsets[in_position + filter_block.length] -
raw_offsets[in_position]);
- APPEND_RAW_DATA(raw_data + raw_offsets[in_position],
block_data_bytes);
- // Append offsets
- for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
- offset_builder.UnsafeAppend(offset);
- offset += raw_offsets[in_position + 1] - raw_offsets[in_position];
- }
- out_position += filter_block.length;
- } else {
- // Some of the values in this block are null
- for (int64_t i = 0; i < filter_block.length;
- ++i, ++in_position, ++out_position) {
- offset_builder.UnsafeAppend(offset);
- if (bit_util::GetBit(values_is_valid, values_offset +
in_position)) {
- bit_util::SetBit(out_is_valid, out_position);
- APPEND_SINGLE_VALUE();
- }
+ if (is_ree_filter) {
+ auto emit_segment = [&](int64_t position, int64_t segment_length, bool
filter_valid) {
+ in_position = position;
+ if (filter_valid) {
+ // Filter values are all true and not null
+ // Some of the values in the block may be null
+ for (int64_t i = 0; i < segment_length; ++i, ++in_position,
++out_position) {
+ offset_builder.UnsafeAppend(offset);
+ if (bit_util::GetBit(values_is_valid, values_offset + in_position)) {
+ bit_util::SetBit(out_is_valid, out_offset + out_position);
+ APPEND_SINGLE_VALUE();
}
}
- } else { // !filter_block.AllSet()
- // Some of the filter values are false, but all not null
- if (values_valid_block.AllSet()) {
- // All the values are not-null, so we can skip null checking for
- // them
- for (int64_t i = 0; i < filter_block.length; ++i, ++in_position) {
- if (bit_util::GetBit(filter_data, filter_offset + in_position)) {
+ } else {
+ offset_builder.UnsafeAppend(segment_length, offset);
+ out_position += segment_length;
+ }
+ return Status::OK();
+ };
+ Status status;
+ VisitPlainxREEFilterOutputSegments(
+ filter, true, null_selection,
Review Comment:
Also include parameter name here.
##########
cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc:
##########
@@ -444,196 +518,244 @@ Status PrimitiveFilterExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult
// Optimized binary filter for the case where neither values nor filter have
// nulls
-template <typename Type>
+template <typename ArrowType>
Status BinaryFilterNonNullImpl(KernelContext* ctx, const ArraySpan& values,
const ArraySpan& filter, int64_t output_length,
FilterOptions::NullSelectionBehavior
null_selection,
ArrayData* out) {
- using offset_type = typename Type::offset_type;
- const auto filter_data = filter.buffers[1].data;
+ using offset_type = typename ArrowType::offset_type;
+ const bool is_ree_filter = filter.type->id() == Type::RUN_END_ENCODED;
BINARY_FILTER_SETUP_COMMON();
- RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
- filter_data, filter.offset, filter.length, [&](int64_t position, int64_t
length) {
- // Bulk-append raw data
- const offset_type run_data_bytes =
- (raw_offsets[position + length] - raw_offsets[position]);
- APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
- // Append offsets
- offset_type cur_offset = raw_offsets[position];
- for (int64_t i = 0; i < length; ++i) {
- offset_builder.UnsafeAppend(offset);
- offset += raw_offsets[i + position + 1] - cur_offset;
- cur_offset = raw_offsets[i + position + 1];
- }
- return Status::OK();
- }));
+ auto emit_segment = [&](int64_t position, int64_t length) {
+ // Bulk-append raw data
+ const offset_type run_data_bytes =
+ (raw_offsets[position + length] - raw_offsets[position]);
+ APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
+ // Append offsets
+ for (int64_t i = 0; i < length; ++i) {
+ offset_builder.UnsafeAppend(offset);
+ offset += raw_offsets[i + position + 1] - raw_offsets[i + position + 1];
+ }
+ return Status::OK();
+ };
+ if (is_ree_filter) {
+ Status status;
+ VisitPlainxREEFilterOutputSegments(
+ filter, true, null_selection,
Review Comment:
Can you include the actual parameter name for readability?
```suggestion
filter, /*xxx=*/ true, null_selection,
```
##########
cpp/src/arrow/compute/kernels/vector_selection_test.cc:
##########
@@ -42,99 +42,161 @@ using std::string_view;
namespace compute {
+namespace {
+
+template <typename T>
+Result<std::shared_ptr<Array>> REEncode(const T& array) {
+ ARROW_ASSIGN_OR_RAISE(auto datum, RunEndEncode(array));
+ return datum.make_array();
+}
+
+Result<std::shared_ptr<Array>> REEFromJson(const std::shared_ptr<DataType>&
ree_type,
+ const std::string& json) {
+ auto ree_type_ptr = checked_cast<const RunEndEncodedType*>(ree_type.get());
+ auto array = ArrayFromJSON(ree_type_ptr->value_type(), json);
+ ARROW_ASSIGN_OR_RAISE(
+ auto datum, RunEndEncode(array,
RunEndEncodeOptions{ree_type_ptr->run_end_type()}));
+ return datum.make_array();
+}
+
+Result<std::shared_ptr<Array>> FilterFromJson(
+ const std::shared_ptr<DataType>& filter_type, const std::string& json) {
+ if (filter_type->id() == Type::RUN_END_ENCODED) {
+ return REEFromJson(filter_type, json);
+ } else {
+ return ArrayFromJSON(filter_type, json);
+ }
+}
+
+Result<std::shared_ptr<Array>> REEncode(const std::shared_ptr<Array>& array) {
+ ARROW_ASSIGN_OR_RAISE(auto datum, RunEndEncode(array));
+ return datum.make_array();
+}
+
+void CheckTakeCase(const BooleanArray& filter,
+ const std::shared_ptr<Array>& expected_indices,
+ FilterOptions::NullSelectionBehavior null_selection) {
+ ASSERT_OK_AND_ASSIGN(auto indices,
+ internal::GetTakeIndices(*filter.data(),
null_selection));
+ auto indices_array = MakeArray(indices);
+ ValidateOutput(indices);
+ AssertArraysEqual(*expected_indices, *indices_array, /*verbose=*/true);
+
+ ASSERT_OK_AND_ASSIGN(auto ree_filter, REEncode(filter));
+ ASSERT_OK_AND_ASSIGN(auto indices_from_ree,
+ internal::GetTakeIndices(*ree_filter->data(),
null_selection));
+ auto indices_from_ree_array = MakeArray(indices);
+ ValidateOutput(indices_from_ree);
+ AssertArraysEqual(*expected_indices, *indices_from_ree_array,
/*verbose=*/true);
+}
+
+void CheckTakeCase(const std::string& filter_json, const std::string&
indices_json,
Review Comment:
(same here)
##########
cpp/src/arrow/compute/kernels/vector_selection_test.cc:
##########
@@ -42,99 +42,161 @@ using std::string_view;
namespace compute {
+namespace {
+
+template <typename T>
+Result<std::shared_ptr<Array>> REEncode(const T& array) {
+ ARROW_ASSIGN_OR_RAISE(auto datum, RunEndEncode(array));
+ return datum.make_array();
+}
+
+Result<std::shared_ptr<Array>> REEFromJson(const std::shared_ptr<DataType>&
ree_type,
+ const std::string& json) {
+ auto ree_type_ptr = checked_cast<const RunEndEncodedType*>(ree_type.get());
+ auto array = ArrayFromJSON(ree_type_ptr->value_type(), json);
+ ARROW_ASSIGN_OR_RAISE(
+ auto datum, RunEndEncode(array,
RunEndEncodeOptions{ree_type_ptr->run_end_type()}));
+ return datum.make_array();
+}
+
+Result<std::shared_ptr<Array>> FilterFromJson(
+ const std::shared_ptr<DataType>& filter_type, const std::string& json) {
+ if (filter_type->id() == Type::RUN_END_ENCODED) {
+ return REEFromJson(filter_type, json);
+ } else {
+ return ArrayFromJSON(filter_type, json);
+ }
+}
+
+Result<std::shared_ptr<Array>> REEncode(const std::shared_ptr<Array>& array) {
+ ARROW_ASSIGN_OR_RAISE(auto datum, RunEndEncode(array));
+ return datum.make_array();
+}
+
+void CheckTakeCase(const BooleanArray& filter,
Review Comment:
I would name this a bit more precisely.
```suggestion
void CheckTakeIndicesCase(const BooleanArray& filter,
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]