pitrou commented on code in PR #35750:
URL: https://github.com/apache/arrow/pull/35750#discussion_r1229746591
##########
cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc:
##########
@@ -180,14 +181,102 @@ Result<std::shared_ptr<ArrayData>> GetTakeIndicesImpl(
BufferVector{nullptr, out_buffer},
/*null_count=*/0);
}
+template <typename RunEndType>
+Result<std::shared_ptr<ArrayData>> GetTakeIndicesFromREEBitmapImpl(
+ const ArraySpan& filter, FilterOptions::NullSelectionBehavior
null_selection,
+ MemoryPool* memory_pool) {
+ using T = typename RunEndType::c_type;
+ const ArraySpan& filter_values = ::arrow::ree_util::ValuesArray(filter);
+ const int64_t filter_values_offset = filter_values.offset;
+ const uint8_t* filter_is_valid = filter_values.buffers[0].data;
+ const uint8_t* filter_selection = filter_values.buffers[1].data;
+ const bool filter_may_have_nulls = filter_values.MayHaveNulls();
+
+ // BinaryBitBlockCounter is not used here because a REE bitmap, if built
+ // correctly, is not going to have long continuous runs of 0s or 1s in the
+ // values array.
+
+ const ::arrow::ree_util::RunEndEncodedArraySpan<T> filter_span(filter);
+ auto it = filter_span.begin();
+ if (filter_may_have_nulls && null_selection == FilterOptions::EMIT_NULL) {
+ // Most complex case: the filter may have nulls and we don't drop them.
+ // The logic is ternary:
+ // - filter is null: emit null
+ // - filter is valid and true: emit index
+ // - filter is valid and false: don't emit anything
+
+ typename TypeTraits<RunEndType>::BuilderType builder(memory_pool);
+ for (; !it.is_end(filter_span); ++it) {
+ const int64_t position_with_offset = filter_values_offset +
it.index_into_array();
+ const bool is_null = !bit_util::GetBit(filter_is_valid,
position_with_offset);
+ if (is_null) {
+ RETURN_NOT_OK(builder.AppendNulls(it.run_length()));
+ } else {
+ const bool emit_run = bit_util::GetBit(filter_selection,
position_with_offset);
+ if (emit_run) {
+ const int64_t run_end = it.run_end();
+ RETURN_NOT_OK(builder.Reserve(run_end - it.logical_position()));
+ for (int64_t position = it.logical_position(); position < run_end;
position++) {
+ builder.UnsafeAppend(static_cast<T>(position));
+ }
+ }
+ }
+ }
+ std::shared_ptr<ArrayData> result;
+ RETURN_NOT_OK(builder.FinishInternal(&result));
+ return result;
+ }
+
+ // Other cases don't emit nulls and are therefore simpler.
+ TypedBufferBuilder<T> builder(memory_pool);
+
+ if (filter_may_have_nulls) {
+ DCHECK_EQ(null_selection, FilterOptions::DROP);
+ // The filter may have nulls, so we scan the validity bitmap and the filter
+ // data bitmap together.
+ for (; !it.is_end(filter_span); ++it) {
+ const int64_t position_with_offset = filter_values_offset +
it.index_into_array();
+ const bool emit_run = bit_util::GetBit(filter_is_valid,
position_with_offset) &&
+ bit_util::GetBit(filter_selection,
position_with_offset);
+ if (emit_run) {
+ const int64_t run_end = it.run_end();
+ RETURN_NOT_OK(builder.Reserve(run_end - it.logical_position()));
+ for (int64_t position = it.logical_position(); position < run_end;
position++) {
+ builder.UnsafeAppend(static_cast<T>(position));
+ }
+ }
+ }
+ } else {
+ // The filter has no nulls, so we need only look for true values
+ for (; !it.is_end(filter_span); ++it) {
+ const int64_t position_with_offset = filter_values_offset +
it.index_into_array();
+ const bool emit_run = bit_util::GetBit(filter_selection,
position_with_offset);
Review Comment:
You're right. I was mistakingly assuming it was forbidden in the format.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]