bkietz commented on a change in pull request #9444:
URL: https://github.com/apache/arrow/pull/9444#discussion_r572217966



##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -94,126 +94,128 @@ Result<std::shared_ptr<ArrayData>> GetTakeIndicesImpl(
     const ArrayData& filter, FilterOptions::NullSelectionBehavior 
null_selection,
     MemoryPool* memory_pool) {
   using T = typename IndexType::c_type;
-  typename TypeTraits<IndexType>::BuilderType builder(memory_pool);
 
   const uint8_t* filter_data = filter.buffers[1]->data();
-  BitBlockCounter data_counter(filter_data, filter.offset, filter.length);
+  const bool have_filter_nulls = filter.MayHaveNulls();
+  const uint8_t* filter_is_valid =
+      have_filter_nulls ? filter.buffers[0]->data() : nullptr;

Review comment:
       Could you add a test which exercises the absent validity buffer case?
   ```diff
   --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc
   +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc
   @@ -62,6 +62,14 @@ TEST(GetTakeIndices, Basics) {
      CheckCase("[]", "[]", FilterOptions::EMIT_NULL);
      CheckCase("[null]", "[null]", FilterOptions::EMIT_NULL);
      CheckCase("[null, false, true, true]", "[null, 2, 3]", 
FilterOptions::EMIT_NULL);
   +
   +  // Test with null validity buffer
   +  const auto& filter = BooleanArray(1, *AllocateEmptyBitmap(1), 
/*null_bitmap=*/nullptr);
   +  auto expected_indices = ArrayFromJSON(uint16(), "[]");
   +
   +  ASSERT_OK_AND_ASSIGN(auto indices,
   +                       internal::GetTakeIndices(*filter.data(), 
FilterOptions::DROP));
   +  AssertArraysEqual(*expected_indices, *MakeArray(indices), 
/*verbose=*/true);
    }
   
    // TODO: Add slicing
   ```

##########
File path: cpp/src/arrow/compute/kernels/scalar_set_lookup_benchmark.cc
##########
@@ -32,10 +32,10 @@ constexpr auto kSeed = 0x94378165;
 static void SetLookupBenchmarkString(benchmark::State& state,
                                      const std::string& func_name,
                                      const int64_t value_set_length) {
-  const int64_t array_length = 1 << 20;
-  const int64_t value_min_size = 0;
-  const int64_t value_max_size = 32;
-  const double null_probability = 0.01;
+  const int64_t array_length = 1 << 18;
+  const int32_t value_min_size = (value_set_length < 64) ? 2 : 10;
+  const int32_t value_max_size = 32;
+  const double null_probability = 0.2 / value_set_length;

Review comment:
       could you comment on these changes? The impact of variable 
`null_probability` is not clear to me

##########
File path: cpp/src/arrow/util/bit_block_counter.h
##########
@@ -188,17 +263,63 @@ class ARROW_EXPORT BinaryBitBlockCounter {
   /// the number of true values. The last block will have a length less than 64
   /// if the bitmap length is not a multiple of 64, and will return 0-length
   /// blocks in subsequent invocations.
-  BitBlockCount NextAndWord();
+  BitBlockCount NextAndWord() { return NextWord<detail::BitBlockAnd>(); }
 
   /// \brief Computes "x | y" block for each available run of bits.
-  BitBlockCount NextOrWord();
+  BitBlockCount NextOrWord() { return NextWord<detail::BitBlockOr>(); }
 
   /// \brief Computes "x | ~y" block for each available run of bits.
-  BitBlockCount NextOrNotWord();
+  BitBlockCount NextOrNotWord() { return NextWord<detail::BitBlockOrNot>(); }
 
  private:
   template <template <typename T> class Op>
-  BitBlockCount NextWord();
+  inline BitBlockCount NextWord() {

Review comment:
       Same

##########
File path: cpp/src/arrow/util/bit_block_counter.h
##########
@@ -292,9 +413,10 @@ class ARROW_EXPORT OptionalBinaryBitBlockCounter {
 // Functional-style bit block visitors.
 
 template <typename VisitNotNull, typename VisitNull>
-Status VisitBitBlocks(const std::shared_ptr<Buffer>& bitmap_buf, int64_t 
offset,
-                      int64_t length, VisitNotNull&& visit_not_null,
-                      VisitNull&& visit_null) {
+static inline Status VisitBitBlocks(const std::shared_ptr<Buffer>& bitmap_buf,

Review comment:
       This shouldn't be necessary; templates are always `inline`

##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -94,126 +94,128 @@ Result<std::shared_ptr<ArrayData>> GetTakeIndicesImpl(
     const ArrayData& filter, FilterOptions::NullSelectionBehavior 
null_selection,
     MemoryPool* memory_pool) {
   using T = typename IndexType::c_type;
-  typename TypeTraits<IndexType>::BuilderType builder(memory_pool);
 
   const uint8_t* filter_data = filter.buffers[1]->data();
-  BitBlockCounter data_counter(filter_data, filter.offset, filter.length);
+  const bool have_filter_nulls = filter.MayHaveNulls();
+  const uint8_t* filter_is_valid =
+      have_filter_nulls ? filter.buffers[0]->data() : nullptr;
 
-  // The position relative to the start of the filter
-  T position = 0;
+  if (have_filter_nulls && null_selection == FilterOptions::EMIT_NULL) {
+    // Most complex case: the filter may have nulls and we don't drop them.
+    // The logic is ternary:
+    // - filter is null: emit null
+    // - filter is valid and true: emit index
+    // - filter is valid and false: don't emit anything
 
-  // The current position taking the filter offset into account
-  int64_t position_with_offset = filter.offset;
-  if (filter.MayHaveNulls()) {
-    // The filter may have nulls, so we scan the validity bitmap and the filter
-    // data bitmap together, branching on the null selection type.
-    const uint8_t* filter_is_valid = filter.buffers[0]->data();
+    typename TypeTraits<IndexType>::BuilderType builder(memory_pool);
+
+    // The position relative to the start of the filter
+    T position = 0;
+    // The current position taking the filter offset into account
+    int64_t position_with_offset = filter.offset;
 
-    // To count blocks whether filter_data[i] || !filter_is_valid[i]
+    // To count blocks where filter_data[i] || !filter_is_valid[i]
     BinaryBitBlockCounter filter_counter(filter_data, filter.offset, 
filter_is_valid,
                                          filter.offset, filter.length);
-    if (null_selection == FilterOptions::DROP) {
-      while (position < filter.length) {
-        BitBlockCount and_block = filter_counter.NextAndWord();
-        RETURN_NOT_OK(builder.Reserve(and_block.popcount));
-        if (and_block.AllSet()) {
-          // All the values are selected and non-null
-          for (int64_t i = 0; i < and_block.length; ++i) {
-            builder.UnsafeAppend(position++);
-          }
-          position_with_offset += and_block.length;
-        } else if (!and_block.NoneSet()) {
-          // Some of the values are false or null
-          for (int64_t i = 0; i < and_block.length; ++i) {
-            if (BitUtil::GetBit(filter_is_valid, position_with_offset) &&
-                BitUtil::GetBit(filter_data, position_with_offset)) {
-              builder.UnsafeAppend(position);
-            }
-            ++position;
-            ++position_with_offset;
-          }
-        } else {
-          position += and_block.length;
-          position_with_offset += and_block.length;
-        }
+    BitBlockCounter is_valid_counter(filter_is_valid, filter.offset, 
filter.length);
+    while (position < filter.length) {
+      // true OR NOT valid
+      BitBlockCount or_not_block = filter_counter.NextOrNotWord();

Review comment:
       Nit: could we name this more descriptively?
   ```suggestion
         BitBlockCount selected_or_null_block = filter_counter.NextOrNotWord();
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to